cursor-buddy 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +419 -0
  2. package/package.json +3 -2
package/README.md ADDED
@@ -0,0 +1,419 @@
1
+ # cursor-buddy
2
+
3
+ AI Agent that lives in your cursor, built for web apps. Push-to-talk voice assistant that can see your screen and point at things.
4
+
5
+ Customize its prompt, pass custom tools, choose between browser or server-side speech APIs, use any AI SDK models and customize the UI to fit your needs.
6
+
7
+ ## Features
8
+
9
+ - **Push-to-talk voice input** — Hold a hotkey to speak, release to send
10
+ - **Browser-first live transcription** — Realtime transcript while speaking, with server fallback
11
+ - **Annotated screenshot context** — AI sees your current viewport with numbered interactive elements
12
+ - **Voice responses** — Browser or server TTS, with optional streaming playback
13
+ - **Cursor pointing** — AI can point at UI elements it references
14
+ - **Voice interruption** — Start talking again to cut off current response
15
+ - **Framework agnostic** — Core client written in TypeScript, adapter-based architecture
16
+ - **Customizable** — CSS variables, custom components, headless mode
17
+ - **Configurable** — Choose any AI SDK models, equip the agent with tools, or modify the system prompt
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ npm install cursor-buddy
23
+ # or
24
+ pnpm add cursor-buddy
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ ### 1. Server Setup
30
+
31
+ Create an API route that handles chat, transcription, and TTS.
32
+
33
+ Keep `transcriptionModel` configured if you want browser transcription to fall
34
+ back to the server in `auto` mode. Keep `speechModel` configured if you want
35
+ server speech or browser speech fallback in `auto` mode.
36
+
37
+ ```ts
38
+ // lib/cursor-buddy.ts
39
+ import { createCursorBuddyHandler } from "cursor-buddy/server"
40
+ import { openai } from "@ai-sdk/openai"
41
+
42
+ export const cursorBuddy = createCursorBuddyHandler({
43
+ model: openai("gpt-4o"),
44
+ speechModel: openai.speech("tts-1"),
45
+ transcriptionModel: openai.transcription("whisper-1"),
46
+ })
47
+ ```
48
+
49
+ #### Next.js App Router
50
+
51
+ ```ts
52
+ // app/api/cursor-buddy/[...path]/route.ts
53
+ import { toNextJsHandler } from "cursor-buddy/server/next"
54
+ import { cursorBuddy } from "@/lib/cursor-buddy"
55
+
56
+ export const { GET, POST } = toNextJsHandler(cursorBuddy)
57
+ ```
58
+
59
+ ### 2. Client Setup
60
+
61
+ Add the `<CursorBuddy />` component to your app.
62
+
63
+ ```tsx
64
+ // app/layout.tsx
65
+ import { CursorBuddy } from "cursor-buddy/react"
66
+
67
+ export default function RootLayout({ children }) {
68
+ return (
69
+ <html>
70
+ <body>
71
+ {children}
72
+ <CursorBuddy endpoint="/api/cursor-buddy" />
73
+ </body>
74
+ </html>
75
+ )
76
+ }
77
+ ```
78
+
79
+ That's it! Hold **Ctrl+Alt** to speak, release to send.
80
+
81
+ ## Server Configuration
82
+
83
+ ```ts
84
+ createCursorBuddyHandler({
85
+ // Models (only `model` is required)
86
+ model: LanguageModel, // AI SDK chat model
87
+ speechModel: SpeechModel, // Optional server TTS model
88
+ transcriptionModel: TranscriptionModel, // Optional server fallback for STT
89
+
90
+ // Optional
91
+ system: string | ((ctx) => string), // Custom system prompt
92
+ tools: Record<string, Tool>, // AI SDK tools
93
+ maxHistory: number, // Max conversation history (default: 10)
94
+ })
95
+ ```
96
+
97
+ ### Custom System Prompt
98
+
99
+ ```ts
100
+ createCursorBuddyHandler({
101
+ model: openai("gpt-4o"),
102
+ speechModel: openai.speech("tts-1"),
103
+ transcriptionModel: openai.transcription("whisper-1"),
104
+
105
+ // Extend the default prompt
106
+ system: ({ defaultPrompt }) => `
107
+ ${defaultPrompt}
108
+
109
+ You are helping users navigate a project management dashboard.
110
+ The sidebar contains: Projects, Tasks, Calendar, Settings.
111
+ `,
112
+ })
113
+ ```
114
+
115
+ ## Client Configuration
116
+
117
+ ```tsx
118
+ <CursorBuddy
119
+ // Required
120
+ endpoint="/api/cursor-buddy"
121
+
122
+ // Optional
123
+ hotkey="ctrl+alt" // Push-to-talk hotkey (default: "ctrl+alt")
124
+ container={element} // Portal container (default: document.body)
125
+ transcription={{ mode: "auto" }} // "auto" | "browser" | "server"
126
+ speech={{ mode: "server", allowStreaming: false }}
127
+ // mode: "auto" | "browser" | "server"
128
+ // allowStreaming: speak sentence-by-sentence while chat streams
129
+
130
+ // Custom components
131
+ cursor={(props) => <CustomCursor {...props} />}
132
+ speechBubble={(props) => <CustomBubble {...props} />}
133
+ waveform={(props) => <CustomWaveform {...props} />}
134
+
135
+ // Callbacks
136
+ onTranscript={(text) => {}} // Called when speech is transcribed
137
+ onResponse={(text) => {}} // Called when AI responds
138
+ onPoint={(target) => {}} // Called when AI points at element
139
+ onStateChange={(state) => {}} // Called on state change
140
+ onError={(error) => {}} // Called on error
141
+ />
142
+ ```
143
+
144
+ ### Transcription Modes
145
+
146
+ - `"auto"` — Try browser speech recognition first, then fall back to the
147
+ server transcription route if needed.
148
+ - `"browser"` — Require browser speech recognition. If it fails, the turn
149
+ errors and no server fallback is attempted.
150
+ - `"server"` — Skip browser speech recognition and always use the server
151
+ transcription route.
152
+
153
+ ### Speech Modes
154
+
155
+ - `"auto"` — Try browser speech synthesis first, then fall back to the server
156
+ TTS route if browser speech is unavailable or fails.
157
+ - `"browser"` — Require browser speech synthesis. If it fails, the turn
158
+ errors and no server fallback is attempted.
159
+ - `"server"` — Skip browser speech synthesis and always use the server TTS
160
+ route.
161
+
162
+ ### Speech Streaming
163
+
164
+ - `speech.allowStreaming: false` — Wait for the full `/chat` response, then
165
+ speak it once.
166
+ - `speech.allowStreaming: true` — Speak completed sentence segments as the chat
167
+ stream arrives.
168
+
169
+ ## Customization
170
+
171
+ ### CSS Variables
172
+
173
+ Cursor buddy styles are customizable via CSS variables. Override them in your stylesheet:
174
+
175
+ ```css
176
+ :root {
177
+ /* Cursor colors by state */
178
+ --cursor-buddy-color-idle: #3b82f6;
179
+ --cursor-buddy-color-listening: #ef4444;
180
+ --cursor-buddy-color-processing: #eab308;
181
+ --cursor-buddy-color-responding: #22c55e;
182
+
183
+ /* Speech bubble */
184
+ --cursor-buddy-bubble-bg: #ffffff;
185
+ --cursor-buddy-bubble-text: #1f2937;
186
+ --cursor-buddy-bubble-radius: 8px;
187
+ --cursor-buddy-bubble-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
188
+
189
+ /* Waveform */
190
+ --cursor-buddy-waveform-color: #ef4444;
191
+ }
192
+ ```
193
+
194
+ ### Custom Components
195
+
196
+ Replace default components with your own:
197
+
198
+ ```tsx
199
+ import { CursorBuddy, type CursorRenderProps } from "cursor-buddy/react"
200
+
201
+ function MyCursor({ state, rotation, scale }: CursorRenderProps) {
202
+ return (
203
+ <div style={{ transform: `rotate(${rotation}rad) scale(${scale})` }}>
204
+ {state === "listening" ? "Listening..." : "Point"}
205
+ </div>
206
+ )
207
+ }
208
+
209
+ <CursorBuddy
210
+ endpoint="/api/cursor-buddy"
211
+ cursor={(props) => <MyCursor {...props} />}
212
+ />
213
+ ```
214
+
215
+ ## Headless Mode
216
+
217
+ For full control, use the provider and hook directly:
218
+
219
+ ```tsx
220
+ import {
221
+ CursorBuddyProvider,
222
+ useCursorBuddy
223
+ } from "cursor-buddy/react"
224
+
225
+ function App() {
226
+ return (
227
+ <CursorBuddyProvider endpoint="/api/cursor-buddy">
228
+ <MyCustomUI />
229
+ </CursorBuddyProvider>
230
+ )
231
+ }
232
+
233
+ function MyCustomUI() {
234
+ const {
235
+ state, // "idle" | "listening" | "processing" | "responding"
236
+ liveTranscript, // In-progress transcript while speaking
237
+ transcript, // Latest user speech
238
+ response, // Latest AI response
239
+ audioLevel, // 0-1, for waveform visualization
240
+ isEnabled,
241
+ isPointing,
242
+ error,
243
+
244
+ // Actions
245
+ startListening,
246
+ stopListening,
247
+ setEnabled,
248
+ pointAt, // Manually point at coordinates
249
+ dismissPointing,
250
+ reset,
251
+ } = useCursorBuddy()
252
+
253
+ return (
254
+ <div>
255
+ <p>State: {state}</p>
256
+ <p>Live transcript: {liveTranscript}</p>
257
+ <button
258
+ onMouseDown={startListening}
259
+ onMouseUp={stopListening}
260
+ >
261
+ Hold to speak
262
+ </button>
263
+ </div>
264
+ )
265
+ }
266
+ ```
267
+
268
+ Complete Render Props types:
269
+
270
+ ```ts
271
+ interface CursorRenderProps {
272
+ state: "idle" | "listening" | "processing" | "responding"
273
+ isPointing: boolean
274
+ rotation: number // Radians, direction of travel
275
+ scale: number // 1.0 normal, up to 1.3 during flight
276
+ }
277
+
278
+ interface SpeechBubbleRenderProps {
279
+ text: string
280
+ isVisible: boolean
281
+ }
282
+
283
+ interface WaveformRenderProps {
284
+ audioLevel: number // 0-1
285
+ isListening: boolean
286
+ }
287
+ ```
288
+
289
+ ## Framework-Agnostic Usage
290
+
291
+ For non-React environments, use the core client directly:
292
+
293
+ ```ts
294
+ import { CursorBuddyClient } from "cursor-buddy"
295
+
296
+ const client = new CursorBuddyClient("/api/cursor-buddy", {
297
+ transcription: { mode: "auto" },
298
+ speech: { mode: "server", allowStreaming: false },
299
+ onStateChange: (state) => console.log("State:", state),
300
+ onTranscript: (text) => console.log("Transcript:", text),
301
+ onResponse: (text) => console.log("Response:", text),
302
+ onError: (err) => console.error("Error:", err),
303
+ })
304
+
305
+ // Subscribe to state changes
306
+ client.subscribe(() => {
307
+ const snapshot = client.getSnapshot()
308
+ console.log(snapshot)
309
+ })
310
+
311
+ // Trigger voice interaction
312
+ client.startListening()
313
+ // ... user speaks ...
314
+ client.stopListening()
315
+ ```
316
+
317
+ ## API Reference
318
+
319
+ ### Core Exports (`cursor-buddy`)
320
+
321
+ | Export | Description |
322
+ |--------|-------------|
323
+ | `CursorBuddyClient` | Framework-agnostic client class |
324
+ | `VoiceState` | Type: `"idle" \| "listening" \| "processing" \| "responding"` |
325
+ | `PointingTarget` | Type: `{ x: number, y: number, label: string }` |
326
+ | `Point` | Type: `{ x: number, y: number }` |
327
+
328
+ ### Server Exports (`cursor-buddy/server`)
329
+
330
+ | Export | Description |
331
+ |--------|-------------|
332
+ | `createCursorBuddyHandler` | Create the main request handler |
333
+ | `DEFAULT_SYSTEM_PROMPT` | Default system prompt for reference |
334
+ | `CursorBuddyHandlerConfig` | Type for handler configuration |
335
+ | `CursorBuddyHandler` | Return type of `createCursorBuddyHandler` |
336
+
337
+ ### Server Adapters (`cursor-buddy/server/next`)
338
+
339
+ | Export | Description |
340
+ |--------|-------------|
341
+ | `toNextJsHandler` | Convert handler to Next.js App Router format |
342
+
343
+ ### React Exports (`cursor-buddy/react`)
344
+
345
+ | Export | Description |
346
+ |--------|-------------|
347
+ | `CursorBuddy` | Drop-in component with built-in UI |
348
+ | `CursorBuddyProvider` | Headless provider for custom UI |
349
+ | `useCursorBuddy` | Hook to access state and actions |
350
+
351
+ ### Types (`cursor-buddy/react`)
352
+
353
+ | Export | Description |
354
+ |--------|-------------|
355
+ | `CursorBuddyProps` | Props for `<CursorBuddy />` |
356
+ | `CursorBuddyProviderProps` | Props for `<CursorBuddyProvider />` |
357
+ | `UseCursorBuddyReturn` | Return type of `useCursorBuddy()` |
358
+ | `CursorRenderProps` | Props passed to custom cursor |
359
+ | `SpeechBubbleRenderProps` | Props passed to custom speech bubble |
360
+ | `WaveformRenderProps` | Props passed to custom waveform |
361
+
362
+ ## How It Works
363
+
364
+ 1. User holds the hotkey
365
+ 2. Microphone captures audio, waveform shows audio level, and browser speech recognition starts when available
366
+ 3. User releases hotkey
367
+ 4. An annotated screenshot of the viewport is captured, with numbered markers on visible interactive elements, based on the [agent-browser](https://github.com/vercel-labs/agent-browser) implementation.
368
+ 5. The client prefers the browser transcript; if it is unavailable or empty in `auto` mode, the recorded audio is transcribed on the server
369
+ 6. Screenshot + marker context are sent to the AI model
370
+ 7. AI responds with text, optionally including a pointing tag:
371
+ - Preferred: `[POINT:5:Submit]` for numbered interactive elements
372
+ - Fallback: `[POINT:640,360:Error text]` for arbitrary screen coordinates
373
+ 8. Response is spoken in the browser or on the server based on `speech.mode`,
374
+ and can either wait for the full response or stream sentence-by-sentence
375
+ based on `speech.allowStreaming`
376
+ 9. If a marker tag is present, it is resolved back to the live DOM element; if a coordinate tag is present, it is mapped back to the live viewport; then the cursor animates to the target location
377
+ 10. **If user presses hotkey again at any point, current response is interrupted**
378
+
379
+ ## Security Best Practices
380
+
381
+ Since the cursor-buddy endpoints allow direct LLM communication, it is strongly recommended to configure CORS and rate limiting to prevent abuse, unauthorized access, and unexpected API costs.
382
+
383
+ Wrap the handler with CORS and rate limiting:
384
+
385
+ ```ts
386
+ // app/api/cursor-buddy/[...path]/route.ts
387
+ import { toNextJsHandler } from "cursor-buddy/server/next"
388
+ import { cursorBuddy } from "@/lib/cursor-buddy"
389
+
390
+ const handler = toNextJsHandler(cursorBuddy)
391
+
392
+ export async function POST(request: Request) {
393
+ // Verify origin
394
+ const origin = request.headers.get("origin")
395
+ if (origin !== process.env.ALLOWED_ORIGIN) {
396
+ return new Response("Unauthorized", { status: 403 })
397
+ }
398
+
399
+ // Check rate limit (e.g., 10 requests per minute)
400
+ const ip = request.headers.get("x-forwarded-for") || "unknown"
401
+ const { success } = await rateLimiter.limit(ip)
402
+ if (!success) {
403
+ return new Response("Rate limit exceeded", { status: 429 })
404
+ }
405
+
406
+ return handler(request)
407
+ }
408
+
409
+ export const GET = POST
410
+ ```
411
+
412
+ ## TODOs
413
+
414
+ - [ ] High: Make tool calls first-class: Pointing becomes a tool call (once per turn) + re-use pointing bubble UI for tool calls
415
+ - [ ] Medium: Proper test structure without relying on `as any` for audio and voice capture
416
+
417
+ ## License
418
+
419
+ MIT
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cursor-buddy",
3
- "version": "0.0.5",
3
+ "version": "0.0.6",
4
4
  "description": "AI-powered cursor companion for web apps",
5
5
  "type": "module",
6
6
  "license": "MIT",
@@ -36,7 +36,8 @@
36
36
  "release:major": "npm version major"
37
37
  },
38
38
  "files": [
39
- "dist"
39
+ "dist",
40
+ "README.md"
40
41
  ],
41
42
  "publishConfig": {
42
43
  "access": "public"