cursor-buddy 0.0.10-beta.0 → 0.0.10

This diff reflects the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -1,5 +1,5 @@
  import { t as pointTool } from "../point-tool-DZJmhD8e.mjs";
- import { experimental_generateSpeech, experimental_transcribe, streamText } from "ai";
+ import { experimental_generateSpeech, experimental_transcribe, stepCountIs, streamText } from "ai";
  //#region src/server/system-prompt.ts
  const DEFAULT_SYSTEM_PROMPT = `You are a helpful AI assistant that lives inside a web page as a cursor companion.
 
@@ -149,14 +149,32 @@ async function handleChat(request, config) {
  }
  ]
  }];
+ const tools = {
+ point: pointTool,
+ ...config.tools
+ };
+ const mustContinueUntilText = ({ steps }) => {
+ const lastStep = steps.at(-1);
+ if (!lastStep) return false;
+ const stepText = typeof lastStep.text === "string" ? lastStep.text.trim() : "";
+ const hadToolResults = Array.isArray(lastStep.toolResults) && lastStep.toolResults.length > 0;
+ if (stepText.length > 0) return true;
+ if (hadToolResults) return false;
+ return false;
+ };
  return streamText({
  model: config.model,
  system: systemPrompt,
  providerOptions: config?.modelProviderMetadata,
  messages,
- tools: {
- point: pointTool,
- ...config.tools
+ tools,
+ stopWhen: [mustContinueUntilText, stepCountIs(3)],
+ prepareStep: async ({ stepNumber, steps }) => {
+ if (stepNumber === 0) return {};
+ const previousStep = steps.at(-1);
+ const prevText = typeof previousStep?.text === "string" ? previousStep.text.trim() : "";
+ if ((previousStep?.toolCalls?.some((call) => call.toolName === "point") ?? false) && prevText.length === 0) return { activeTools: Object.keys(tools).filter((name) => name !== "point") };
+ return {};
  }
  }).toUIMessageStreamResponse();
  }
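
The substantive change in this release is the multi-step handling above. Previously `streamText` ran a single step, so a turn that ended in a `point` tool call could finish without any spoken text. The new `stopWhen` conditions keep generating until a step produces assistant text, with `stepCountIs(3)` as a hard cap (hence the added import in the first hunk), and `prepareStep` removes the `point` tool after a tool-only step so the follow-up must answer in words. The updated sourcemap preserves the readable TypeScript behind these minified lines; the relevant portion of `src/server/routes/chat.ts` now reads:

```ts
import { type StopCondition, stepCountIs, streamText } from "ai"

const tools = {
  point: pointTool,
  ...config.tools,
}

const mustContinueUntilText: StopCondition<typeof tools> = ({ steps }) => {
  const lastStep = steps.at(-1)
  if (!lastStep) return false

  const stepText =
    typeof lastStep.text === "string" ? lastStep.text.trim() : ""
  const hadToolResults =
    Array.isArray(lastStep.toolResults) && lastStep.toolResults.length > 0

  // Stop only after we have actual assistant text.
  // If the step was tool-only, continue the loop.
  if (stepText.length > 0) return true
  if (hadToolResults) return false

  return false
}

const result = streamText({
  model: config.model,
  system: systemPrompt,
  providerOptions: config?.modelProviderMetadata,
  messages,
  tools,

  // Allow a follow-up step after tool use instead of the default single step.
  stopWhen: [mustContinueUntilText, stepCountIs(3)],

  prepareStep: async ({ stepNumber, steps }) => {
    // Normal first pass: let the model speak and optionally point.
    if (stepNumber === 0) {
      return {}
    }

    const previousStep = steps.at(-1)

    const prevText =
      typeof previousStep?.text === "string" ? previousStep.text.trim() : ""

    const usedPoint =
      previousStep?.toolCalls?.some((call) => call.toolName === "point") ??
      false

    // If the previous step pointed but did not speak, force the next step
    // to be text-only by removing the point tool.
    if (usedPoint && prevText.length === 0) {
      const toolNames = Object.keys(tools) as Array<keyof typeof tools>

      return {
        activeTools: toolNames.filter((name) => name !== "point"),
      }
    }

    return {}
  },
})

return result.toUIMessageStreamResponse()
```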
(The accompanying single-line sourcemap for `index.mjs` was regenerated to match; the before/after JSON blobs are omitted here, and the readable `sourcesContent` for the changed handler is reproduced above.)
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "cursor-buddy",
- "version": "0.0.10-beta.0",
+ "version": "0.0.10",
  "description": "AI-powered cursor companion for web apps",
  "type": "module",
  "license": "MIT",