@mastra/voice-google-gemini-live 0.11.0-beta.1 → 0.11.1-alpha.0

package/dist/index.cjs CHANGED
@@ -1262,7 +1262,7 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  sessionDurationTimeout;
  // Tool integration properties
  tools;
- requestContext;
+ runtimeContext;
  // Store the configuration options
  options;
  /**
@@ -1499,67 +1499,70 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  /**
  * Establish connection to the Gemini Live API
  */
- async connect({ requestContext } = {}) {
- if (this.state === "connected") {
- this.log("Already connected to Gemini Live API");
- return;
- }
- this.requestContext = requestContext;
- this.emit("session", { state: "connecting" });
- try {
- let wsUrl;
- let headers = {};
- if (this.options.vertexAI) {
- wsUrl = `wss://${this.options.location}-aiplatform.googleapis.com/ws/google.cloud.aiplatform.v1beta1.PredictionService.ServerStreamingPredict`;
- await this.authManager.initialize();
- const accessToken = await this.authManager.getAccessToken();
- headers = { headers: { Authorization: `Bearer ${accessToken}` } };
- this.log("Using Vertex AI authentication with OAuth token");
- } else {
- wsUrl = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent`;
- headers = {
- headers: {
- "x-goog-api-key": this.options.apiKey || "",
- "Content-Type": "application/json"
- }
- };
- this.log("Using Live API authentication with API key");
- }
- this.log("Connecting to:", wsUrl);
- this.ws = new ws.WebSocket(wsUrl, void 0, headers);
- this.connectionManager.setWebSocket(this.ws);
- this.setupEventListeners();
- await this.connectionManager.waitForOpen();
- if (this.isResuming && this.sessionHandle) {
- await this.sendSessionResumption();
- } else {
- this.sendInitialConfig();
- this.sessionStartTime = Date.now();
- this.sessionId = crypto.randomUUID();
+ async connect({ runtimeContext } = {}) {
+ return this.traced(async () => {
+ if (this.state === "connected") {
+ this.log("Already connected to Gemini Live API");
+ return;
  }
- await this.waitForSessionCreated();
- this.state = "connected";
- this.emit("session", {
- state: "connected",
- config: {
+ this.runtimeContext = runtimeContext;
+ this.emit("session", { state: "connecting" });
+ try {
+ let wsUrl;
+ let headers = {};
+ if (this.options.vertexAI) {
+ const location = this.getVertexLocation();
+ wsUrl = `wss://${location}-aiplatform.googleapis.com/ws/google.cloud.aiplatform.v1beta1.LlmBidiService/BidiGenerateContent`;
+ await this.authManager.initialize();
+ const accessToken = await this.authManager.getAccessToken();
+ headers = { headers: { Authorization: `Bearer ${accessToken}` } };
+ this.log("Using Vertex AI authentication with OAuth token");
+ } else {
+ wsUrl = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent`;
+ headers = {
+ headers: {
+ "x-goog-api-key": this.options.apiKey || "",
+ "Content-Type": "application/json"
+ }
+ };
+ this.log("Using Live API authentication with API key");
+ }
+ this.log("Connecting to:", wsUrl);
+ this.ws = new ws.WebSocket(wsUrl, void 0, headers);
+ this.connectionManager.setWebSocket(this.ws);
+ this.setupEventListeners();
+ await this.connectionManager.waitForOpen();
+ if (this.isResuming && this.sessionHandle) {
+ await this.sendSessionResumption();
+ } else {
+ this.sendInitialConfig();
+ this.sessionStartTime = Date.now();
+ this.sessionId = crypto.randomUUID();
+ }
+ await this.waitForSessionCreated();
+ this.state = "connected";
+ this.emit("session", {
+ state: "connected",
+ config: {
+ sessionId: this.sessionId,
+ isResuming: this.isResuming,
+ toolCount: Object.keys(this.tools || {}).length
+ }
+ });
+ this.log("Successfully connected to Gemini Live API", {
  sessionId: this.sessionId,
  isResuming: this.isResuming,
  toolCount: Object.keys(this.tools || {}).length
+ });
+ if (this.options.sessionConfig?.maxDuration) {
+ this.startSessionDurationMonitor();
  }
- });
- this.log("Successfully connected to Gemini Live API", {
- sessionId: this.sessionId,
- isResuming: this.isResuming,
- toolCount: Object.keys(this.tools || {}).length
- });
- if (this.options.sessionConfig?.maxDuration) {
- this.startSessionDurationMonitor();
+ } catch (error) {
+ this.state = "disconnected";
+ this.log("Connection failed", error);
+ throw error;
  }
- } catch (error) {
- this.state = "disconnected";
- this.log("Connection failed", error);
- throw error;
- }
+ }, "gemini-live.connect")();
  }
  /**
  * Disconnect from the Gemini Live API
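A minimal caller-side sketch of the renamed connect() option. The RuntimeContext import path, the GOOGLE_API_KEY variable, and the constructor shape are assumptions for illustration; only the { runtimeContext } key is taken from the diff above:

    import { RuntimeContext } from "@mastra/core/runtime-context";
    import { GeminiLiveVoice } from "@mastra/voice-google-gemini-live";

    const voice = new GeminiLiveVoice({ apiKey: process.env.GOOGLE_API_KEY });

    // 0.11.0-beta.1 read { requestContext }; this build reads { runtimeContext }.
    const runtimeContext = new RuntimeContext();
    await voice.connect({ runtimeContext });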
@@ -1597,164 +1600,172 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  * Send text to be converted to speech
  */
  async speak(input, options) {
- this.validateConnectionState();
- if (typeof input !== "string") {
- const chunks = [];
- for await (const chunk of input) {
- chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
+ return this.traced(async () => {
+ this.validateConnectionState();
+ if (typeof input !== "string") {
+ const chunks = [];
+ for await (const chunk of input) {
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
+ }
+ input = Buffer.concat(chunks).toString("utf-8");
  }
- input = Buffer.concat(chunks).toString("utf-8");
- }
- if (input.trim().length === 0) {
- throw this.createAndEmitError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Input text is empty");
- }
- this.addToContext("user", input);
- const textMessage = {
- client_content: {
- turns: [
- {
- role: "user",
- parts: [
- {
- text: input
- }
- ]
- }
- ],
- turnComplete: true
+ if (input.trim().length === 0) {
+ throw this.createAndEmitError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Input text is empty");
  }
- };
- if (options && (options.speaker || options.languageCode || options.responseModalities)) {
- const updateMessage = {
- type: "session.update",
- session: {
- generation_config: {
- ...options.responseModalities ? { response_modalities: options.responseModalities } : {},
- speech_config: {
- ...options.languageCode ? { language_code: options.languageCode } : {},
- ...options.speaker ? { voice_config: { prebuilt_voice_config: { voice_name: options.speaker } } } : {}
+ this.addToContext("user", input);
+ const textMessage = {
+ client_content: {
+ turns: [
+ {
+ role: "user",
+ parts: [
+ {
+ text: input
+ }
+ ]
  }
- }
+ ],
+ turnComplete: true
  }
  };
+ if (options && (options.speaker || options.languageCode || options.responseModalities)) {
+ const updateMessage = {
+ type: "session.update",
+ session: {
+ generation_config: {
+ ...options.responseModalities ? { response_modalities: options.responseModalities } : {},
+ speech_config: {
+ ...options.languageCode ? { language_code: options.languageCode } : {},
+ ...options.speaker ? { voice_config: { prebuilt_voice_config: { voice_name: options.speaker } } } : {}
+ }
+ }
+ }
+ };
+ try {
+ this.sendEvent("session.update", updateMessage);
+ this.log("Applied per-turn runtime options", options);
+ } catch (error) {
+ this.log("Failed to apply per-turn runtime options", error);
+ }
+ }
  try {
- this.sendEvent("session.update", updateMessage);
- this.log("Applied per-turn runtime options", options);
+ this.sendEvent("client_content", textMessage);
+ this.log("Text message sent", { text: input });
  } catch (error) {
- this.log("Failed to apply per-turn runtime options", error);
+ this.log("Failed to send text message", error);
+ throw this.createAndEmitError("audio_processing_error" /* AUDIO_PROCESSING_ERROR */, "Failed to send text message", error);
  }
- }
- try {
- this.sendEvent("client_content", textMessage);
- this.log("Text message sent", { text: input });
- } catch (error) {
- this.log("Failed to send text message", error);
- throw this.createAndEmitError("audio_processing_error" /* AUDIO_PROCESSING_ERROR */, "Failed to send text message", error);
- }
+ }, "gemini-live.speak")();
  }
  /**
  * Send audio stream for processing
  */
  async send(audioData) {
- this.validateConnectionState();
- if ("readable" in audioData && typeof audioData.on === "function") {
- const stream = audioData;
- stream.on("data", (chunk) => {
- try {
- const base64Audio = this.audioStreamManager.processAudioChunk(chunk);
- const message = this.audioStreamManager.createAudioMessage(base64Audio, "realtime");
- this.sendEvent("realtime_input", message);
- } catch (error) {
- this.log("Failed to process audio chunk", error);
- this.createAndEmitError("audio_processing_error" /* AUDIO_PROCESSING_ERROR */, "Failed to process audio chunk", error);
- }
- });
- stream.on("error", (error) => {
- this.log("Audio stream error", error);
- this.createAndEmitError("audio_stream_error" /* AUDIO_STREAM_ERROR */, "Audio stream error", error);
- });
- stream.on("end", () => {
- this.log("Audio stream ended");
- });
- } else {
- const validateAudio = this.audioStreamManager.validateAndConvertAudioInput(audioData);
- const base64Audio = this.audioStreamManager.int16ArrayToBase64(validateAudio);
- const message = this.audioStreamManager.createAudioMessage(base64Audio, "realtime");
- this.sendEvent("realtime_input", message);
- }
+ return this.traced(async () => {
+ this.validateConnectionState();
+ if ("readable" in audioData && typeof audioData.on === "function") {
+ const stream = audioData;
+ stream.on("data", (chunk) => {
+ try {
+ const base64Audio = this.audioStreamManager.processAudioChunk(chunk);
+ const message = this.audioStreamManager.createAudioMessage(base64Audio, "realtime");
+ this.sendEvent("realtime_input", message);
+ } catch (error) {
+ this.log("Failed to process audio chunk", error);
+ this.createAndEmitError("audio_processing_error" /* AUDIO_PROCESSING_ERROR */, "Failed to process audio chunk", error);
+ }
+ });
+ stream.on("error", (error) => {
+ this.log("Audio stream error", error);
+ this.createAndEmitError("audio_stream_error" /* AUDIO_STREAM_ERROR */, "Audio stream error", error);
+ });
+ stream.on("end", () => {
+ this.log("Audio stream ended");
+ });
+ } else {
+ const validateAudio = this.audioStreamManager.validateAndConvertAudioInput(audioData);
+ const base64Audio = this.audioStreamManager.int16ArrayToBase64(validateAudio);
+ const message = this.audioStreamManager.createAudioMessage(base64Audio, "realtime");
+ this.sendEvent("realtime_input", message);
+ }
+ }, "gemini-live.send")();
  }
  /**
  * Process speech from audio stream (traditional STT interface)
  */
  async listen(audioStream, _options) {
- this.validateConnectionState();
- let transcriptionText = "";
- const onWriting = (data) => {
- if (data.role === "user") {
- transcriptionText += data.text;
- this.log("Received transcription text:", { text: data.text, total: transcriptionText });
- }
- };
- const onError = (error) => {
- throw new Error(`Transcription failed: ${error.message}`);
- };
- const onSession = (data) => {
- if (data.state === "disconnected") {
- throw new Error("Session disconnected during transcription");
- }
- };
- this.on("writing", onWriting);
- this.on("error", onError);
- this.on("session", onSession);
- try {
- const result = await this.audioStreamManager.handleAudioTranscription(
- audioStream,
- (base64Audio) => {
- return new Promise((resolve, reject) => {
- try {
- const message = this.audioStreamManager.createAudioMessage(base64Audio, "input");
- const cleanup = () => {
- this.off("turnComplete", onTurnComplete);
- this.off("error", onErr);
- };
- const onTurnComplete = () => {
- cleanup();
- resolve(transcriptionText.trim());
- };
- const onErr = (e) => {
- cleanup();
- reject(new Error(e.message));
- };
- this.on("turnComplete", onTurnComplete);
- this.on("error", onErr);
- this.sendEvent("client_content", message);
- this.log("Sent audio for transcription");
- } catch (err) {
- reject(err);
- }
- });
- },
- (error) => {
- this.createAndEmitError("audio_processing_error" /* AUDIO_PROCESSING_ERROR */, "Audio transcription failed", error);
+ return this.traced(async () => {
+ this.validateConnectionState();
+ let transcriptionText = "";
+ const onWriting = (data) => {
+ if (data.role === "user") {
+ transcriptionText += data.text;
+ this.log("Received transcription text:", { text: data.text, total: transcriptionText });
  }
- );
- return result;
- } finally {
- this.off("writing", onWriting);
- this.off("error", onError);
- this.off("session", onSession);
- }
+ };
+ const onError = (error) => {
+ throw new Error(`Transcription failed: ${error.message}`);
+ };
+ const onSession = (data) => {
+ if (data.state === "disconnected") {
+ throw new Error("Session disconnected during transcription");
+ }
+ };
+ this.on("writing", onWriting);
+ this.on("error", onError);
+ this.on("session", onSession);
+ try {
+ const result = await this.audioStreamManager.handleAudioTranscription(
+ audioStream,
+ (base64Audio) => {
+ return new Promise((resolve, reject) => {
+ try {
+ const message = this.audioStreamManager.createAudioMessage(base64Audio, "input");
+ const cleanup = () => {
+ this.off("turnComplete", onTurnComplete);
+ this.off("error", onErr);
+ };
+ const onTurnComplete = () => {
+ cleanup();
+ resolve(transcriptionText.trim());
+ };
+ const onErr = (e) => {
+ cleanup();
+ reject(new Error(e.message));
+ };
+ this.on("turnComplete", onTurnComplete);
+ this.on("error", onErr);
+ this.sendEvent("client_content", message);
+ this.log("Sent audio for transcription");
+ } catch (err) {
+ reject(err);
+ }
+ });
+ },
+ (error) => {
+ this.createAndEmitError("audio_processing_error" /* AUDIO_PROCESSING_ERROR */, "Audio transcription failed", error);
+ }
+ );
+ return result;
+ } finally {
+ this.off("writing", onWriting);
+ this.off("error", onError);
+ this.off("session", onSession);
+ }
+ }, "gemini-live.listen")();
  }
  /**
  * Get available speakers/voices
  */
  async getSpeakers() {
- return [
- { voiceId: "Puck", description: "Conversational, friendly" },
- { voiceId: "Charon", description: "Deep, authoritative" },
- { voiceId: "Kore", description: "Neutral, professional" },
- { voiceId: "Fenrir", description: "Warm, approachable" }
- ];
+ return this.traced(async () => {
+ return [
+ { voiceId: "Puck", description: "Conversational, friendly" },
+ { voiceId: "Charon", description: "Deep, authoritative" },
+ { voiceId: "Kore", description: "Neutral, professional" },
+ { voiceId: "Fenrir", description: "Warm, approachable" }
+ ];
+ }, "gemini-live.getSpeakers")();
  }
  /**
  * Resume a previous session using a session handle
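Each of the methods above now returns this.traced(async () => { ... }, "gemini-live.<method>")(), i.e. the body is wrapped in a tracing helper and the returned wrapper is invoked immediately. The helper itself comes from the MastraVoice base class and is not part of this diff; a rough sketch of the general pattern, with the timing log standing in for whatever span recording the real helper does:

    // Illustrative only: traced(fn, spanName) returns a wrapper that runs fn inside a named span.
    function traced(fn, spanName) {
      return async (...args) => {
        const startedAt = Date.now();
        try {
          return await fn(...args);
        } finally {
          // Stand-in for real span recording.
          console.debug(`[trace] ${spanName} finished in ${Date.now() - startedAt}ms`);
        }
      };
    }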
@@ -2382,7 +2393,13 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  let result;
  if (tool.execute) {
  this.log("Executing tool", { toolName, toolArgs });
- result = await tool.execute(toolArgs, { requestContext: this.requestContext });
+ result = await tool.execute(
+ { context: toolArgs, runtimeContext: this.runtimeContext },
+ {
+ toolCallId: toolId,
+ messages: []
+ }
+ );
  this.log("Tool executed successfully", { toolName, result });
  } else {
  this.log("Tool has no execute function", { toolName });
@@ -2473,6 +2490,31 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  }
  return "text";
  }
+ /**
+ * Resolve Vertex AI location with sensible default
+ * @private
+ */
+ getVertexLocation() {
+ return this.options.location?.trim() || "us-central1";
+ }
+ /**
+ * Resolve the correct model identifier for Gemini API or Vertex AI
+ * @private
+ */
+ resolveModelIdentifier() {
+ const model = this.options.model ?? DEFAULT_MODEL;
+ if (!this.options.vertexAI) {
+ return `models/${model}`;
+ }
+ if (!this.options.project) {
+ throw this.createAndEmitError(
+ "project_id_missing" /* PROJECT_ID_MISSING */,
+ "Google Cloud project ID is required when using Vertex AI."
+ );
+ }
+ const location = this.getVertexLocation();
+ return `projects/${this.options.project}/locations/${location}/publishers/google/models/${model}`;
+ }
  /**
  * Send initial configuration to Gemini Live API
  * @private
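The two helpers added above resolve different model identifiers per backend: models/<model> for the Gemini API, and projects/<project>/locations/<location>/publishers/google/models/<model> for Vertex AI. A standalone restatement of that logic; the default model string is a placeholder, since the package's DEFAULT_MODEL constant is not shown in this diff:

    function resolveModelIdentifier(options) {
      const model = options.model ?? "gemini-2.0-flash-exp"; // placeholder for DEFAULT_MODEL
      if (!options.vertexAI) {
        return `models/${model}`;
      }
      if (!options.project) {
        throw new Error("Google Cloud project ID is required when using Vertex AI.");
      }
      const location = options.location?.trim() || "us-central1";
      return `projects/${options.project}/locations/${location}/publishers/google/models/${model}`;
    }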
@@ -2483,7 +2525,7 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  }
  const setupMessage = {
  setup: {
- model: `models/${this.options.model}`
+ model: this.resolveModelIdentifier()
  }
  };
  if (this.options.instructions) {
@@ -2659,14 +2701,14 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  * inputSchema: z.object({
  * location: z.string().describe("The city and state, e.g. San Francisco, CA"),
  * }),
- * execute: async (inputData) => {
+ * execute: async ({ context }) => {
  * // Fetch weather data from an API
  * const response = await fetch(
- * `https://api.weather.com?location=${encodeURIComponent(inputData.location)}`,
+ * `https://api.weather.com?location=${encodeURIComponent(context.location)}`,
  * );
  * const data = await response.json();
  * return {
- * message: `The current temperature in ${inputData.location} is ${data.temperature}°F with ${data.conditions}.`,
+ * message: `The current temperature in ${context.location} is ${data.temperature}°F with ${data.conditions}.`,
  * };
  * },
  * });
@@ -2684,7 +2726,7 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
  * Get the current tools configured for this voice instance
  * @returns Object containing the current tools
  */
- listTools() {
+ getTools() {
  return this.tools;
  }
  log(message, ...args) {
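Caller-side, the accessor rename in the last hunk is the only visible change; a one-line sketch assuming an already constructed voice instance:

    // Previously voice.listTools(); the method is now named getTools().
    const tools = voice.getTools();
    console.log(Object.keys(tools ?? {}));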