@just-every/ensemble 0.2.79 → 0.2.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137):
  1. package/dist/config/tool_execution.d.ts.map +1 -1
  2. package/dist/config/tool_execution.js +2 -11
  3. package/dist/config/tool_execution.js.map +1 -1
  4. package/dist/core/ensemble_embed.d.ts.map +1 -1
  5. package/dist/core/ensemble_embed.js +2 -4
  6. package/dist/core/ensemble_embed.js.map +1 -1
  7. package/dist/core/ensemble_image.d.ts.map +1 -1
  8. package/dist/core/ensemble_image.js +1 -1
  9. package/dist/core/ensemble_image.js.map +1 -1
  10. package/dist/core/ensemble_listen.d.ts.map +1 -1
  11. package/dist/core/ensemble_listen.js +2 -4
  12. package/dist/core/ensemble_listen.js.map +1 -1
  13. package/dist/core/ensemble_live.d.ts +14 -0
  14. package/dist/core/ensemble_live.d.ts.map +1 -0
  15. package/dist/core/ensemble_live.js +382 -0
  16. package/dist/core/ensemble_live.js.map +1 -0
  17. package/dist/core/ensemble_request.d.ts.map +1 -1
  18. package/dist/core/ensemble_request.js +5 -13
  19. package/dist/core/ensemble_request.js.map +1 -1
  20. package/dist/core/ensemble_voice.d.ts.map +1 -1
  21. package/dist/core/ensemble_voice.js +1 -1
  22. package/dist/core/ensemble_voice.js.map +1 -1
  23. package/dist/data/model_data.d.ts.map +1 -1
  24. package/dist/data/model_data.js +85 -11
  25. package/dist/data/model_data.js.map +1 -1
  26. package/dist/index.d.ts +6 -5
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +6 -5
  29. package/dist/index.js.map +1 -1
  30. package/dist/model_providers/base_provider.d.ts.map +1 -1
  31. package/dist/model_providers/base_provider.js +1 -1
  32. package/dist/model_providers/base_provider.js.map +1 -1
  33. package/dist/model_providers/claude.d.ts.map +1 -1
  34. package/dist/model_providers/claude.js +48 -101
  35. package/dist/model_providers/claude.js.map +1 -1
  36. package/dist/model_providers/deepseek.d.ts.map +1 -1
  37. package/dist/model_providers/deepseek.js +7 -18
  38. package/dist/model_providers/deepseek.js.map +1 -1
  39. package/dist/model_providers/elevenlabs.d.ts.map +1 -1
  40. package/dist/model_providers/elevenlabs.js +3 -7
  41. package/dist/model_providers/elevenlabs.js.map +1 -1
  42. package/dist/model_providers/gemini.d.ts +2 -1
  43. package/dist/model_providers/gemini.d.ts.map +1 -1
  44. package/dist/model_providers/gemini.js +512 -147
  45. package/dist/model_providers/gemini.js.map +1 -1
  46. package/dist/model_providers/grok.d.ts.map +1 -1
  47. package/dist/model_providers/grok.js +1 -2
  48. package/dist/model_providers/grok.js.map +1 -1
  49. package/dist/model_providers/model_provider.d.ts.map +1 -1
  50. package/dist/model_providers/model_provider.js +10 -20
  51. package/dist/model_providers/model_provider.js.map +1 -1
  52. package/dist/model_providers/openai.d.ts +2 -1
  53. package/dist/model_providers/openai.d.ts.map +1 -1
  54. package/dist/model_providers/openai.js +261 -100
  55. package/dist/model_providers/openai.js.map +1 -1
  56. package/dist/model_providers/openai_chat.d.ts.map +1 -1
  57. package/dist/model_providers/openai_chat.js +39 -72
  58. package/dist/model_providers/openai_chat.js.map +1 -1
  59. package/dist/model_providers/test_provider.d.ts.map +1 -1
  60. package/dist/model_providers/test_provider.js +7 -17
  61. package/dist/model_providers/test_provider.js.map +1 -1
  62. package/dist/tsconfig.tsbuildinfo +1 -1
  63. package/dist/types/errors.d.ts.map +1 -1
  64. package/dist/types/errors.js.map +1 -1
  65. package/dist/types/types.d.ts +162 -7
  66. package/dist/types/types.d.ts.map +1 -1
  67. package/dist/utils/agent.d.ts.map +1 -1
  68. package/dist/utils/agent.js +5 -16
  69. package/dist/utils/agent.js.map +1 -1
  70. package/dist/utils/citation_tracker.d.ts.map +1 -1
  71. package/dist/utils/citation_tracker.js.map +1 -1
  72. package/dist/utils/config_manager.d.ts.map +1 -1
  73. package/dist/utils/config_manager.js +12 -4
  74. package/dist/utils/config_manager.js.map +1 -1
  75. package/dist/utils/cost_tracker.d.ts.map +1 -1
  76. package/dist/utils/cost_tracker.js +13 -26
  77. package/dist/utils/cost_tracker.js.map +1 -1
  78. package/dist/utils/create_tool_function.d.ts.map +1 -1
  79. package/dist/utils/create_tool_function.js +4 -16
  80. package/dist/utils/create_tool_function.js.map +1 -1
  81. package/dist/utils/delta_buffer.d.ts.map +1 -1
  82. package/dist/utils/delta_buffer.js +1 -2
  83. package/dist/utils/delta_buffer.js.map +1 -1
  84. package/dist/utils/ensemble_result.d.ts.map +1 -1
  85. package/dist/utils/ensemble_result.js +9 -24
  86. package/dist/utils/ensemble_result.js.map +1 -1
  87. package/dist/utils/event_controller.d.ts.map +1 -1
  88. package/dist/utils/event_controller.js.map +1 -1
  89. package/dist/utils/external_models.d.ts.map +1 -1
  90. package/dist/utils/external_models.js.map +1 -1
  91. package/dist/utils/image_to_text.d.ts.map +1 -1
  92. package/dist/utils/image_to_text.js +1 -2
  93. package/dist/utils/image_to_text.js.map +1 -1
  94. package/dist/utils/image_utils.d.ts.map +1 -1
  95. package/dist/utils/image_utils.js.map +1 -1
  96. package/dist/utils/image_validation.d.ts.map +1 -1
  97. package/dist/utils/image_validation.js.map +1 -1
  98. package/dist/utils/llm_logger.d.ts.map +1 -1
  99. package/dist/utils/llm_logger.js.map +1 -1
  100. package/dist/utils/message_history.d.ts.map +1 -1
  101. package/dist/utils/message_history.js +9 -20
  102. package/dist/utils/message_history.js.map +1 -1
  103. package/dist/utils/model_class_config.d.ts.map +1 -1
  104. package/dist/utils/model_class_config.js +1 -1
  105. package/dist/utils/model_class_config.js.map +1 -1
  106. package/dist/utils/pause_controller.d.ts.map +1 -1
  107. package/dist/utils/pause_controller.js.map +1 -1
  108. package/dist/utils/quota_tracker.d.ts.map +1 -1
  109. package/dist/utils/quota_tracker.js +19 -49
  110. package/dist/utils/quota_tracker.js.map +1 -1
  111. package/dist/utils/retry_handler.d.ts.map +1 -1
  112. package/dist/utils/retry_handler.js.map +1 -1
  113. package/dist/utils/running_tool_tracker.d.ts.map +1 -1
  114. package/dist/utils/running_tool_tracker.js.map +1 -1
  115. package/dist/utils/sequential_queue.d.ts.map +1 -1
  116. package/dist/utils/sequential_queue.js.map +1 -1
  117. package/dist/utils/stream_handler.d.ts.map +1 -1
  118. package/dist/utils/stream_handler.js +1 -1
  119. package/dist/utils/stream_handler.js.map +1 -1
  120. package/dist/utils/summary_utils.d.ts.map +1 -1
  121. package/dist/utils/summary_utils.js +3 -8
  122. package/dist/utils/summary_utils.js.map +1 -1
  123. package/dist/utils/test_utils.d.ts.map +1 -1
  124. package/dist/utils/test_utils.js +1 -3
  125. package/dist/utils/test_utils.js.map +1 -1
  126. package/dist/utils/tool_execution_manager.d.ts.map +1 -1
  127. package/dist/utils/tool_execution_manager.js +3 -9
  128. package/dist/utils/tool_execution_manager.js.map +1 -1
  129. package/dist/utils/tool_parameter_utils.d.ts.map +1 -1
  130. package/dist/utils/tool_parameter_utils.js +2 -6
  131. package/dist/utils/tool_parameter_utils.js.map +1 -1
  132. package/dist/utils/tool_result_processor.d.ts.map +1 -1
  133. package/dist/utils/tool_result_processor.js +7 -18
  134. package/dist/utils/tool_result_processor.js.map +1 -1
  135. package/dist/utils/verification.d.ts.map +1 -1
  136. package/dist/utils/verification.js.map +1 -1
  137. package/package.json +4 -2
@@ -1,10 +1,10 @@
1
- import { GoogleGenAI, Type, FunctionCallingConfigMode, Modality, } from '@google/genai';
1
+ import { GoogleGenAI, Type, FunctionCallingConfigMode, Modality, MediaResolution, } from '@google/genai';
2
2
  import { v4 as uuidv4 } from 'uuid';
3
3
  import { BaseModelProvider } from './base_provider.js';
4
4
  import { costTracker } from '../index.js';
5
- import { log_llm_error, log_llm_request, log_llm_response, } from '../utils/llm_logger.js';
5
+ import { log_llm_error, log_llm_request, log_llm_response } from '../utils/llm_logger.js';
6
6
  import { isPaused } from '../utils/pause_controller.js';
7
- import { appendMessageWithImage, resizeAndTruncateForGemini, } from '../utils/image_utils.js';
7
+ import { appendMessageWithImage, resizeAndTruncateForGemini } from '../utils/image_utils.js';
8
8
  function convertParameterToGeminiFormat(param) {
9
9
  let type = Type.STRING;
10
10
  switch (param.type) {
@@ -85,8 +85,7 @@ function convertParameterToGeminiFormat(param) {
85
85
  if (param.properties && typeof param.properties === 'object') {
86
86
  result.properties = {};
87
87
  for (const [propName, propSchema] of Object.entries(param.properties)) {
88
- result.properties[propName] =
89
- convertParameterToGeminiFormat(propSchema);
88
+ result.properties[propName] = convertParameterToGeminiFormat(propSchema);
90
89
  }
91
90
  }
92
91
  else {
@@ -161,9 +160,7 @@ async function convertToGeminiFunctionDeclarations(tools) {
161
160
  parameters: {
162
161
  type: Type.OBJECT,
163
162
  properties,
164
- required: Array.isArray(resolvedParams?.required)
165
- ? resolvedParams.required
166
- : [],
163
+ required: Array.isArray(resolvedParams?.required) ? resolvedParams.required : [],
167
164
  },
168
165
  };
169
166
  }));
@@ -218,10 +215,7 @@ async function convertToGeminiContents(model, messages) {
218
215
  let args = {};
219
216
  try {
220
217
  const parsedArgs = JSON.parse(msg.arguments || '{}');
221
- args =
222
- typeof parsedArgs === 'object' && parsedArgs !== null
223
- ? parsedArgs
224
- : { value: parsedArgs };
218
+ args = typeof parsedArgs === 'object' && parsedArgs !== null ? parsedArgs : { value: parsedArgs };
225
219
  }
226
220
  catch (e) {
227
221
  console.error(`Failed to parse function call arguments for ${msg.name}:`, msg.arguments, e);
@@ -264,8 +258,7 @@ async function convertToGeminiContents(model, messages) {
264
258
  contents = await appendMessageWithImage(model, contents, message, {
265
259
  read: () => textOutput,
266
260
  write: value => {
267
- message.parts[0].functionResponse.response.content =
268
- value;
261
+ message.parts[0].functionResponse.response.content = value;
269
262
  return message;
270
263
  },
271
264
  }, addImagesToInput);
@@ -275,9 +268,7 @@ async function convertToGeminiContents(model, messages) {
275
268
  if (typeof msg.content === 'string') {
276
269
  textContent = msg.content;
277
270
  }
278
- else if (msg.content &&
279
- typeof msg.content === 'object' &&
280
- 'text' in msg.content) {
271
+ else if (msg.content && typeof msg.content === 'object' && 'text' in msg.content) {
281
272
  textContent = msg.content.text;
282
273
  }
283
274
  else {
@@ -326,15 +317,14 @@ export class GeminiProvider extends BaseModelProvider {
326
317
  this._client = new GoogleGenAI({
327
318
  apiKey: apiKey,
328
319
  vertexai: false,
320
+ httpOptions: { apiVersion: 'v1alpha' },
329
321
  });
330
322
  }
331
323
  return this._client;
332
324
  }
333
325
  async createEmbedding(input, model, opts) {
334
326
  try {
335
- let actualModelId = model.startsWith('gemini/')
336
- ? model.substring(7)
337
- : model;
327
+ let actualModelId = model.startsWith('gemini/') ? model.substring(7) : model;
338
328
  let thinkingConfig = null;
339
329
  for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) {
340
330
  if (actualModelId.endsWith(suffix)) {
@@ -355,9 +345,7 @@ export class GeminiProvider extends BaseModelProvider {
355
345
  payload.config.thinkingConfig = thinkingConfig;
356
346
  }
357
347
  const response = await this.client.models.embedContent(payload);
358
- console.log('[Gemini] Embedding response structure:', JSON.stringify(response, (key, value) => key === 'values' &&
359
- Array.isArray(value) &&
360
- value.length > 10
348
+ console.log('[Gemini] Embedding response structure:', JSON.stringify(response, (key, value) => key === 'values' && Array.isArray(value) && value.length > 10
361
349
  ? `[${value.length} items]`
362
350
  : value, 2));
363
351
  if (!response.embeddings || !Array.isArray(response.embeddings)) {
@@ -376,11 +364,8 @@ export class GeminiProvider extends BaseModelProvider {
376
364
  }
377
365
  else {
378
366
  console.warn('[Gemini] Could not find expected "values" property in embeddings response');
379
- extractedValues =
380
- response.embeddings;
381
- dimensions = Array.isArray(extractedValues[0])
382
- ? extractedValues[0].length
383
- : 0;
367
+ extractedValues = response.embeddings;
368
+ dimensions = Array.isArray(extractedValues[0]) ? extractedValues[0].length : 0;
384
369
  }
385
370
  }
386
371
  costTracker.addUsage({
@@ -396,8 +381,7 @@ export class GeminiProvider extends BaseModelProvider {
396
381
  }
397
382
  else {
398
383
  let result;
399
- if (Array.isArray(extractedValues) &&
400
- extractedValues.length >= 1) {
384
+ if (Array.isArray(extractedValues) && extractedValues.length >= 1) {
401
385
  const firstValue = extractedValues[0];
402
386
  if (Array.isArray(firstValue)) {
403
387
  result = firstValue;
@@ -417,10 +401,7 @@ export class GeminiProvider extends BaseModelProvider {
417
401
  adjustedResult = result.slice(0, 3072);
418
402
  }
419
403
  else {
420
- adjustedResult = [
421
- ...result,
422
- ...Array(3072 - result.length).fill(0),
423
- ];
404
+ adjustedResult = [...result, ...Array(3072 - result.length).fill(0)];
424
405
  }
425
406
  }
426
407
  return adjustedResult;
@@ -444,8 +425,7 @@ export class GeminiProvider extends BaseModelProvider {
444
425
  catch (error) {
445
426
  attempts++;
446
427
  const errorMsg = error instanceof Error ? error.message : String(error);
447
- if (errorMsg.includes('Incomplete JSON segment') &&
448
- attempts <= maxRetries) {
428
+ if (errorMsg.includes('Incomplete JSON segment') && attempts <= maxRetries) {
449
429
  console.warn(`[Gemini] Incomplete JSON segment error, retrying (${attempts}/${maxRetries})...`);
450
430
  await new Promise(resolve => setTimeout(resolve, 1000 * attempts));
451
431
  continue;
@@ -456,9 +436,7 @@ export class GeminiProvider extends BaseModelProvider {
456
436
  }
457
437
  async *createResponseStream(messages, model, agent) {
458
438
  const { getToolsFromAgent } = await import('../utils/agent.js');
459
- const tools = agent
460
- ? await getToolsFromAgent(agent)
461
- : [];
439
+ const tools = agent ? await getToolsFromAgent(agent) : [];
462
440
  const settings = agent?.modelSettings;
463
441
  let messageId = uuidv4();
464
442
  let contentBuffer = '';
@@ -526,8 +504,7 @@ export class GeminiProvider extends BaseModelProvider {
526
504
  if ('additionalProperties' in obj) {
527
505
  delete obj.additionalProperties;
528
506
  }
529
- if (obj.properties &&
530
- typeof obj.properties === 'object') {
507
+ if (obj.properties && typeof obj.properties === 'object') {
531
508
  Object.values(obj.properties).forEach(prop => {
532
509
  removeAdditionalProperties(prop);
533
510
  });
@@ -559,9 +536,7 @@ export class GeminiProvider extends BaseModelProvider {
559
536
  settings.tool_choice?.type === 'function' &&
560
537
  settings.tool_choice?.function?.name) {
561
538
  toolChoice = FunctionCallingConfigMode.ANY;
562
- allowedFunctionNames = [
563
- settings.tool_choice.function.name,
564
- ];
539
+ allowedFunctionNames = [settings.tool_choice.function.name];
565
540
  }
566
541
  else if (settings.tool_choice === 'required') {
567
542
  toolChoice = FunctionCallingConfigMode.ANY;
@@ -579,8 +554,7 @@ export class GeminiProvider extends BaseModelProvider {
579
554
  },
580
555
  };
581
556
  if (allowedFunctionNames.length > 0) {
582
- config.toolConfig.functionCallingConfig.allowedFunctionNames =
583
- allowedFunctionNames;
557
+ config.toolConfig.functionCallingConfig.allowedFunctionNames = allowedFunctionNames;
584
558
  }
585
559
  }
586
560
  }
@@ -697,8 +671,7 @@ export class GeminiProvider extends BaseModelProvider {
697
671
  message_id: messageId,
698
672
  order: eventOrder++,
699
673
  };
700
- contentBuffer +=
701
- '\n\nSearch Results:\n' + formatted + '\n';
674
+ contentBuffer += '\n\nSearch Results:\n' + formatted + '\n';
702
675
  }
703
676
  }
704
677
  }
@@ -743,9 +716,7 @@ export class GeminiProvider extends BaseModelProvider {
743
716
  }
744
717
  catch (error) {
745
718
  log_llm_error(requestId, error);
746
- const errorMessage = error instanceof Error
747
- ? error.stack || error.message
748
- : String(error);
719
+ const errorMessage = error instanceof Error ? error.stack || error.message : String(error);
749
720
  if (errorMessage.includes('Incomplete JSON segment')) {
750
721
  console.error('[Gemini] Stream terminated with incomplete JSON. This may indicate network issues or timeouts.');
751
722
  }
@@ -796,8 +767,7 @@ export class GeminiProvider extends BaseModelProvider {
796
767
  },
797
768
  });
798
769
  const images = [];
799
- if (response.generatedImages &&
800
- response.generatedImages.length > 0) {
770
+ if (response.generatedImages && response.generatedImages.length > 0) {
801
771
  for (const generatedImage of response.generatedImages) {
802
772
  if (generatedImage.image?.imageBytes) {
803
773
  const base64Image = `data:image/png;base64,${generatedImage.image.imageBytes}`;
@@ -879,14 +849,12 @@ export class GeminiProvider extends BaseModelProvider {
879
849
  throw new Error('No audio generated from Gemini TTS');
880
850
  }
881
851
  const candidate = response.candidates[0];
882
- if (!candidate.content.parts ||
883
- candidate.content.parts.length === 0) {
852
+ if (!candidate.content.parts || candidate.content.parts.length === 0) {
884
853
  throw new Error('No audio parts in Gemini TTS response');
885
854
  }
886
855
  let audioData;
887
856
  for (const part of candidate.content.parts) {
888
- if (part.inlineData &&
889
- part.inlineData.mimeType?.includes('audio')) {
857
+ if (part.inlineData && part.inlineData.mimeType?.includes('audio')) {
890
858
  audioData = part.inlineData.data;
891
859
  break;
892
860
  }
@@ -976,38 +944,56 @@ export class GeminiProvider extends BaseModelProvider {
976
944
  }
977
945
  async *createTranscription(audio, agent, model, opts) {
978
946
  let session = null;
979
- let audioBuffer = Buffer.alloc(0);
980
947
  let isConnected = false;
981
948
  try {
982
- const ai = new GoogleGenAI({ apiKey: this.apiKey });
983
- const realtimeConfig = opts?.realtimeConfig
984
- ?.automaticActivityDetection || {
985
- prefixPaddingMs: 20,
986
- silenceDurationMs: 100,
949
+ const ai = new GoogleGenAI({
950
+ apiKey: this.apiKey,
951
+ httpOptions: { apiVersion: 'v1alpha' },
952
+ });
953
+ const realtimeInputConfig = opts?.realtimeInputConfig || {
954
+ automaticActivityDetection: {
955
+ disabled: false,
956
+ startOfSpeechSensitivity: 'START_SENSITIVITY_HIGH',
957
+ endOfSpeechSensitivity: 'END_SENSITIVITY_LOW',
958
+ },
959
+ };
960
+ const speechConfig = opts?.speechConfig || {
961
+ languageCode: 'en-US',
987
962
  };
988
963
  const systemInstruction = agent.instructions ||
989
964
  `You are a real-time transcription assistant. Your only task is to transcribe speech as you hear it. DO NOT ADD YOUR OWN RESPONSE OR COMMENTARY. TRANSCRIBE WHAT YOU HEAR ONLY.
990
965
  Respond immediately with transcribed text as you process the audio.
991
966
  If quick corrections are used e.g. "Let's go to Point A, no Point B" then just remove incorrect part e.g. respond with "Let's go to Point B".
992
- When it makes the transcription clearer, remove filler words (like "um") add punctuation, correct obvious grammar issues and add in missing words.`;
967
+ When it makes the transcription clearer, remove filler words (like "um") add punctuation, correct obvious grammar issues and add in missing words.
968
+
969
+ EXAMPLES:
970
+ User: What capital of France
971
+ Model: What's the capital of France?
972
+
973
+ User: How about um we then do no actually how about you tell me the weather
974
+ Model: How about you tell me the weather?
975
+
976
+ User: Ok ignore all that lets start again
977
+ Model: Ok ignore all that, let's start again.`;
993
978
  console.log('[Gemini] Connecting to Live API for transcription...');
994
979
  const connectionPromise = new Promise((resolve, reject) => {
995
980
  const timeout = setTimeout(() => {
996
981
  reject(new Error('Connection timeout'));
997
- }, 30000);
982
+ }, 10000);
983
+ const config = {
984
+ responseModalities: [Modality.TEXT],
985
+ mediaResolution: MediaResolution.MEDIA_RESOLUTION_MEDIUM,
986
+ speechConfig,
987
+ realtimeInputConfig,
988
+ systemInstruction: {
989
+ parts: [{ text: systemInstruction }],
990
+ },
991
+ inputAudioTranscription: {},
992
+ };
998
993
  ai.live
999
994
  .connect({
1000
995
  model: model,
1001
- config: {
1002
- responseModalities: [Modality.TEXT],
1003
- systemInstruction: {
1004
- parts: [{ text: systemInstruction }],
1005
- },
1006
- realtimeInputConfig: {
1007
- automaticActivityDetection: realtimeConfig,
1008
- },
1009
- inputAudioTranscription: true,
1010
- },
996
+ config,
1011
997
  callbacks: {
1012
998
  onopen: () => {
1013
999
  clearTimeout(timeout);
@@ -1016,27 +1002,17 @@ When it makes the transcription clearer, remove filler words (like "um") add pun
1016
1002
  resolve();
1017
1003
  },
1018
1004
  onmessage: async (msg) => {
1019
- if (msg.serverContent?.inputAudioTranscription) {
1020
- const transcriptionText = msg.serverContent
1021
- .inputAudioTranscription.text ||
1022
- msg.serverContent
1023
- .inputAudioTranscription
1024
- .transcript ||
1025
- '';
1026
- if (transcriptionText) {
1027
- const previewEvent = {
1028
- type: 'transcription_preview',
1029
- timestamp: new Date().toISOString(),
1030
- text: transcriptionText,
1031
- isFinal: true,
1032
- };
1033
- transcriptEvents.push(previewEvent);
1034
- console.debug('[Gemini] Received input transcription:', transcriptionText);
1035
- }
1005
+ if (msg.serverContent?.inputTranscription?.text) {
1006
+ const previewEvent = {
1007
+ type: 'transcription_preview',
1008
+ timestamp: new Date().toISOString(),
1009
+ text: msg.serverContent.inputTranscription.text,
1010
+ isFinal: true,
1011
+ };
1012
+ transcriptEvents.push(previewEvent);
1036
1013
  }
1037
1014
  if (msg.serverContent?.modelTurn?.parts) {
1038
- for (const part of msg.serverContent
1039
- .modelTurn.parts) {
1015
+ for (const part of msg.serverContent.modelTurn.parts) {
1040
1016
  if (part.text && part.text.trim()) {
1041
1017
  const deltaEvent = {
1042
1018
  type: 'transcription_delta',
@@ -1058,31 +1034,39 @@ When it makes the transcription clearer, remove filler words (like "um") add pun
1058
1034
  if (msg.usageMetadata) {
1059
1035
  costTracker.addUsage({
1060
1036
  model: model,
1061
- input_tokens: msg.usageMetadata
1062
- .promptTokenCount || 0,
1063
- output_tokens: msg.usageMetadata
1064
- .responseTokenCount || 0,
1037
+ input_tokens: msg.usageMetadata.promptTokenCount || 0,
1038
+ output_tokens: msg.usageMetadata.responseTokenCount || 0,
1065
1039
  input_modality: 'audio',
1066
1040
  output_modality: 'text',
1067
1041
  metadata: {
1068
- totalTokens: msg.usageMetadata
1069
- .totalTokenCount || 0,
1042
+ totalTokens: msg.usageMetadata.totalTokenCount || 0,
1070
1043
  source: 'gemini-live-transcription',
1071
1044
  },
1072
1045
  });
1073
1046
  }
1074
1047
  },
1075
1048
  onerror: (err) => {
1076
- console.error('[Gemini] Live API error:', err);
1049
+ console.error('[Gemini] Live API error:', {
1050
+ code: err.code,
1051
+ reason: err.reason,
1052
+ wasClean: err.wasClean,
1053
+ });
1077
1054
  connectionError = err;
1078
1055
  },
1079
- onclose: () => {
1056
+ onclose: (event) => {
1080
1057
  console.log('[Gemini] Live session closed');
1058
+ if (event) {
1059
+ console.log('[Gemini] Close event details:', {
1060
+ code: event.code,
1061
+ reason: event.reason,
1062
+ wasClean: event.wasClean,
1063
+ });
1064
+ }
1081
1065
  isConnected = false;
1082
1066
  },
1083
1067
  },
1084
1068
  })
1085
- .then(s => {
1069
+ .then(async (s) => {
1086
1070
  session = s;
1087
1071
  });
1088
1072
  });
@@ -1091,30 +1075,19 @@ When it makes the transcription clearer, remove filler words (like "um") add pun
1091
1075
  await connectionPromise;
1092
1076
  const audioStream = normalizeAudioSource(audio);
1093
1077
  const reader = audioStream.getReader();
1094
- const chunkSize = opts?.bufferConfig?.chunkSize || 8000;
1095
- const flushInterval = opts?.bufferConfig?.flushInterval || 500;
1096
- let flushTimer = null;
1097
- const scheduleFlush = () => {
1098
- if (flushTimer)
1099
- clearTimeout(flushTimer);
1100
- flushTimer = setTimeout(async () => {
1101
- if (audioBuffer.length > 0 && session && isConnected) {
1102
- await sendAudioChunk(audioBuffer);
1103
- audioBuffer = Buffer.alloc(0);
1104
- }
1105
- }, flushInterval);
1106
- };
1107
1078
  const sendAudioChunk = async (chunk) => {
1108
1079
  try {
1080
+ const base64Data = chunk.toString('base64');
1109
1081
  await session.sendRealtimeInput({
1110
1082
  media: {
1111
1083
  mimeType: 'audio/pcm;rate=16000',
1112
- data: chunk.toString('base64'),
1084
+ data: base64Data,
1113
1085
  },
1114
1086
  });
1115
1087
  }
1116
1088
  catch (err) {
1117
1089
  console.error('[Gemini] Error sending audio chunk:', err);
1090
+ connectionError = err;
1118
1091
  throw err;
1119
1092
  }
1120
1093
  };
@@ -1123,43 +1096,30 @@ When it makes the transcription clearer, remove filler words (like "um") add pun
1123
1096
  const { done, value } = await reader.read();
1124
1097
  if (done)
1125
1098
  break;
1126
- if (value) {
1127
- audioBuffer = Buffer.concat([
1128
- audioBuffer,
1129
- Buffer.from(value),
1130
- ]);
1131
- while (audioBuffer.length >= chunkSize) {
1132
- const chunk = audioBuffer.slice(0, chunkSize);
1133
- audioBuffer = audioBuffer.slice(chunkSize);
1134
- if (session && isConnected) {
1135
- await sendAudioChunk(chunk);
1136
- }
1137
- }
1138
- scheduleFlush();
1099
+ if (value && session && isConnected) {
1100
+ const chunk = value instanceof Buffer ? value : Buffer.from(value);
1101
+ await sendAudioChunk(chunk);
1139
1102
  }
1140
- while (transcriptEvents.length > 0) {
1141
- const event = transcriptEvents.shift();
1142
- if (event)
1103
+ if (transcriptEvents.length > 0) {
1104
+ const events = transcriptEvents.splice(0, transcriptEvents.length);
1105
+ for (const event of events) {
1143
1106
  yield event;
1107
+ }
1144
1108
  }
1145
1109
  if (connectionError) {
1146
1110
  throw connectionError;
1147
1111
  }
1148
1112
  }
1149
- if (audioBuffer.length > 0 && session && isConnected) {
1150
- await sendAudioChunk(audioBuffer);
1151
- }
1152
1113
  await new Promise(resolve => setTimeout(resolve, 1000));
1153
- while (transcriptEvents.length > 0) {
1154
- const event = transcriptEvents.shift();
1155
- if (event)
1114
+ if (transcriptEvents.length > 0) {
1115
+ const events = transcriptEvents.splice(0, transcriptEvents.length);
1116
+ for (const event of events) {
1156
1117
  yield event;
1118
+ }
1157
1119
  }
1158
1120
  }
1159
1121
  finally {
1160
1122
  reader.releaseLock();
1161
- if (flushTimer)
1162
- clearTimeout(flushTimer);
1163
1123
  if (session) {
1164
1124
  session.close();
1165
1125
  }
@@ -1170,21 +1130,34 @@ When it makes the transcription clearer, remove filler words (like "um") add pun
1170
1130
  const errorEvent = {
1171
1131
  type: 'error',
1172
1132
  timestamp: new Date().toISOString(),
1173
- error: error instanceof Error
1174
- ? error.message
1175
- : 'Transcription failed',
1133
+ error: error instanceof Error ? error.message : 'Transcription failed',
1176
1134
  };
1177
1135
  yield errorEvent;
1178
1136
  }
1179
1137
  }
1138
+ async createLiveSession(config, agent, model, opts) {
1139
+ console.log(`[Gemini] Creating Live session with model ${model}`);
1140
+ const liveModels = [
1141
+ 'gemini-2.0-flash-live-001',
1142
+ 'gemini-live-2.5-flash-preview',
1143
+ 'gemini-2.5-flash-preview-native-audio-dialog',
1144
+ 'gemini-2.5-flash-exp-native-audio-thinking-dialog',
1145
+ 'gemini-2.0-flash-exp',
1146
+ ];
1147
+ if (!liveModels.some(m => model.includes(m))) {
1148
+ throw new Error(`Model ${model} does not support Live API. Supported models: ${liveModels.join(', ')}`);
1149
+ }
1150
+ const sessionId = uuidv4();
1151
+ const liveSession = new GeminiLiveSession(sessionId, this.client, model, config, agent, opts);
1152
+ await liveSession.initialize();
1153
+ return liveSession;
1154
+ }
1180
1155
  }
1181
1156
  function normalizeAudioSource(source) {
1182
1157
  if (source instanceof ReadableStream) {
1183
1158
  return source;
1184
1159
  }
1185
- if (typeof source === 'object' &&
1186
- source !== null &&
1187
- Symbol.asyncIterator in source) {
1160
+ if (typeof source === 'object' && source !== null && Symbol.asyncIterator in source) {
1188
1161
  return new ReadableStream({
1189
1162
  async start(controller) {
1190
1163
  try {
@@ -1214,5 +1187,397 @@ function normalizeAudioSource(source) {
1214
1187
  }
1215
1188
  throw new Error(`Unsupported audio source type: ${typeof source}`);
1216
1189
  }
1190
/**
 * One bidirectional Gemini Live session.
 *
 * Wraps the SDK session obtained from `ai.live.connect()` and turns its
 * callbacks into events that are consumed through `getEventStream()`.
 * Events are either handed straight to a waiting consumer or buffered in
 * `eventQueue` until one asks for them.
 */
class GeminiLiveSession {
    sessionId;
    ai;
    model;
    config;
    agent;
    options;
    // SDK session object; null until `connect()` has resolved.
    session = null;
    // Events produced while no consumer was waiting.
    eventQueue = [];
    // Resolvers of consumers currently parked inside getEventStream().
    eventResolvers = [];
    _isActive = true;
    sessionClosed = false;
    messageHistory = [];
    // Text turn being accumulated: { role, text } or null.
    currentTurn = null;

    /**
     * @param {string} sessionId - Identifier used in log output.
     * @param {object} ai - SDK client exposing `live.connect()`.
     * @param {string} model - Model name passed to `live.connect()`.
     * @param {object} config - Live configuration (tools, modalities, speech, ...).
     * @param {object} agent - Agent definition; `instructions` becomes the system instruction.
     * @param {object} [options] - Stored as-is; not read by this class.
     */
    constructor(sessionId, ai, model, config, agent, options) {
        this.sessionId = sessionId;
        this.ai = ai;
        this.model = model;
        this.config = config;
        this.agent = agent;
        this.options = options;
    }

    /**
     * Connect to the Live API and wait until the session is usable.
     *
     * Resolves only once the socket has opened AND the SDK session object has
     * been stored, so `sendAudio()` / `sendText()` can be called immediately.
     *
     * @returns {Promise<void>}
     * @throws {Error} 'Connection timeout' after 30s, the error `connect()`
     *   failed with, or an error when the connection closes before it is ready.
     */
    async initialize() {
        // Build the config before arming the timer so a bad config fails
        // immediately instead of leaving a 30s timer behind.
        const config = this.buildConnectConfig();
        console.log('[Gemini] Connecting with config:', JSON.stringify(config, null, 2));
        await new Promise((resolve, reject) => {
            let settled = false;
            let opened = false;
            const timeout = setTimeout(() => fail(new Error('Connection timeout')), 30000);
            // Settle the outer promise exactly once and always clear the timer.
            const settle = (settleFn, value) => {
                if (settled) {
                    return false;
                }
                settled = true;
                clearTimeout(timeout);
                settleFn(value);
                return true;
            };
            const fail = (error) => {
                if (!settle(reject, error)) {
                    return;
                }
                // A session that failed to initialize is unusable: release any
                // stream consumers and drop a half-open connection.
                this._isActive = false;
                this.sessionClosed = true;
                this.resolveAllWaitingEvents();
                this.closeQuietly(this.session);
            };
            const succeedWhenReady = () => {
                if (opened && this.session) {
                    settle(resolve);
                }
            };
            let connecting;
            try {
                connecting = this.ai.live.connect({
                    model: this.model,
                    config,
                    callbacks: {
                        onopen: () => {
                            opened = true;
                            console.log('[Gemini] Live session connected');
                            this.pushEvent({
                                type: 'live_ready',
                                timestamp: new Date().toISOString(),
                            });
                            succeedWhenReady();
                        },
                        onmessage: (msg) => {
                            this.handleMessage(msg);
                        },
                        onerror: (err) => {
                            console.error('[Gemini] Live API error:', err);
                            console.error('[Gemini] Error details:', this.stringifyForLog(err));
                            this.pushEvent({
                                type: 'error',
                                timestamp: new Date().toISOString(),
                                error: err?.message || String(err),
                                code: err?.code,
                                recoverable: true,
                            });
                        },
                        onclose: (event) => {
                            console.log('[Gemini] Live session closed', event);
                            if (event) {
                                console.log('[Gemini] Close event details:', {
                                    code: event.code,
                                    reason: event.reason,
                                    wasClean: event.wasClean,
                                });
                            }
                            this._isActive = false;
                            this.sessionClosed = true;
                            this.resolveAllWaitingEvents();
                            // No-op once initialization has already settled.
                            fail(new Error('Connection closed before the session was ready'));
                        },
                    },
                });
            }
            catch (error) {
                // connect() threw synchronously.
                fail(error);
                return;
            }
            Promise.resolve(connecting)
                .then((session) => {
                    if (settled || this.sessionClosed) {
                        // Too late (timed out, closed, or failed): do not leak the socket.
                        this.closeQuietly(session);
                        fail(new Error('Session was closed before the connection completed'));
                        return;
                    }
                    this.session = session;
                    succeedWhenReady();
                })
                .catch(fail);
        });
    }

    /**
     * Translate `this.config` / `this.agent` into the config object passed to
     * `ai.live.connect()`.
     * @returns {object}
     */
    buildConnectConfig() {
        const tools = [];
        if (this.config.tools) {
            for (const toolGroup of this.config.tools) {
                if (toolGroup.functionDeclarations) {
                    const functionDeclarations = toolGroup.functionDeclarations.map((func) => ({
                        name: func.name,
                        description: func.description,
                        parameters: convertParameterToGeminiFormat(func.parameters),
                    }));
                    tools.push({ functionDeclarations });
                }
                if (toolGroup.codeExecution) {
                    tools.push({ codeExecution: {} });
                }
                if (toolGroup.googleSearch) {
                    tools.push({ googleSearch: {} });
                }
            }
        }
        let systemInstruction = undefined;
        if (this.agent.instructions) {
            systemInstruction = {
                parts: [{ text: this.agent.instructions }],
            };
        }
        // Anything other than an explicit 'AUDIO' first modality (including a
        // missing list) falls back to text.
        const wantsAudio = this.config.responseModalities?.[0] === 'AUDIO';
        const config = {
            responseModalities: wantsAudio ? [Modality.AUDIO] : [Modality.TEXT],
            systemInstruction,
            tools: tools.length > 0 ? tools : undefined,
        };
        if (wantsAudio && this.config.speechConfig) {
            config.speechConfig = {
                voiceConfig: this.config.speechConfig.voiceConfig,
            };
        }
        if (this.config.realtimeInputConfig) {
            config.realtimeInputConfig = {
                automaticActivityDetection: this.config.realtimeInputConfig.automaticActivityDetection
                    ? {
                        disabled: this.config.realtimeInputConfig.automaticActivityDetection.disabled,
                    }
                    : undefined,
            };
        }
        if (this.config.inputAudioTranscription) {
            config.inputAudioTranscription = true;
        }
        if (this.config.outputAudioTranscription) {
            config.outputAudioTranscription = true;
        }
        if (this.config.enableAffectiveDialog) {
            config.enableAffectiveDialog = true;
        }
        if (this.config.proactivity) {
            config.proactivity = this.config.proactivity;
        }
        return config;
    }

    /**
     * Convert one server message into zero or more session events.
     * @param {object} msg - Message delivered to the `onmessage` callback.
     */
    handleMessage(msg) {
        console.log('[Gemini] Received message:', this.stringifyForLog(msg));
        if (msg.error) {
            console.error('[Gemini] Error in message:', msg.error);
            this.pushEvent({
                type: 'error',
                timestamp: new Date().toISOString(),
                error: msg.error.message || this.stringifyForLog(msg.error),
                code: msg.error.code || 'UNKNOWN_ERROR',
                recoverable: false,
            });
            return;
        }
        const content = msg.serverContent;
        const parts = content?.modelTurn?.parts;
        if (parts) {
            // Audio/text for every part first, then tool calls, matching the
            // order in which events have always been emitted.
            for (const part of parts) {
                this.emitPartContent(part);
            }
            for (const part of parts) {
                if (part.functionCall) {
                    this.emitToolCall(part.functionCall);
                }
            }
        }
        // The Live API reports function calls in a dedicated top-level
        // `toolCall` message (see Live API reference), not only inside parts.
        for (const functionCall of msg.toolCall?.functionCalls ?? []) {
            this.emitToolCall(functionCall);
        }
        // `inputTranscription` is the Live API field name; the older
        // `inputAudioTranscription` key is still accepted.
        const inputTranscription = content?.inputTranscription ?? content?.inputAudioTranscription;
        if (inputTranscription) {
            const text = inputTranscription.text || inputTranscription.transcript || '';
            if (text) {
                this.pushEvent({
                    type: 'transcription_input',
                    timestamp: new Date().toISOString(),
                    text,
                });
            }
        }
        if (content?.outputTranscription) {
            const text = content.outputTranscription.text || '';
            if (text) {
                this.pushEvent({
                    type: 'transcription_output',
                    timestamp: new Date().toISOString(),
                    text,
                });
            }
        }
        if (content?.turnComplete && this.currentTurn) {
            const message = this.currentTurn.role === 'model'
                ? {
                    type: 'message',
                    role: 'assistant',
                    content: this.currentTurn.text,
                    status: 'completed',
                }
                : {
                    type: 'message',
                    role: 'user',
                    content: this.currentTurn.text,
                };
            this.messageHistory.push(message);
            this.pushEvent({
                type: 'turn_complete',
                timestamp: new Date().toISOString(),
                role: this.currentTurn.role,
                message,
            });
            this.currentTurn = null;
        }
        if (content?.interrupted) {
            const cancelledToolCalls = [];
            if (content.cancelledFunctionCalls) {
                cancelledToolCalls.push(...content.cancelledFunctionCalls.map((fc) => fc.id));
            }
            this.pushEvent({
                type: 'interrupted',
                timestamp: new Date().toISOString(),
                cancelledToolCalls,
            });
        }
        if (msg.usageMetadata) {
            this.recordUsage(msg.usageMetadata);
        }
    }

    /**
     * Emit audio and text events for a single model-turn part.
     * @param {object} part
     */
    emitPartContent(part) {
        if (part.inlineData?.mimeType?.startsWith('audio/')) {
            this.pushEvent({
                type: 'audio_output',
                timestamp: new Date().toISOString(),
                data: part.inlineData.data,
                format: {
                    sampleRate: 24000,
                    channels: 1,
                    encoding: 'pcm',
                },
            });
        }
        if (part.text) {
            if (!this.currentTurn || this.currentTurn.role !== 'model') {
                this.currentTurn = { role: 'model', text: '' };
                this.pushEvent({
                    type: 'turn_start',
                    timestamp: new Date().toISOString(),
                    role: 'model',
                });
            }
            this.currentTurn.text += part.text;
            this.pushEvent({
                type: 'text_delta',
                timestamp: new Date().toISOString(),
                delta: part.text,
            });
            this.pushEvent({
                type: 'message_delta',
                timestamp: new Date().toISOString(),
                delta: part.text,
            });
        }
    }

    /**
     * Emit a `tool_call` event for one function call.
     * @param {{id?: string, name: string, args?: object}} functionCall
     */
    emitToolCall(functionCall) {
        this.pushEvent({
            type: 'tool_call',
            timestamp: new Date().toISOString(),
            toolCalls: [
                {
                    // Keep the server-supplied id so the id sent back by
                    // sendToolResponse() matches; generate one only when absent.
                    id: functionCall.id ?? uuidv4(),
                    type: 'function',
                    function: {
                        name: functionCall.name,
                        // Always a JSON string, even when the call has no arguments.
                        arguments: JSON.stringify(functionCall.args ?? {}),
                    },
                },
            ],
        });
    }

    /**
     * Report token usage to the cost tracker and emit a `cost_update` event.
     * @param {object} usage - `usageMetadata` from a server message.
     */
    recordUsage(usage) {
        const inputTokens = usage.promptTokenCount ?? 0;
        // Live usage metadata names the output count `responseTokenCount`;
        // `candidatesTokenCount` is kept as a fallback.
        const outputTokens = usage.responseTokenCount ?? usage.candidatesTokenCount ?? 0;
        const totalTokens = usage.totalTokenCount ?? 0;
        costTracker.addUsage({
            model: this.model,
            input_tokens: inputTokens,
            output_tokens: outputTokens,
            cached_tokens: usage.cachedContentTokenCount ?? 0,
            metadata: {
                total_tokens: totalTokens,
                source: 'gemini-live',
            },
        });
        this.pushEvent({
            type: 'cost_update',
            timestamp: new Date().toISOString(),
            usage: {
                inputTokens,
                outputTokens,
                totalTokens,
                // Costs are not computed here; the keys are kept so the event
                // shape stays the same.
                inputCost: undefined,
                outputCost: undefined,
                totalCost: undefined,
            },
        });
    }

    /**
     * Send one chunk of audio and emit an `audio_input` event.
     * @param {{data: string, mimeType: string}} audio - Base64 data and its MIME type.
     * @throws {Error} 'Session is not active' when not connected; rethrows send errors.
     */
    async sendAudio(audio) {
        if (!this.session || !this._isActive) {
            console.error(`[GeminiLiveSession ${this.sessionId}] Cannot send audio - session not active`);
            throw new Error('Session is not active');
        }
        console.log(`[GeminiLiveSession ${this.sessionId}] Sending audio: ${audio.data.length} chars (base64), mimeType: ${audio.mimeType}`);
        try {
            await this.session.sendRealtimeInput({
                media: {
                    mimeType: audio.mimeType,
                    data: audio.data,
                },
            });
            console.log(`[GeminiLiveSession ${this.sessionId}] Audio sent successfully`);
        }
        catch (error) {
            console.error(`[GeminiLiveSession ${this.sessionId}] Error sending audio:`, error);
            throw error;
        }
        // Approximate decoded size: 4 base64 characters encode 3 bytes.
        const size = Math.ceil((audio.data.length * 3) / 4);
        this.pushEvent({
            type: 'audio_input',
            timestamp: new Date().toISOString(),
            size,
        });
    }

    /**
     * Send a text turn and emit a `turn_start` event for it.
     * @param {string} text
     * @param {string} [role='user'] - 'assistant' is sent as 'model'; anything else as 'user'.
     * @throws {Error} 'Session is not active' when not connected.
     */
    async sendText(text, role = 'user') {
        if (!this.session || !this._isActive) {
            throw new Error('Session is not active');
        }
        const geminiRole = role === 'assistant' ? 'model' : 'user';
        await this.session.sendClientContent({
            turns: [{ role: geminiRole, parts: [{ text }] }],
        });
        this.pushEvent({
            type: 'turn_start',
            timestamp: new Date().toISOString(),
            role: geminiRole,
        });
    }

    /**
     * Send function results back to the model.
     * @param {Array<object>} toolResults - Each needs `call_id` or `id`,
     *   `toolCall.function.name`, and either `error` or `output`.
     * @throws {Error} 'Session is not active' when not connected.
     */
    async sendToolResponse(toolResults) {
        if (!this.session || !this._isActive) {
            throw new Error('Session is not active');
        }
        const functionResponses = toolResults.map((result) => ({
            id: result.call_id || result.id,
            name: result.toolCall.function.name,
            response: result.error ? { error: result.error } : { result: result.output },
        }));
        await this.session.sendToolResponse({ functionResponses });
    }

    /**
     * Async iterator over session events.
     *
     * Buffered events are drained first; the stream ends once the session is
     * closed and nothing is left in the queue.
     */
    async *getEventStream() {
        while (this._isActive || this.eventQueue.length > 0) {
            if (this.eventQueue.length > 0) {
                yield this.eventQueue.shift();
                continue;
            }
            const result = await new Promise((resolve) => {
                if (this.sessionClosed && this.eventQueue.length === 0) {
                    resolve({ done: true, value: undefined });
                }
                else {
                    // Park until pushEvent() or resolveAllWaitingEvents() wakes us.
                    this.eventResolvers.push(resolve);
                }
            });
            if (result.done) {
                break;
            }
            if (result.value) {
                yield result.value;
            }
        }
    }

    /**
     * Close the session and end the event stream.
     *
     * Waiting stream consumers are released here as well, rather than only in
     * the `onclose` callback, so the stream also ends when there is no
     * connected session or the callback never fires.
     */
    async close() {
        const wasActive = this._isActive;
        this._isActive = false;
        try {
            if (this.session && wasActive) {
                await this.session.close();
            }
        }
        finally {
            this.sessionClosed = true;
            this.resolveAllWaitingEvents();
        }
    }

    /** @returns {boolean} Whether the session is still open for sending. */
    isActive() {
        return this._isActive;
    }

    /**
     * Deliver an event to a waiting consumer, or buffer it when none is waiting.
     * @param {object} event
     */
    pushEvent(event) {
        const resolver = this.eventResolvers.shift();
        if (resolver) {
            resolver({ value: event, done: false });
        }
        else {
            this.eventQueue.push(event);
        }
    }

    /** Wake every parked consumer with an end-of-stream result. */
    resolveAllWaitingEvents() {
        const waiting = this.eventResolvers;
        this.eventResolvers = [];
        for (const resolver of waiting) {
            resolver({ done: true, value: undefined });
        }
    }

    /**
     * Close an SDK session without letting a close failure escape; used only
     * on paths where another error is already being reported.
     * @param {object|null} session
     */
    closeQuietly(session) {
        if (!session) {
            return;
        }
        try {
            Promise.resolve(session.close()).catch(() => {
                // Best-effort cleanup: the original failure is what the caller sees.
            });
        }
        catch {
            // Best-effort cleanup: the original failure is what the caller sees.
        }
    }

    /**
     * JSON-format a value for logging without throwing (e.g. on circular
     * references), so a logging failure cannot abort a callback.
     * @param {*} value
     * @returns {string|undefined}
     */
    stringifyForLog(value) {
        try {
            return JSON.stringify(value, null, 2);
        }
        catch {
            return String(value);
        }
    }
}
1217
1582
  export const geminiProvider = new GeminiProvider();
1218
1583
  //# sourceMappingURL=gemini.js.map