kugelaudio 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,8 +7,12 @@
7
7
  */
8
8
 
9
9
  import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
10
+ import packageJson from '../package.json';
10
11
  import { KugelAudio } from './client';
11
12
  import { RateLimitError } from './errors';
13
+ import { parseSessionUsage } from './types';
14
+
15
+ const SDK_VERSION = packageJson.version;
12
16
 
13
17
  // ---------------------------------------------------------------------------
14
18
  // Minimal WebSocket mock
@@ -99,6 +103,43 @@ function collectStream(stream: NodeJS.ReadableStream): Promise<Buffer> {
99
103
  // Tests
100
104
  // ---------------------------------------------------------------------------
101
105
 
106
// parseSessionUsage: the `usage` block as carried by a /ws/tts `final` frame
// (the same parser is used for session_closed frames).
describe('parseSessionUsage (/ws/tts final + session_closed)', () => {
  it('parses the usage block from a /ws/tts final frame', () => {
    // A complete final frame: generation stats plus a fully priced usage block.
    const parsed = parseSessionUsage({
      final: true,
      chunks: 3,
      total_samples: 1000,
      dur_ms: 5400,
      gen_ms: 900,
      rtf: 0.17,
      usage: {
        audio_seconds: 5.4,
        characters: 142,
        cost_cents: 0.49,
        currency: 'eur',
        model_id: 'kugel-3',
      },
    });

    expect(parsed).not.toBeNull();
    expect(parsed?.audioSeconds).toBe(5.4);
    expect(parsed?.costCents).toBe(0.49);
    expect(parsed?.costAvailable).toBe(true);
  });

  it('reports cost null (not zero) when unavailable', () => {
    // An unavailable charge must surface as null, never as a 0 amount.
    const parsed = parseSessionUsage({
      final: true,
      usage: { audio_seconds: 2.0, cost_cents: null, cost_unavailable: true },
    });

    expect(parsed?.costCents).toBeNull();
    expect(parsed?.costAvailable).toBe(false);
  });

  it('returns null when there is no usage info', () => {
    const parsed = parseSessionUsage({ final: true, chunks: 1 });
    expect(parsed).toBeNull();
  });
});
142
+
102
143
  describe('TTSResource.toReadable()', () => {
103
144
  let client: KugelAudio;
104
145
 
@@ -271,7 +312,7 @@ describe('KugelAudio SDK metadata', () => {
271
312
  expect(init).toMatchObject({
272
313
  headers: {
273
314
  'X-KugelAudio-SDK': 'js',
274
- 'X-KugelAudio-SDK-Version': '0.6.1',
315
+ 'X-KugelAudio-SDK-Version': SDK_VERSION,
275
316
  },
276
317
  });
277
318
  });
@@ -283,7 +324,7 @@ describe('KugelAudio SDK metadata', () => {
283
324
  await new Promise<void>((r) => setTimeout(r, 10));
284
325
 
285
326
  expect(mockWs.url).toContain('sdk=js');
286
- expect(mockWs.url).toContain('sdk_version=0.6.1');
327
+ expect(mockWs.url).toContain(`sdk_version=${SDK_VERSION}`);
287
328
  });
288
329
  });
289
330
 
@@ -451,6 +492,111 @@ describe('StreamingSession', () => {
451
492
  expect(sessionClosedCalls[0].totalAudioChunks).toBe(4);
452
493
  });
453
494
 
495
it('fires onFinal (end-of-audio) before onSessionClosed on turn end (KUG-1238)', async () => {
  // Callback names in the order they fired.
  const order: string[] = [];
  // Stats handed to each onFinal invocation. Collected in an array instead of
  // a `let x: T | null = null` that is reassigned from inside the callback:
  // TypeScript's control-flow analysis does not see assignments made in a
  // closure, so such a variable is narrowed to `null` at the assertions and
  // `x!` becomes `never`. The array also pins that onFinal fired exactly once.
  const finals: Array<{
    totalAudioSeconds: number;
    totalTextChunks: number;
    totalAudioChunks: number;
  }> = [];

  const session = client.tts.streamingSession(
    { voiceId: 1 },
    {
      onFinal: (totalAudioSeconds, totalTextChunks, totalAudioChunks) => {
        order.push('final');
        finals.push({ totalAudioSeconds, totalTextChunks, totalAudioChunks });
      },
      onSessionClosed: () => order.push('session_closed'),
    },
  );

  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));
  session.send('Hello.', true);

  // Server side of the turn: audio, chunk completion, the end-of-audio
  // marker, then the session_closed frame.
  mockWs.onmessage?.({ data: makeAudioMsg(0, 100) });
  mockWs.onmessage?.({ data: makeChunkCompleteMsg(0, 1.0, 100) });
  mockWs.onmessage?.({
    data: JSON.stringify({
      final: true,
      total_audio_seconds: 1.0,
      total_text_chunks: 1,
      total_audio_chunks: 1,
    }),
  });
  mockWs.onmessage?.({ data: makeSessionClosedMsg(1.0, 1, 1) });

  expect(order).toEqual(['final', 'session_closed']);
  expect(finals).toEqual([
    { totalAudioSeconds: 1.0, totalTextChunks: 1, totalAudioChunks: 1 },
  ]);
});
531
+
532
it('exposes typed per-session usage (cost charged) on lastUsage', async () => {
  const session = client.tts.streamingSession({ voiceId: 1 }, {});
  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));
  session.send('Hello.');

  // Nothing has been reported before the server closes the session.
  expect(session.lastUsage).toBeNull();

  // session_closed frame carrying a fully priced usage block.
  const closedFrame = JSON.stringify({
    session_closed: true,
    total_audio_seconds: 5.4,
    usage: {
      audio_seconds: 5.4,
      characters: 142,
      cost_cents: 0.49,
      currency: 'eur',
      model_id: 'kugel-3',
    },
  });
  mockWs.onmessage?.({ data: closedFrame });

  const reported = session.lastUsage;
  expect(reported).not.toBeNull();
  expect(reported?.audioSeconds).toBe(5.4);
  expect(reported?.characters).toBe(142);
  expect(reported?.costCents).toBe(0.49);
  expect(reported?.currency).toBe('eur');
  expect(reported?.modelId).toBe('kugel-3');
  expect(reported?.costAvailable).toBe(true);
});
562
+
563
it('reports cost as null (not zero) when the charge is unavailable', async () => {
  const session = client.tts.streamingSession({ voiceId: 1 }, {});
  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));
  session.send('Hi.');

  // session_closed frame whose usage block flags the cost as unavailable.
  const closedFrame = JSON.stringify({
    session_closed: true,
    total_audio_seconds: 2.0,
    usage: {
      audio_seconds: 2.0,
      cost_cents: null,
      cost_unavailable: true,
      model_id: 'kugel-3',
    },
  });
  mockWs.onmessage?.({ data: closedFrame });

  // The audio duration is still reported; only the amount is missing.
  const reported = session.lastUsage;
  expect(reported?.costCents).toBeNull();
  expect(reported?.costAvailable).toBe(false);
  expect(reported?.audioSeconds).toBe(2.0);
});
586
+
587
it('falls back to total_audio_seconds for a legacy server with no usage block', async () => {
  const session = client.tts.streamingSession({ voiceId: 1 }, {});
  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));
  session.send('Hi.');

  // Legacy-style session_closed frame built by the shared helper (no usage block).
  const legacyClosedFrame = makeSessionClosedMsg(3.0, 1, 2);
  mockWs.onmessage?.({ data: legacyClosedFrame });

  const reported = session.lastUsage;
  expect(reported?.audioSeconds).toBe(3.0);
  expect(reported?.costCents).toBeNull();
  expect(reported?.costAvailable).toBe(false);
});
599
+
454
600
  it('resolves close() even if server never sends session_closed (quiet timeout)', async () => {
455
601
  const session = client.tts.streamingSession(
456
602
  { voiceId: 1 },
@@ -644,6 +790,55 @@ describe('StreamingSession', () => {
644
790
  expect(JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string).voice_id).toBe(42);
645
791
  });
646
792
 
793
+ // -------------------------------------------------------------------------
794
+ // dictionaryIds — per-request dictionary selection (KUG-1094)
795
+ // -------------------------------------------------------------------------
796
+
797
it('first send carries dictionary_ids when configured', async () => {
  const config = { voiceId: 1, dictionaryIds: [7, 9] };
  const session = client.tts.streamingSession(config, {});

  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));

  session.send('Hello.');

  // Inspect the most recent frame written to the socket.
  const sendCalls = mockWs.send.mock.calls;
  const lastPayload = sendCalls[sendCalls.length - 1][0] as string;
  const sent = JSON.parse(lastPayload);
  expect(sent.dictionary_ids).toEqual([7, 9]);
});
812
+
813
it('first send carries dictionary_ids: [] (explicit opt-out)', async () => {
  // An empty list is an explicit opt-out and must reach the wire as [].
  const config = { voiceId: 1, dictionaryIds: [] };
  const session = client.tts.streamingSession(config, {});

  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));

  session.send('Hello.');

  // Inspect the most recent frame written to the socket.
  const sendCalls = mockWs.send.mock.calls;
  const lastPayload = sendCalls[sendCalls.length - 1][0] as string;
  const sent = JSON.parse(lastPayload);
  expect(sent.dictionary_ids).toEqual([]);
});
828
+
829
it('omits dictionary_ids when not configured', async () => {
  const session = client.tts.streamingSession({ voiceId: 1 }, {});

  session.connect();
  await new Promise<void>((r) => setTimeout(r, 10));

  session.send('Hello.');

  // With no dictionaryIds in the config the key must be absent from the frame.
  const sendCalls = mockWs.send.mock.calls;
  const lastPayload = sendCalls[sendCalls.length - 1][0] as string;
  const sent = JSON.parse(lastPayload);
  expect(sent.dictionary_ids).toBeUndefined();
});
841
+
647
842
  it('cancelCurrent() resolves on quiet timeout if server never acks', async () => {
648
843
  const session = client.tts.streamingSession({ voiceId: 1 }, {});
649
844
 
@@ -724,4 +919,163 @@ describe('MultiContextSession closeContext', () => {
724
919
  expect((errors[0].error as RateLimitError).statusCode).toBe(429);
725
920
  expect((errors[0].error as RateLimitError).errorCode).toBe('TOO_MANY_CONTEXTS');
726
921
  });
922
+
923
it('fires onFinal per context on flush completion and graceful close (KUG-1238)', async () => {
  const finalIds: string[] = [];
  const closedIds: string[] = [];
  const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
  await session.connect({
    onFinal: (contextId) => finalIds.push(contextId),
    onContextClosed: (contextId) => closedIds.push(contextId),
  });

  const finalFrame = JSON.stringify({ final: true, context_id: 'a' });
  const closedFrame = JSON.stringify({ context_closed: true, context_id: 'a' });

  // Flush boundary: a lone `final` reports end-of-audio without closing.
  mockWs.onmessage?.({ data: finalFrame });
  expect(finalIds).toEqual(['a']);
  expect(closedIds).toEqual([]);

  // Graceful close: a second `final` arrives, followed by context_closed.
  mockWs.onmessage?.({ data: finalFrame });
  mockWs.onmessage?.({ data: closedFrame });
  expect(finalIds).toEqual(['a', 'a']);
  expect(closedIds).toEqual(['a']);
});
949
+
950
it('exposes per-context usage on context_closed (per conversation)', async () => {
  const closed: Array<{ id: string; usage: unknown }> = [];
  const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
  await session.connect({
    onContextClosed: (contextId, usage) => closed.push({ id: contextId, usage }),
  });

  // context_closed frame carrying the usage block for the 'narrator' context.
  const closedFrame = JSON.stringify({
    context_closed: true,
    context_id: 'narrator',
    usage: { audio_seconds: 4.1, cost_cents: 0.37, currency: 'eur', model_id: 'kugel-3' },
  });
  mockWs.onmessage?.({ data: closedFrame });

  // Delivered through the callback argument …
  expect(closed).toHaveLength(1);
  const [firstClosed] = closed;
  expect(firstClosed.id).toBe('narrator');
  expect((firstClosed.usage as { costCents: number }).costCents).toBe(0.37);

  // … and retrievable afterwards through the per-context accessor.
  const narratorUsage = session.usageFor('narrator');
  expect(narratorUsage?.audioSeconds).toBe(4.1);
  expect(narratorUsage?.costCents).toBe(0.37);
  expect(narratorUsage?.costAvailable).toBe(true);
  expect(session.usageFor('missing')).toBeNull();
});
976
+ });
977
+
978
+ // ---------------------------------------------------------------------------
979
+ // MultiContextSession createContext wire format (KUG-1233)
980
+ //
981
+ // The server binds a context's voice ONLY from voice_settings.voice_id at
982
+ // context creation. A top-level voice_id updates session config and leaves
983
+ // the context voiceless → MISSING_VOICE_ID on the first text. These tests
984
+ // pin the wire format so it cannot silently regress.
985
+ // ---------------------------------------------------------------------------
986
+
987
describe('MultiContextSession createContext wire format (KUG-1233)', () => {
  let client: KugelAudio;

  beforeEach(() => {
    client = new KugelAudio({ apiKey: 'test-key-xxx' });
  });

  it('puts voice_id inside voice_settings, never top-level', async () => {
    const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
    await session.connect({});

    session.createContext('narrator', { voiceId: 123 });

    // The create frame is the most recent message written to the socket.
    const sendCalls = mockWs.send.mock.calls;
    const sent = JSON.parse(sendCalls[sendCalls.length - 1][0] as string);
    expect(sent.context_id).toBe('narrator');
    expect(sent.voice_id).toBeUndefined();
    expect(sent.voice_settings).toBeDefined();
    expect(sent.voice_settings.voice_id).toBe(123);
  });

  it('falls back to defaultVoiceId inside voice_settings', async () => {
    const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
    await session.connect({});

    // No per-context voice given: the session default must be used.
    session.createContext('narrator');

    const sendCalls = mockWs.send.mock.calls;
    const sent = JSON.parse(sendCalls[sendCalls.length - 1][0] as string);
    expect(sent.voice_id).toBeUndefined();
    expect(sent.voice_settings.voice_id).toBe(42);
  });

  it('send() to an unknown context auto-creates it with the default voice, even after session start', async () => {
    const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
    await session.connect({});

    // Bring the session into the started state: create a first context and
    // replay the server's session_started / context_created confirmations.
    session.createContext('first');
    const sessionStarted = JSON.stringify({ session_started: true, session_id: 's1' });
    const firstCreated = JSON.stringify({ context_created: true, context_id: 'first' });
    mockWs.onmessage?.({ data: sessionStarted });
    mockWs.onmessage?.({ data: firstCreated });

    // Only look at frames written by the send() under test.
    const callsBefore = mockWs.send.mock.calls.length;
    session.send('second', 'hello there', true);
    const newCalls = mockWs.send.mock.calls.slice(callsBefore);
    const frames = newCalls.map((c) => JSON.parse(c[0] as string));

    // Expected: the auto-create frame (voice in voice_settings), then the text.
    expect(frames).toHaveLength(2);
    const [createFrame, textFrame] = frames;
    expect(createFrame.context_id).toBe('second');
    expect(createFrame.voice_settings.voice_id).toBe(42);
    expect(textFrame.text).toBe('hello there');
    expect(textFrame.flush).toBe(true);
  });

  it('does not duplicate the create frame across repeated sends', async () => {
    const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
    await session.connect({});

    session.send('ctx', 'one');
    session.send('ctx', 'two');

    // Create frames are recognised by the voice binding they carry.
    const frames = mockWs.send.mock.calls.map((c) => JSON.parse(c[0] as string));
    const creates = frames.filter((f) => f.voice_settings?.voice_id === 42);
    expect(creates).toHaveLength(1);
  });

  it('allows re-creating a context after the server closed it', async () => {
    const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
    await session.connect({});

    // Open the context, then replay the server confirming and closing it.
    session.send('ctx', 'one');
    const created = JSON.stringify({ context_created: true, context_id: 'ctx' });
    const closed = JSON.stringify({ context_closed: true, context_id: 'ctx' });
    mockWs.onmessage?.({ data: created });
    mockWs.onmessage?.({ data: closed });

    // A later send() must emit a fresh create frame before the text frame.
    const callsBefore = mockWs.send.mock.calls.length;
    session.send('ctx', 'again');
    const newCalls = mockWs.send.mock.calls.slice(callsBefore);
    const frames = newCalls.map((c) => JSON.parse(c[0] as string));
    expect(frames[0].voice_settings.voice_id).toBe(42);
    expect(frames[1].text).toBe('again');
  });
});
package/src/client.ts CHANGED
@@ -29,6 +29,7 @@ import type {
29
29
  VoiceReference,
30
30
  WordTimestamp
31
31
  } from './types';
32
+ import { parseSessionUsage } from './types';
32
33
  import { base64ToArrayBuffer } from './utils';
33
34
  import { getWebSocket } from './websocket';
34
35
 
@@ -596,6 +597,7 @@ class TTSResource {
596
597
  generationMs: data.gen_ms,
597
598
  rtf: data.rtf,
598
599
  error: data.error,
600
+ usage: parseSessionUsage(data) ?? undefined,
599
601
  };
600
602
  pending.callbacks.onFinal?.(stats);
601
603
  this.pendingRequests.delete(requestId);
@@ -713,11 +715,15 @@ class TTSResource {
713
715
  ...(options.temperature !== undefined && { temperature: options.temperature }),
714
716
  max_new_tokens: options.maxNewTokens ?? 2048,
715
717
  sample_rate: options.sampleRate ?? 24000,
718
+ ...(options.outputFormat && { output_format: options.outputFormat }),
716
719
  normalize: options.normalize ?? true,
717
720
  ...(options.language && { language: options.language }),
718
721
  ...(options.wordTimestamps && { word_timestamps: true }),
719
722
  ...(options.speed !== undefined && { speed: options.speed }),
720
723
  ...(options.projectId !== undefined && { project_id: options.projectId }),
724
+ // [] is meaningful (explicit opt-out) and must be sent; only
725
+ // undefined (use the project default) is omitted.
726
+ ...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
721
727
  }));
722
728
  });
723
729
  }
@@ -744,11 +750,15 @@ class TTSResource {
744
750
  cfg_scale: options.cfgScale ?? 2.0,
745
751
  max_new_tokens: options.maxNewTokens ?? 2048,
746
752
  sample_rate: options.sampleRate ?? 24000,
753
+ ...(options.outputFormat && { output_format: options.outputFormat }),
747
754
  normalize: options.normalize ?? true,
748
755
  ...(options.language && { language: options.language }),
749
756
  ...(options.wordTimestamps && { word_timestamps: true }),
750
757
  ...(options.speed !== undefined && { speed: options.speed }),
751
758
  ...(options.projectId !== undefined && { project_id: options.projectId }),
759
+ // [] is meaningful (explicit opt-out) and must be sent; only
760
+ // undefined (use the project default) is omitted.
761
+ ...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
752
762
  }));
753
763
  };
754
764
 
@@ -779,6 +789,7 @@ class TTSResource {
779
789
  generationMs: data.gen_ms,
780
790
  rtf: data.rtf,
781
791
  error: data.error,
792
+ usage: parseSessionUsage(data) ?? undefined,
782
793
  };
783
794
  callbacks.onFinal?.(stats);
784
795
  ws.close();
@@ -977,7 +988,11 @@ class MultiContextSession {
977
988
  private config: import('./types').MultiContextConfig;
978
989
  private callbacks: import('./types').MultiContextCallbacks = {};
979
990
  private contexts: Set<string> = new Set();
991
+ /** Contexts a create message has been sent for (not yet necessarily
992
+ * confirmed by the server via context_created). */
993
+ private requestedContexts: Set<string> = new Set();
980
994
  private _sessionId: string | null = null;
995
+ private _contextUsage: Map<string, import('./types').SessionUsage> = new Map();
981
996
  private isStarted = false;
982
997
 
983
998
  constructor(
@@ -994,6 +1009,20 @@ class MultiContextSession {
994
1009
  return this._sessionId;
995
1010
  }
996
1011
 
1012
/**
 * Look up the usage (audio time + amount charged) recorded for a context.
 *
 * Usage is recorded only when a `context_closed` frame for that context
 * yields a usage object. The result is therefore `null` when the context
 * has not closed yet, when it closed without usage information, and when
 * the id is unknown. The timeout branch of the message handler records
 * nothing, so a context that only timed out also reports `null`.
 *
 * Each context is its own conversation — use this to bill per conversation.
 * See {@link SessionUsage}.
 *
 * @param contextId - Identifier of the context to look up.
 * @returns The recorded usage, or `null` if none has been recorded.
 */
usageFor(contextId: string): import('./types').SessionUsage | null {
  return this._contextUsage.get(contextId) ?? null;
}
1020
+
1021
/**
 * Snapshot of context_id → usage for every context whose `context_closed`
 * frame carried usage information. Contexts that closed without usage are
 * not included.
 *
 * A new `Map` is returned on each access, so changes made to the returned
 * map do not affect the session's internal bookkeeping.
 */
get contextUsage(): Map<string, import('./types').SessionUsage> {
  return new Map(this._contextUsage);
}
1025
+
997
1026
  /**
998
1027
  * Connect to the multi-context WebSocket endpoint.
999
1028
  *
@@ -1063,13 +1092,25 @@ class MultiContextSession {
1063
1092
  this.callbacks.onChunk?.(chunk);
1064
1093
  }
1065
1094
 
1095
+ if (data.final && data.context_id) {
1096
+ // Per-context end-of-audio marker (KUG-1238): all audio admitted
1097
+ // before the client's flush has been delivered; also precedes
1098
+ // context_closed on a graceful close.
1099
+ this.callbacks.onFinal?.(data.context_id);
1100
+ }
1101
+
1066
1102
  if (data.context_closed) {
1067
1103
  this.contexts.delete(data.context_id);
1068
- this.callbacks.onContextClosed?.(data.context_id);
1104
+ this.requestedContexts.delete(data.context_id);
1105
+ // Per-context (per-conversation) usage rides on context_closed.
1106
+ const ctxUsage = parseSessionUsage(data) ?? undefined;
1107
+ if (ctxUsage) this._contextUsage.set(data.context_id, ctxUsage);
1108
+ this.callbacks.onContextClosed?.(data.context_id, ctxUsage);
1069
1109
  }
1070
1110
 
1071
1111
  if (data.context_timeout) {
1072
1112
  this.contexts.delete(data.context_id);
1113
+ this.requestedContexts.delete(data.context_id);
1073
1114
  this.callbacks.onContextTimeout?.(data.context_id);
1074
1115
  }
1075
1116
 
@@ -1124,6 +1165,7 @@ class MultiContextSession {
1124
1165
  this.ws = null;
1125
1166
  this.isStarted = false;
1126
1167
  this.contexts.clear();
1168
+ this.requestedContexts.clear();
1127
1169
  };
1128
1170
  });
1129
1171
  }
@@ -1141,6 +1183,7 @@ class MultiContextSession {
1141
1183
  if (!this.ws || this.ws.readyState !== WS_OPEN) {
1142
1184
  throw new KugelAudioError('WebSocket not connected');
1143
1185
  }
1186
+ this.requestedContexts.add(contextId);
1144
1187
 
1145
1188
  const msg: Record<string, unknown> = {
1146
1189
  text: ' ',
@@ -1151,26 +1194,36 @@ class MultiContextSession {
1151
1194
  if (!this.isStarted) {
1152
1195
  warnIfNoLanguage(this.config.language, this.config.normalize);
1153
1196
  if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
1197
+ if (this.config.outputFormat) msg.output_format = this.config.outputFormat;
1154
1198
  if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
1155
1199
  if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
1156
1200
  if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
1157
1201
  if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
1158
1202
  if (this.config.language) msg.language = this.config.language;
1203
+ // [] is meaningful (explicit opt-out) and must be sent; only
1204
+ // undefined (use the project default) is omitted.
1205
+ if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
1159
1206
  if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
1160
1207
  }
1161
1208
 
1162
- // Per-context voice
1209
+ // Per-context voice. The server binds a context's voice ONLY from
1210
+ // voice_settings.voice_id at context creation — a top-level voice_id
1211
+ // merely updates the session config and leaves the context voiceless,
1212
+ // which the server rejects with MISSING_VOICE_ID on the first text
1213
+ // (KUG-1233). This matches the Python SDK's wire format.
1214
+ const voiceSettings: Record<string, unknown> = {};
1163
1215
  const voiceId = options?.voiceId || this.config.defaultVoiceId;
1164
- if (voiceId) msg.voice_id = voiceId;
1216
+ if (voiceId) voiceSettings.voice_id = voiceId;
1165
1217
 
1166
1218
  if (options?.voiceSettings) {
1167
- msg.voice_settings = {
1168
- stability: options.voiceSettings.stability,
1169
- similarity_boost: options.voiceSettings.similarityBoost,
1170
- style: options.voiceSettings.style,
1171
- use_speaker_boost: options.voiceSettings.useSpeakerBoost,
1172
- speed: options.voiceSettings.speed,
1173
- };
1219
+ voiceSettings.stability = options.voiceSettings.stability;
1220
+ voiceSettings.similarity_boost = options.voiceSettings.similarityBoost;
1221
+ voiceSettings.style = options.voiceSettings.style;
1222
+ voiceSettings.use_speaker_boost = options.voiceSettings.useSpeakerBoost;
1223
+ voiceSettings.speed = options.voiceSettings.speed;
1224
+ }
1225
+ if (Object.keys(voiceSettings).length > 0) {
1226
+ msg.voice_settings = voiceSettings;
1174
1227
  }
1175
1228
 
1176
1229
  this.ws.send(JSON.stringify(msg));
@@ -1184,8 +1237,12 @@ class MultiContextSession {
1184
1237
  throw new KugelAudioError('WebSocket not connected');
1185
1238
  }
1186
1239
 
1187
- // Auto-create context if needed
1188
- if (!this.contexts.has(contextId) && !this.isStarted) {
1240
+ // Auto-create context if needed. Tracked via requestedContexts (sent
1241
+ // creates, not yet necessarily confirmed) rather than this.contexts
1242
+ // (server-confirmed) — otherwise a send() to a new context after the
1243
+ // session started goes out bare, and the server auto-creates the
1244
+ // context without voice_settings → MISSING_VOICE_ID (KUG-1233).
1245
+ if (!this.requestedContexts.has(contextId) && !this.contexts.has(contextId)) {
1189
1246
  this.createContext(contextId);
1190
1247
  }
1191
1248
 
@@ -1251,6 +1308,7 @@ class MultiContextSession {
1251
1308
  this.ws = null;
1252
1309
  this.isStarted = false;
1253
1310
  this.contexts.clear();
1311
+ this.requestedContexts.clear();
1254
1312
  }
1255
1313
 
1256
1314
  /**
@@ -1303,6 +1361,7 @@ class StreamingSession {
1303
1361
  private callbacks: StreamingSessionCallbacks;
1304
1362
  private client: KugelAudio;
1305
1363
  private configSent = false;
1364
+ private _lastUsage: import('./types').SessionUsage | null = null;
1306
1365
 
1307
1366
  constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
1308
1367
  this.client = client;
@@ -1310,6 +1369,15 @@ class StreamingSession {
1310
1369
  this.callbacks = callbacks;
1311
1370
  }
1312
1371
 
1372
/**
 * Usage reported by the most recent `session_closed` frame; `null` until a
 * session has closed. Intended for billing your own customers per
 * conversation. See {@link SessionUsage}.
 */
get lastUsage(): import('./types').SessionUsage | null {
  const usage = this._lastUsage;
  return usage;
}
1380
+
1313
1381
  /**
1314
1382
  * Open the WebSocket connection and authenticate.
1315
1383
  *
@@ -1389,7 +1457,18 @@ class StreamingSession {
1389
1457
  this.callbacks.onInterrupted?.();
1390
1458
  }
1391
1459
 
1460
+ if (data.final) {
1461
+ // End-of-audio marker for the turn (KUG-1238) — arrives after
1462
+ // the last audio frame and before session_closed.
1463
+ this.callbacks.onFinal?.(
1464
+ data.total_audio_seconds ?? 0,
1465
+ data.total_text_chunks ?? 0,
1466
+ data.total_audio_chunks ?? 0,
1467
+ );
1468
+ }
1469
+
1392
1470
  if (data.session_closed) {
1471
+ this._lastUsage = parseSessionUsage(data);
1393
1472
  this.callbacks.onSessionClosed?.(
1394
1473
  data.total_audio_seconds ?? 0,
1395
1474
  data.total_text_chunks ?? 0,
@@ -1474,6 +1553,7 @@ class StreamingSession {
1474
1553
  if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
1475
1554
  if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
1476
1555
  if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
1556
+ if (this.config.outputFormat !== undefined) msg.output_format = this.config.outputFormat;
1477
1557
  if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
1478
1558
  if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
1479
1559
  if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
@@ -1482,6 +1562,9 @@ class StreamingSession {
1482
1562
  if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
1483
1563
  if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
1484
1564
  if (this.config.speed !== undefined) msg.speed = this.config.speed;
1565
+ // [] is meaningful (explicit opt-out) and must be sent; only
1566
+ // undefined (use the project default) is omitted.
1567
+ if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
1485
1568
  this.configSent = true;
1486
1569
  }
1487
1570
 
package/src/index.ts CHANGED
@@ -62,6 +62,7 @@ export type {
62
62
  MultiContextAudioChunk,
63
63
  MultiContextCallbacks,
64
64
  MultiContextConfig,
65
+ SessionUsage,
65
66
  StreamCallbacks,
66
67
  StreamConfig,
67
68
  StreamingSessionCallbacks,
@@ -78,6 +79,7 @@ export type {
78
79
  VoiceSex,
79
80
  WordTimestamp
80
81
  } from './types';
82
+ export { parseSessionUsage } from './types';
81
83
 
82
84
  export { DictionariesResource, DictionaryEntriesResource } from './dictionaries';
83
85