kugelaudio 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/README.md +11 -2
- package/dist/index.d.mts +121 -10
- package/dist/index.d.ts +121 -10
- package/dist/index.js +95 -17
- package/dist/index.mjs +93 -16
- package/package.json +1 -1
- package/src/client.test.ts +356 -2
- package/src/client.ts +95 -12
- package/src/index.ts +2 -0
- package/src/types.ts +128 -10
package/src/client.test.ts
CHANGED
|
@@ -7,8 +7,12 @@
|
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
9
|
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
10
|
+
import packageJson from '../package.json';
|
|
10
11
|
import { KugelAudio } from './client';
|
|
11
12
|
import { RateLimitError } from './errors';
|
|
13
|
+
import { parseSessionUsage } from './types';
|
|
14
|
+
|
|
15
|
+
const SDK_VERSION = packageJson.version;
|
|
12
16
|
|
|
13
17
|
// ---------------------------------------------------------------------------
|
|
14
18
|
// Minimal WebSocket mock
|
|
@@ -99,6 +103,43 @@ function collectStream(stream: NodeJS.ReadableStream): Promise<Buffer> {
|
|
|
99
103
|
// Tests
|
|
100
104
|
// ---------------------------------------------------------------------------
|
|
101
105
|
|
|
106
|
+
describe('parseSessionUsage (/ws/tts final + session_closed)', () => {
|
|
107
|
+
it('parses the usage block from a /ws/tts final frame', () => {
|
|
108
|
+
const usage = parseSessionUsage({
|
|
109
|
+
final: true,
|
|
110
|
+
chunks: 3,
|
|
111
|
+
total_samples: 1000,
|
|
112
|
+
dur_ms: 5400,
|
|
113
|
+
gen_ms: 900,
|
|
114
|
+
rtf: 0.17,
|
|
115
|
+
usage: {
|
|
116
|
+
audio_seconds: 5.4,
|
|
117
|
+
characters: 142,
|
|
118
|
+
cost_cents: 0.49,
|
|
119
|
+
currency: 'eur',
|
|
120
|
+
model_id: 'kugel-3',
|
|
121
|
+
},
|
|
122
|
+
});
|
|
123
|
+
expect(usage).not.toBeNull();
|
|
124
|
+
expect(usage?.audioSeconds).toBe(5.4);
|
|
125
|
+
expect(usage?.costCents).toBe(0.49);
|
|
126
|
+
expect(usage?.costAvailable).toBe(true);
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
it('reports cost null (not zero) when unavailable', () => {
|
|
130
|
+
const usage = parseSessionUsage({
|
|
131
|
+
final: true,
|
|
132
|
+
usage: { audio_seconds: 2.0, cost_cents: null, cost_unavailable: true },
|
|
133
|
+
});
|
|
134
|
+
expect(usage?.costCents).toBeNull();
|
|
135
|
+
expect(usage?.costAvailable).toBe(false);
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
it('returns null when there is no usage info', () => {
|
|
139
|
+
expect(parseSessionUsage({ final: true, chunks: 1 })).toBeNull();
|
|
140
|
+
});
|
|
141
|
+
});
|
|
142
|
+
|
|
102
143
|
describe('TTSResource.toReadable()', () => {
|
|
103
144
|
let client: KugelAudio;
|
|
104
145
|
|
|
@@ -271,7 +312,7 @@ describe('KugelAudio SDK metadata', () => {
|
|
|
271
312
|
expect(init).toMatchObject({
|
|
272
313
|
headers: {
|
|
273
314
|
'X-KugelAudio-SDK': 'js',
|
|
274
|
-
'X-KugelAudio-SDK-Version':
|
|
315
|
+
'X-KugelAudio-SDK-Version': SDK_VERSION,
|
|
275
316
|
},
|
|
276
317
|
});
|
|
277
318
|
});
|
|
@@ -283,7 +324,7 @@ describe('KugelAudio SDK metadata', () => {
|
|
|
283
324
|
await new Promise<void>((r) => setTimeout(r, 10));
|
|
284
325
|
|
|
285
326
|
expect(mockWs.url).toContain('sdk=js');
|
|
286
|
-
expect(mockWs.url).toContain(
|
|
327
|
+
expect(mockWs.url).toContain(`sdk_version=${SDK_VERSION}`);
|
|
287
328
|
});
|
|
288
329
|
});
|
|
289
330
|
|
|
@@ -451,6 +492,111 @@ describe('StreamingSession', () => {
|
|
|
451
492
|
expect(sessionClosedCalls[0].totalAudioChunks).toBe(4);
|
|
452
493
|
});
|
|
453
494
|
|
|
495
|
+
it('fires onFinal (end-of-audio) before onSessionClosed on turn end (KUG-1238)', async () => {
|
|
496
|
+
const order: string[] = [];
|
|
497
|
+
let finalStats: { totalAudioSeconds: number; totalTextChunks: number; totalAudioChunks: number } | null = null;
|
|
498
|
+
|
|
499
|
+
const session = client.tts.streamingSession(
|
|
500
|
+
{ voiceId: 1 },
|
|
501
|
+
{
|
|
502
|
+
onFinal: (totalAudioSeconds, totalTextChunks, totalAudioChunks) => {
|
|
503
|
+
order.push('final');
|
|
504
|
+
finalStats = { totalAudioSeconds, totalTextChunks, totalAudioChunks };
|
|
505
|
+
},
|
|
506
|
+
onSessionClosed: () => order.push('session_closed'),
|
|
507
|
+
},
|
|
508
|
+
);
|
|
509
|
+
|
|
510
|
+
session.connect();
|
|
511
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
512
|
+
session.send('Hello.', true);
|
|
513
|
+
|
|
514
|
+
mockWs.onmessage?.({ data: makeAudioMsg(0, 100) });
|
|
515
|
+
mockWs.onmessage?.({ data: makeChunkCompleteMsg(0, 1.0, 100) });
|
|
516
|
+
mockWs.onmessage?.({
|
|
517
|
+
data: JSON.stringify({
|
|
518
|
+
final: true,
|
|
519
|
+
total_audio_seconds: 1.0,
|
|
520
|
+
total_text_chunks: 1,
|
|
521
|
+
total_audio_chunks: 1,
|
|
522
|
+
}),
|
|
523
|
+
});
|
|
524
|
+
mockWs.onmessage?.({ data: makeSessionClosedMsg(1.0, 1, 1) });
|
|
525
|
+
|
|
526
|
+
expect(order).toEqual(['final', 'session_closed']);
|
|
527
|
+
expect(finalStats!.totalAudioSeconds).toBe(1.0);
|
|
528
|
+
expect(finalStats!.totalTextChunks).toBe(1);
|
|
529
|
+
expect(finalStats!.totalAudioChunks).toBe(1);
|
|
530
|
+
});
|
|
531
|
+
|
|
532
|
+
it('exposes typed per-session usage (cost charged) on lastUsage', async () => {
|
|
533
|
+
const session = client.tts.streamingSession({ voiceId: 1 }, {});
|
|
534
|
+
session.connect();
|
|
535
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
536
|
+
session.send('Hello.');
|
|
537
|
+
|
|
538
|
+
expect(session.lastUsage).toBeNull();
|
|
539
|
+
|
|
540
|
+
mockWs.onmessage?.({
|
|
541
|
+
data: JSON.stringify({
|
|
542
|
+
session_closed: true,
|
|
543
|
+
total_audio_seconds: 5.4,
|
|
544
|
+
usage: {
|
|
545
|
+
audio_seconds: 5.4,
|
|
546
|
+
characters: 142,
|
|
547
|
+
cost_cents: 0.49,
|
|
548
|
+
currency: 'eur',
|
|
549
|
+
model_id: 'kugel-3',
|
|
550
|
+
},
|
|
551
|
+
}),
|
|
552
|
+
});
|
|
553
|
+
|
|
554
|
+
expect(session.lastUsage).not.toBeNull();
|
|
555
|
+
expect(session.lastUsage?.audioSeconds).toBe(5.4);
|
|
556
|
+
expect(session.lastUsage?.characters).toBe(142);
|
|
557
|
+
expect(session.lastUsage?.costCents).toBe(0.49);
|
|
558
|
+
expect(session.lastUsage?.currency).toBe('eur');
|
|
559
|
+
expect(session.lastUsage?.modelId).toBe('kugel-3');
|
|
560
|
+
expect(session.lastUsage?.costAvailable).toBe(true);
|
|
561
|
+
});
|
|
562
|
+
|
|
563
|
+
it('reports cost as null (not zero) when the charge is unavailable', async () => {
|
|
564
|
+
const session = client.tts.streamingSession({ voiceId: 1 }, {});
|
|
565
|
+
session.connect();
|
|
566
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
567
|
+
session.send('Hi.');
|
|
568
|
+
|
|
569
|
+
mockWs.onmessage?.({
|
|
570
|
+
data: JSON.stringify({
|
|
571
|
+
session_closed: true,
|
|
572
|
+
total_audio_seconds: 2.0,
|
|
573
|
+
usage: {
|
|
574
|
+
audio_seconds: 2.0,
|
|
575
|
+
cost_cents: null,
|
|
576
|
+
cost_unavailable: true,
|
|
577
|
+
model_id: 'kugel-3',
|
|
578
|
+
},
|
|
579
|
+
}),
|
|
580
|
+
});
|
|
581
|
+
|
|
582
|
+
expect(session.lastUsage?.costCents).toBeNull();
|
|
583
|
+
expect(session.lastUsage?.costAvailable).toBe(false);
|
|
584
|
+
expect(session.lastUsage?.audioSeconds).toBe(2.0);
|
|
585
|
+
});
|
|
586
|
+
|
|
587
|
+
it('falls back to total_audio_seconds for a legacy server with no usage block', async () => {
|
|
588
|
+
const session = client.tts.streamingSession({ voiceId: 1 }, {});
|
|
589
|
+
session.connect();
|
|
590
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
591
|
+
session.send('Hi.');
|
|
592
|
+
|
|
593
|
+
mockWs.onmessage?.({ data: makeSessionClosedMsg(3.0, 1, 2) });
|
|
594
|
+
|
|
595
|
+
expect(session.lastUsage?.audioSeconds).toBe(3.0);
|
|
596
|
+
expect(session.lastUsage?.costCents).toBeNull();
|
|
597
|
+
expect(session.lastUsage?.costAvailable).toBe(false);
|
|
598
|
+
});
|
|
599
|
+
|
|
454
600
|
it('resolves close() even if server never sends session_closed (quiet timeout)', async () => {
|
|
455
601
|
const session = client.tts.streamingSession(
|
|
456
602
|
{ voiceId: 1 },
|
|
@@ -644,6 +790,55 @@ describe('StreamingSession', () => {
|
|
|
644
790
|
expect(JSON.parse(mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string).voice_id).toBe(42);
|
|
645
791
|
});
|
|
646
792
|
|
|
793
|
+
// -------------------------------------------------------------------------
|
|
794
|
+
// dictionaryIds — per-request dictionary selection (KUG-1094)
|
|
795
|
+
// -------------------------------------------------------------------------
|
|
796
|
+
|
|
797
|
+
it('first send carries dictionary_ids when configured', async () => {
|
|
798
|
+
const session = client.tts.streamingSession(
|
|
799
|
+
{ voiceId: 1, dictionaryIds: [7, 9] },
|
|
800
|
+
{},
|
|
801
|
+
);
|
|
802
|
+
|
|
803
|
+
session.connect();
|
|
804
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
805
|
+
|
|
806
|
+
session.send('Hello.');
|
|
807
|
+
const sent = JSON.parse(
|
|
808
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
809
|
+
);
|
|
810
|
+
expect(sent.dictionary_ids).toEqual([7, 9]);
|
|
811
|
+
});
|
|
812
|
+
|
|
813
|
+
it('first send carries dictionary_ids: [] (explicit opt-out)', async () => {
|
|
814
|
+
const session = client.tts.streamingSession(
|
|
815
|
+
{ voiceId: 1, dictionaryIds: [] },
|
|
816
|
+
{},
|
|
817
|
+
);
|
|
818
|
+
|
|
819
|
+
session.connect();
|
|
820
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
821
|
+
|
|
822
|
+
session.send('Hello.');
|
|
823
|
+
const sent = JSON.parse(
|
|
824
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
825
|
+
);
|
|
826
|
+
expect(sent.dictionary_ids).toEqual([]);
|
|
827
|
+
});
|
|
828
|
+
|
|
829
|
+
it('omits dictionary_ids when not configured', async () => {
|
|
830
|
+
const session = client.tts.streamingSession({ voiceId: 1 }, {});
|
|
831
|
+
|
|
832
|
+
session.connect();
|
|
833
|
+
await new Promise<void>((r) => setTimeout(r, 10));
|
|
834
|
+
|
|
835
|
+
session.send('Hello.');
|
|
836
|
+
const sent = JSON.parse(
|
|
837
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
838
|
+
);
|
|
839
|
+
expect(sent.dictionary_ids).toBeUndefined();
|
|
840
|
+
});
|
|
841
|
+
|
|
647
842
|
it('cancelCurrent() resolves on quiet timeout if server never acks', async () => {
|
|
648
843
|
const session = client.tts.streamingSession({ voiceId: 1 }, {});
|
|
649
844
|
|
|
@@ -724,4 +919,163 @@ describe('MultiContextSession closeContext', () => {
|
|
|
724
919
|
expect((errors[0].error as RateLimitError).statusCode).toBe(429);
|
|
725
920
|
expect((errors[0].error as RateLimitError).errorCode).toBe('TOO_MANY_CONTEXTS');
|
|
726
921
|
});
|
|
922
|
+
|
|
923
|
+
it('fires onFinal per context on flush completion and graceful close (KUG-1238)', async () => {
|
|
924
|
+
const finals: string[] = [];
|
|
925
|
+
const closed: string[] = [];
|
|
926
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
|
|
927
|
+
await session.connect({
|
|
928
|
+
onFinal: (contextId) => finals.push(contextId),
|
|
929
|
+
onContextClosed: (contextId) => closed.push(contextId),
|
|
930
|
+
});
|
|
931
|
+
|
|
932
|
+
// Flush boundary: all audio admitted before the flush has been sent.
|
|
933
|
+
mockWs.onmessage?.({
|
|
934
|
+
data: JSON.stringify({ final: true, context_id: 'a' }),
|
|
935
|
+
});
|
|
936
|
+
expect(finals).toEqual(['a']);
|
|
937
|
+
expect(closed).toEqual([]);
|
|
938
|
+
|
|
939
|
+
// Graceful close: final precedes context_closed.
|
|
940
|
+
mockWs.onmessage?.({
|
|
941
|
+
data: JSON.stringify({ final: true, context_id: 'a' }),
|
|
942
|
+
});
|
|
943
|
+
mockWs.onmessage?.({
|
|
944
|
+
data: JSON.stringify({ context_closed: true, context_id: 'a' }),
|
|
945
|
+
});
|
|
946
|
+
expect(finals).toEqual(['a', 'a']);
|
|
947
|
+
expect(closed).toEqual(['a']);
|
|
948
|
+
});
|
|
949
|
+
|
|
950
|
+
it('exposes per-context usage on context_closed (per conversation)', async () => {
|
|
951
|
+
const closed: Array<{ id: string; usage: unknown }> = [];
|
|
952
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 1 });
|
|
953
|
+
await session.connect({
|
|
954
|
+
onContextClosed: (contextId, usage) => closed.push({ id: contextId, usage }),
|
|
955
|
+
});
|
|
956
|
+
|
|
957
|
+
mockWs.onmessage?.({
|
|
958
|
+
data: JSON.stringify({
|
|
959
|
+
context_closed: true,
|
|
960
|
+
context_id: 'narrator',
|
|
961
|
+
usage: { audio_seconds: 4.1, cost_cents: 0.37, currency: 'eur', model_id: 'kugel-3' },
|
|
962
|
+
}),
|
|
963
|
+
});
|
|
964
|
+
|
|
965
|
+
// Available both via the callback arg and the per-context accessor
|
|
966
|
+
expect(closed).toHaveLength(1);
|
|
967
|
+
expect(closed[0].id).toBe('narrator');
|
|
968
|
+
expect((closed[0].usage as { costCents: number }).costCents).toBe(0.37);
|
|
969
|
+
|
|
970
|
+
const u = session.usageFor('narrator');
|
|
971
|
+
expect(u?.audioSeconds).toBe(4.1);
|
|
972
|
+
expect(u?.costCents).toBe(0.37);
|
|
973
|
+
expect(u?.costAvailable).toBe(true);
|
|
974
|
+
expect(session.usageFor('missing')).toBeNull();
|
|
975
|
+
});
|
|
976
|
+
});
|
|
977
|
+
|
|
978
|
+
// ---------------------------------------------------------------------------
|
|
979
|
+
// MultiContextSession createContext wire format (KUG-1233)
|
|
980
|
+
//
|
|
981
|
+
// The server binds a context's voice ONLY from voice_settings.voice_id at
|
|
982
|
+
// context creation. A top-level voice_id updates session config and leaves
|
|
983
|
+
// the context voiceless → MISSING_VOICE_ID on the first text. These tests
|
|
984
|
+
// pin the wire format so it cannot silently regress.
|
|
985
|
+
// ---------------------------------------------------------------------------
|
|
986
|
+
|
|
987
|
+
describe('MultiContextSession createContext wire format (KUG-1233)', () => {
|
|
988
|
+
let client: KugelAudio;
|
|
989
|
+
|
|
990
|
+
beforeEach(() => {
|
|
991
|
+
client = new KugelAudio({ apiKey: 'test-key-xxx' });
|
|
992
|
+
});
|
|
993
|
+
|
|
994
|
+
it('puts voice_id inside voice_settings, never top-level', async () => {
|
|
995
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
|
|
996
|
+
await session.connect({});
|
|
997
|
+
|
|
998
|
+
session.createContext('narrator', { voiceId: 123 });
|
|
999
|
+
|
|
1000
|
+
const sent = JSON.parse(
|
|
1001
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
1002
|
+
);
|
|
1003
|
+
expect(sent.context_id).toBe('narrator');
|
|
1004
|
+
expect(sent.voice_id).toBeUndefined();
|
|
1005
|
+
expect(sent.voice_settings).toBeDefined();
|
|
1006
|
+
expect(sent.voice_settings.voice_id).toBe(123);
|
|
1007
|
+
});
|
|
1008
|
+
|
|
1009
|
+
it('falls back to defaultVoiceId inside voice_settings', async () => {
|
|
1010
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
|
|
1011
|
+
await session.connect({});
|
|
1012
|
+
|
|
1013
|
+
session.createContext('narrator');
|
|
1014
|
+
|
|
1015
|
+
const sent = JSON.parse(
|
|
1016
|
+
mockWs.send.mock.calls[mockWs.send.mock.calls.length - 1][0] as string
|
|
1017
|
+
);
|
|
1018
|
+
expect(sent.voice_id).toBeUndefined();
|
|
1019
|
+
expect(sent.voice_settings.voice_id).toBe(42);
|
|
1020
|
+
});
|
|
1021
|
+
|
|
1022
|
+
it('send() to an unknown context auto-creates it with the default voice, even after session start', async () => {
|
|
1023
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
|
|
1024
|
+
await session.connect({});
|
|
1025
|
+
|
|
1026
|
+
// Simulate a started session (first context confirmed by the server).
|
|
1027
|
+
session.createContext('first');
|
|
1028
|
+
mockWs.onmessage?.({
|
|
1029
|
+
data: JSON.stringify({ session_started: true, session_id: 's1' }),
|
|
1030
|
+
});
|
|
1031
|
+
mockWs.onmessage?.({
|
|
1032
|
+
data: JSON.stringify({ context_created: true, context_id: 'first' }),
|
|
1033
|
+
});
|
|
1034
|
+
|
|
1035
|
+
const callsBefore = mockWs.send.mock.calls.length;
|
|
1036
|
+
session.send('second', 'hello there', true);
|
|
1037
|
+
const frames = mockWs.send.mock.calls
|
|
1038
|
+
.slice(callsBefore)
|
|
1039
|
+
.map((c) => JSON.parse(c[0] as string));
|
|
1040
|
+
|
|
1041
|
+
// First frame: the auto-create with voice_settings.voice_id; then the text.
|
|
1042
|
+
expect(frames).toHaveLength(2);
|
|
1043
|
+
expect(frames[0].context_id).toBe('second');
|
|
1044
|
+
expect(frames[0].voice_settings.voice_id).toBe(42);
|
|
1045
|
+
expect(frames[1].text).toBe('hello there');
|
|
1046
|
+
expect(frames[1].flush).toBe(true);
|
|
1047
|
+
});
|
|
1048
|
+
|
|
1049
|
+
it('does not duplicate the create frame across repeated sends', async () => {
|
|
1050
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
|
|
1051
|
+
await session.connect({});
|
|
1052
|
+
|
|
1053
|
+
session.send('ctx', 'one');
|
|
1054
|
+
session.send('ctx', 'two');
|
|
1055
|
+
|
|
1056
|
+
const frames = mockWs.send.mock.calls.map((c) => JSON.parse(c[0] as string));
|
|
1057
|
+
const creates = frames.filter((f) => f.voice_settings?.voice_id === 42);
|
|
1058
|
+
expect(creates).toHaveLength(1);
|
|
1059
|
+
});
|
|
1060
|
+
|
|
1061
|
+
it('allows re-creating a context after the server closed it', async () => {
|
|
1062
|
+
const session = client.tts.createMultiContextSession({ defaultVoiceId: 42 });
|
|
1063
|
+
await session.connect({});
|
|
1064
|
+
|
|
1065
|
+
session.send('ctx', 'one');
|
|
1066
|
+
mockWs.onmessage?.({
|
|
1067
|
+
data: JSON.stringify({ context_created: true, context_id: 'ctx' }),
|
|
1068
|
+
});
|
|
1069
|
+
mockWs.onmessage?.({
|
|
1070
|
+
data: JSON.stringify({ context_closed: true, context_id: 'ctx' }),
|
|
1071
|
+
});
|
|
1072
|
+
|
|
1073
|
+
const callsBefore = mockWs.send.mock.calls.length;
|
|
1074
|
+
session.send('ctx', 'again');
|
|
1075
|
+
const frames = mockWs.send.mock.calls
|
|
1076
|
+
.slice(callsBefore)
|
|
1077
|
+
.map((c) => JSON.parse(c[0] as string));
|
|
1078
|
+
expect(frames[0].voice_settings.voice_id).toBe(42);
|
|
1079
|
+
expect(frames[1].text).toBe('again');
|
|
1080
|
+
});
|
|
727
1081
|
});
|
package/src/client.ts
CHANGED
|
@@ -29,6 +29,7 @@ import type {
|
|
|
29
29
|
VoiceReference,
|
|
30
30
|
WordTimestamp
|
|
31
31
|
} from './types';
|
|
32
|
+
import { parseSessionUsage } from './types';
|
|
32
33
|
import { base64ToArrayBuffer } from './utils';
|
|
33
34
|
import { getWebSocket } from './websocket';
|
|
34
35
|
|
|
@@ -596,6 +597,7 @@ class TTSResource {
|
|
|
596
597
|
generationMs: data.gen_ms,
|
|
597
598
|
rtf: data.rtf,
|
|
598
599
|
error: data.error,
|
|
600
|
+
usage: parseSessionUsage(data) ?? undefined,
|
|
599
601
|
};
|
|
600
602
|
pending.callbacks.onFinal?.(stats);
|
|
601
603
|
this.pendingRequests.delete(requestId);
|
|
@@ -713,11 +715,15 @@ class TTSResource {
|
|
|
713
715
|
...(options.temperature !== undefined && { temperature: options.temperature }),
|
|
714
716
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
715
717
|
sample_rate: options.sampleRate ?? 24000,
|
|
718
|
+
...(options.outputFormat && { output_format: options.outputFormat }),
|
|
716
719
|
normalize: options.normalize ?? true,
|
|
717
720
|
...(options.language && { language: options.language }),
|
|
718
721
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
719
722
|
...(options.speed !== undefined && { speed: options.speed }),
|
|
720
723
|
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
724
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
725
|
+
// undefined (use the project default) is omitted.
|
|
726
|
+
...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
|
|
721
727
|
}));
|
|
722
728
|
});
|
|
723
729
|
}
|
|
@@ -744,11 +750,15 @@ class TTSResource {
|
|
|
744
750
|
cfg_scale: options.cfgScale ?? 2.0,
|
|
745
751
|
max_new_tokens: options.maxNewTokens ?? 2048,
|
|
746
752
|
sample_rate: options.sampleRate ?? 24000,
|
|
753
|
+
...(options.outputFormat && { output_format: options.outputFormat }),
|
|
747
754
|
normalize: options.normalize ?? true,
|
|
748
755
|
...(options.language && { language: options.language }),
|
|
749
756
|
...(options.wordTimestamps && { word_timestamps: true }),
|
|
750
757
|
...(options.speed !== undefined && { speed: options.speed }),
|
|
751
758
|
...(options.projectId !== undefined && { project_id: options.projectId }),
|
|
759
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
760
|
+
// undefined (use the project default) is omitted.
|
|
761
|
+
...(options.dictionaryIds !== undefined && { dictionary_ids: options.dictionaryIds }),
|
|
752
762
|
}));
|
|
753
763
|
};
|
|
754
764
|
|
|
@@ -779,6 +789,7 @@ class TTSResource {
|
|
|
779
789
|
generationMs: data.gen_ms,
|
|
780
790
|
rtf: data.rtf,
|
|
781
791
|
error: data.error,
|
|
792
|
+
usage: parseSessionUsage(data) ?? undefined,
|
|
782
793
|
};
|
|
783
794
|
callbacks.onFinal?.(stats);
|
|
784
795
|
ws.close();
|
|
@@ -977,7 +988,11 @@ class MultiContextSession {
|
|
|
977
988
|
private config: import('./types').MultiContextConfig;
|
|
978
989
|
private callbacks: import('./types').MultiContextCallbacks = {};
|
|
979
990
|
private contexts: Set<string> = new Set();
|
|
991
|
+
/** Contexts a create message has been sent for (not yet necessarily
|
|
992
|
+
* confirmed by the server via context_created). */
|
|
993
|
+
private requestedContexts: Set<string> = new Set();
|
|
980
994
|
private _sessionId: string | null = null;
|
|
995
|
+
private _contextUsage: Map<string, import('./types').SessionUsage> = new Map();
|
|
981
996
|
private isStarted = false;
|
|
982
997
|
|
|
983
998
|
constructor(
|
|
@@ -994,6 +1009,20 @@ class MultiContextSession {
|
|
|
994
1009
|
return this._sessionId;
|
|
995
1010
|
}
|
|
996
1011
|
|
|
1012
|
+
/**
|
|
1013
|
+
* Per-context usage (audio time + amount charged) for a closed context, or
|
|
1014
|
+
* null if that context hasn't closed yet. Each context is its own
|
|
1015
|
+
* conversation — use this to bill per conversation. See {@link SessionUsage}.
|
|
1016
|
+
*/
|
|
1017
|
+
usageFor(contextId: string): import('./types').SessionUsage | null {
|
|
1018
|
+
return this._contextUsage.get(contextId) ?? null;
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
/** Map of context_id → per-context usage for all closed contexts. */
|
|
1022
|
+
get contextUsage(): Map<string, import('./types').SessionUsage> {
|
|
1023
|
+
return new Map(this._contextUsage);
|
|
1024
|
+
}
|
|
1025
|
+
|
|
997
1026
|
/**
|
|
998
1027
|
* Connect to the multi-context WebSocket endpoint.
|
|
999
1028
|
*
|
|
@@ -1063,13 +1092,25 @@ class MultiContextSession {
|
|
|
1063
1092
|
this.callbacks.onChunk?.(chunk);
|
|
1064
1093
|
}
|
|
1065
1094
|
|
|
1095
|
+
if (data.final && data.context_id) {
|
|
1096
|
+
// Per-context end-of-audio marker (KUG-1238): all audio admitted
|
|
1097
|
+
// before the client's flush has been delivered; also precedes
|
|
1098
|
+
// context_closed on a graceful close.
|
|
1099
|
+
this.callbacks.onFinal?.(data.context_id);
|
|
1100
|
+
}
|
|
1101
|
+
|
|
1066
1102
|
if (data.context_closed) {
|
|
1067
1103
|
this.contexts.delete(data.context_id);
|
|
1068
|
-
this.
|
|
1104
|
+
this.requestedContexts.delete(data.context_id);
|
|
1105
|
+
// Per-context (per-conversation) usage rides on context_closed.
|
|
1106
|
+
const ctxUsage = parseSessionUsage(data) ?? undefined;
|
|
1107
|
+
if (ctxUsage) this._contextUsage.set(data.context_id, ctxUsage);
|
|
1108
|
+
this.callbacks.onContextClosed?.(data.context_id, ctxUsage);
|
|
1069
1109
|
}
|
|
1070
1110
|
|
|
1071
1111
|
if (data.context_timeout) {
|
|
1072
1112
|
this.contexts.delete(data.context_id);
|
|
1113
|
+
this.requestedContexts.delete(data.context_id);
|
|
1073
1114
|
this.callbacks.onContextTimeout?.(data.context_id);
|
|
1074
1115
|
}
|
|
1075
1116
|
|
|
@@ -1124,6 +1165,7 @@ class MultiContextSession {
|
|
|
1124
1165
|
this.ws = null;
|
|
1125
1166
|
this.isStarted = false;
|
|
1126
1167
|
this.contexts.clear();
|
|
1168
|
+
this.requestedContexts.clear();
|
|
1127
1169
|
};
|
|
1128
1170
|
});
|
|
1129
1171
|
}
|
|
@@ -1141,6 +1183,7 @@ class MultiContextSession {
|
|
|
1141
1183
|
if (!this.ws || this.ws.readyState !== WS_OPEN) {
|
|
1142
1184
|
throw new KugelAudioError('WebSocket not connected');
|
|
1143
1185
|
}
|
|
1186
|
+
this.requestedContexts.add(contextId);
|
|
1144
1187
|
|
|
1145
1188
|
const msg: Record<string, unknown> = {
|
|
1146
1189
|
text: ' ',
|
|
@@ -1151,26 +1194,36 @@ class MultiContextSession {
|
|
|
1151
1194
|
if (!this.isStarted) {
|
|
1152
1195
|
warnIfNoLanguage(this.config.language, this.config.normalize);
|
|
1153
1196
|
if (this.config.sampleRate) msg.sample_rate = this.config.sampleRate;
|
|
1197
|
+
if (this.config.outputFormat) msg.output_format = this.config.outputFormat;
|
|
1154
1198
|
if (this.config.cfgScale) msg.cfg_scale = this.config.cfgScale;
|
|
1155
1199
|
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
1156
1200
|
if (this.config.maxNewTokens) msg.max_new_tokens = this.config.maxNewTokens;
|
|
1157
1201
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
1158
1202
|
if (this.config.language) msg.language = this.config.language;
|
|
1203
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
1204
|
+
// undefined (use the project default) is omitted.
|
|
1205
|
+
if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
|
|
1159
1206
|
if (this.config.inactivityTimeout) msg.inactivity_timeout = this.config.inactivityTimeout;
|
|
1160
1207
|
}
|
|
1161
1208
|
|
|
1162
|
-
// Per-context voice
|
|
1209
|
+
// Per-context voice. The server binds a context's voice ONLY from
|
|
1210
|
+
// voice_settings.voice_id at context creation — a top-level voice_id
|
|
1211
|
+
// merely updates the session config and leaves the context voiceless,
|
|
1212
|
+
// which the server rejects with MISSING_VOICE_ID on the first text
|
|
1213
|
+
// (KUG-1233). This matches the Python SDK's wire format.
|
|
1214
|
+
const voiceSettings: Record<string, unknown> = {};
|
|
1163
1215
|
const voiceId = options?.voiceId || this.config.defaultVoiceId;
|
|
1164
|
-
if (voiceId)
|
|
1216
|
+
if (voiceId) voiceSettings.voice_id = voiceId;
|
|
1165
1217
|
|
|
1166
1218
|
if (options?.voiceSettings) {
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1219
|
+
voiceSettings.stability = options.voiceSettings.stability;
|
|
1220
|
+
voiceSettings.similarity_boost = options.voiceSettings.similarityBoost;
|
|
1221
|
+
voiceSettings.style = options.voiceSettings.style;
|
|
1222
|
+
voiceSettings.use_speaker_boost = options.voiceSettings.useSpeakerBoost;
|
|
1223
|
+
voiceSettings.speed = options.voiceSettings.speed;
|
|
1224
|
+
}
|
|
1225
|
+
if (Object.keys(voiceSettings).length > 0) {
|
|
1226
|
+
msg.voice_settings = voiceSettings;
|
|
1174
1227
|
}
|
|
1175
1228
|
|
|
1176
1229
|
this.ws.send(JSON.stringify(msg));
|
|
@@ -1184,8 +1237,12 @@ class MultiContextSession {
|
|
|
1184
1237
|
throw new KugelAudioError('WebSocket not connected');
|
|
1185
1238
|
}
|
|
1186
1239
|
|
|
1187
|
-
// Auto-create context if needed
|
|
1188
|
-
|
|
1240
|
+
// Auto-create context if needed. Tracked via requestedContexts (sent
|
|
1241
|
+
// creates, not yet necessarily confirmed) rather than this.contexts
|
|
1242
|
+
// (server-confirmed) — otherwise a send() to a new context after the
|
|
1243
|
+
// session started goes out bare, and the server auto-creates the
|
|
1244
|
+
// context without voice_settings → MISSING_VOICE_ID (KUG-1233).
|
|
1245
|
+
if (!this.requestedContexts.has(contextId) && !this.contexts.has(contextId)) {
|
|
1189
1246
|
this.createContext(contextId);
|
|
1190
1247
|
}
|
|
1191
1248
|
|
|
@@ -1251,6 +1308,7 @@ class MultiContextSession {
|
|
|
1251
1308
|
this.ws = null;
|
|
1252
1309
|
this.isStarted = false;
|
|
1253
1310
|
this.contexts.clear();
|
|
1311
|
+
this.requestedContexts.clear();
|
|
1254
1312
|
}
|
|
1255
1313
|
|
|
1256
1314
|
/**
|
|
@@ -1303,6 +1361,7 @@ class StreamingSession {
|
|
|
1303
1361
|
private callbacks: StreamingSessionCallbacks;
|
|
1304
1362
|
private client: KugelAudio;
|
|
1305
1363
|
private configSent = false;
|
|
1364
|
+
private _lastUsage: import('./types').SessionUsage | null = null;
|
|
1306
1365
|
|
|
1307
1366
|
constructor(client: KugelAudio, config: StreamConfig, callbacks: StreamingSessionCallbacks) {
|
|
1308
1367
|
this.client = client;
|
|
@@ -1310,6 +1369,15 @@ class StreamingSession {
|
|
|
1310
1369
|
this.callbacks = callbacks;
|
|
1311
1370
|
}
|
|
1312
1371
|
|
|
1372
|
+
/**
|
|
1373
|
+
* Per-session usage from the most recently closed session, or null before
|
|
1374
|
+
* the first session closes. Use this to bill your own customers per
|
|
1375
|
+
* conversation. See {@link SessionUsage}.
|
|
1376
|
+
*/
|
|
1377
|
+
get lastUsage(): import('./types').SessionUsage | null {
|
|
1378
|
+
return this._lastUsage;
|
|
1379
|
+
}
|
|
1380
|
+
|
|
1313
1381
|
/**
|
|
1314
1382
|
* Open the WebSocket connection and authenticate.
|
|
1315
1383
|
*
|
|
@@ -1389,7 +1457,18 @@ class StreamingSession {
|
|
|
1389
1457
|
this.callbacks.onInterrupted?.();
|
|
1390
1458
|
}
|
|
1391
1459
|
|
|
1460
|
+
if (data.final) {
|
|
1461
|
+
// End-of-audio marker for the turn (KUG-1238) — arrives after
|
|
1462
|
+
// the last audio frame and before session_closed.
|
|
1463
|
+
this.callbacks.onFinal?.(
|
|
1464
|
+
data.total_audio_seconds ?? 0,
|
|
1465
|
+
data.total_text_chunks ?? 0,
|
|
1466
|
+
data.total_audio_chunks ?? 0,
|
|
1467
|
+
);
|
|
1468
|
+
}
|
|
1469
|
+
|
|
1392
1470
|
if (data.session_closed) {
|
|
1471
|
+
this._lastUsage = parseSessionUsage(data);
|
|
1393
1472
|
this.callbacks.onSessionClosed?.(
|
|
1394
1473
|
data.total_audio_seconds ?? 0,
|
|
1395
1474
|
data.total_text_chunks ?? 0,
|
|
@@ -1474,6 +1553,7 @@ class StreamingSession {
|
|
|
1474
1553
|
if (this.config.temperature !== undefined) msg.temperature = this.config.temperature;
|
|
1475
1554
|
if (this.config.maxNewTokens !== undefined) msg.max_new_tokens = this.config.maxNewTokens;
|
|
1476
1555
|
if (this.config.sampleRate !== undefined) msg.sample_rate = this.config.sampleRate;
|
|
1556
|
+
if (this.config.outputFormat !== undefined) msg.output_format = this.config.outputFormat;
|
|
1477
1557
|
if (this.config.flushTimeoutMs !== undefined) msg.flush_timeout_ms = this.config.flushTimeoutMs;
|
|
1478
1558
|
if (this.config.maxBufferLength !== undefined) msg.max_buffer_length = this.config.maxBufferLength;
|
|
1479
1559
|
if (this.config.normalize !== undefined) msg.normalize = this.config.normalize;
|
|
@@ -1482,6 +1562,9 @@ class StreamingSession {
|
|
|
1482
1562
|
if (this.config.autoMode !== undefined) msg.auto_mode = this.config.autoMode;
|
|
1483
1563
|
if (this.config.chunkLengthSchedule?.length) msg.chunk_length_schedule = this.config.chunkLengthSchedule;
|
|
1484
1564
|
if (this.config.speed !== undefined) msg.speed = this.config.speed;
|
|
1565
|
+
// [] is meaningful (explicit opt-out) and must be sent; only
|
|
1566
|
+
// undefined (use the project default) is omitted.
|
|
1567
|
+
if (this.config.dictionaryIds !== undefined) msg.dictionary_ids = this.config.dictionaryIds;
|
|
1485
1568
|
this.configSent = true;
|
|
1486
1569
|
}
|
|
1487
1570
|
|
package/src/index.ts
CHANGED
|
@@ -62,6 +62,7 @@ export type {
|
|
|
62
62
|
MultiContextAudioChunk,
|
|
63
63
|
MultiContextCallbacks,
|
|
64
64
|
MultiContextConfig,
|
|
65
|
+
SessionUsage,
|
|
65
66
|
StreamCallbacks,
|
|
66
67
|
StreamConfig,
|
|
67
68
|
StreamingSessionCallbacks,
|
|
@@ -78,6 +79,7 @@ export type {
|
|
|
78
79
|
VoiceSex,
|
|
79
80
|
WordTimestamp
|
|
80
81
|
} from './types';
|
|
82
|
+
export { parseSessionUsage } from './types';
|
|
81
83
|
|
|
82
84
|
export { DictionariesResource, DictionaryEntriesResource } from './dictionaries';
|
|
83
85
|
|