@convbased/sdk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +235 -0
- package/dist/cjs/client.js +635 -0
- package/dist/cjs/client.js.map +1 -0
- package/dist/cjs/endpoints.js +10 -0
- package/dist/cjs/endpoints.js.map +1 -0
- package/dist/cjs/events.js +39 -0
- package/dist/cjs/events.js.map +1 -0
- package/dist/cjs/graphql.js +40 -0
- package/dist/cjs/graphql.js.map +1 -0
- package/dist/cjs/index.js +24 -0
- package/dist/cjs/index.js.map +1 -0
- package/dist/cjs/package.json +3 -0
- package/dist/cjs/rtcServers.js +35 -0
- package/dist/cjs/rtcServers.js.map +1 -0
- package/dist/cjs/sdp.js +37 -0
- package/dist/cjs/sdp.js.map +1 -0
- package/dist/cjs/signaling.js +146 -0
- package/dist/cjs/signaling.js.map +1 -0
- package/dist/cjs/tts.js +227 -0
- package/dist/cjs/tts.js.map +1 -0
- package/dist/cjs/types.js +26 -0
- package/dist/cjs/types.js.map +1 -0
- package/dist/cjs/upload.js +87 -0
- package/dist/cjs/upload.js.map +1 -0
- package/dist/client.d.ts +169 -0
- package/dist/client.d.ts.map +1 -0
- package/dist/client.js +631 -0
- package/dist/client.js.map +1 -0
- package/dist/convbased-sdk.global.js +1291 -0
- package/dist/endpoints.d.ts +3 -0
- package/dist/endpoints.d.ts.map +1 -0
- package/dist/endpoints.js +7 -0
- package/dist/endpoints.js.map +1 -0
- package/dist/events.d.ts +9 -0
- package/dist/events.d.ts.map +1 -0
- package/dist/events.js +35 -0
- package/dist/events.js.map +1 -0
- package/dist/graphql.d.ts +18 -0
- package/dist/graphql.d.ts.map +1 -0
- package/dist/graphql.js +37 -0
- package/dist/graphql.js.map +1 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/rtcServers.d.ts +13 -0
- package/dist/rtcServers.d.ts.map +1 -0
- package/dist/rtcServers.js +31 -0
- package/dist/rtcServers.js.map +1 -0
- package/dist/sdp.d.ts +6 -0
- package/dist/sdp.d.ts.map +1 -0
- package/dist/sdp.js +34 -0
- package/dist/sdp.js.map +1 -0
- package/dist/signaling.d.ts +33 -0
- package/dist/signaling.d.ts.map +1 -0
- package/dist/signaling.js +142 -0
- package/dist/signaling.js.map +1 -0
- package/dist/tts.d.ts +111 -0
- package/dist/tts.d.ts.map +1 -0
- package/dist/tts.js +223 -0
- package/dist/tts.js.map +1 -0
- package/dist/types.d.ts +194 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +23 -0
- package/dist/types.js.map +1 -0
- package/dist/upload.d.ts +46 -0
- package/dist/upload.d.ts.map +1 -0
- package/dist/upload.js +82 -0
- package/dist/upload.js.map +1 -0
- package/package.json +57 -0
- package/src/client.ts +839 -0
- package/src/endpoints.ts +8 -0
- package/src/events.ts +38 -0
- package/src/graphql.ts +58 -0
- package/src/index.ts +50 -0
- package/src/rtcServers.ts +38 -0
- package/src/sdp.ts +45 -0
- package/src/signaling.ts +172 -0
- package/src/tts.ts +364 -0
- package/src/types.ts +201 -0
- package/src/upload.ts +132 -0
package/src/tts.ts
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
// Text-to-speech client for the Convbased IndexTTS2 service. This path is pure
|
|
2
|
+
// GraphQL — it does not touch WebRTC or the signaling socket. Synthesis is
|
|
3
|
+
// asynchronous: you submit a job, then poll until it reaches a terminal state.
|
|
4
|
+
//
|
|
5
|
+
// Typical flow:
|
|
6
|
+
// const tts = new TtsClient({ apiKey });
|
|
7
|
+
// const { key } = await tts.uploadReferenceAudio(file); // reference voice
|
|
8
|
+
// const result = await tts.synthesize({ referenceKey: key, text: "你好" });
|
|
9
|
+
// audio.src = result.url; // presigned, ~1h
|
|
10
|
+
|
|
11
|
+
import { DEFAULT_GRAPHQL_URL } from "./endpoints.js";
|
|
12
|
+
import { graphqlRequest } from "./graphql.js";
|
|
13
|
+
import { uploadAudio } from "./upload.js";
|
|
14
|
+
|
|
15
|
+
/**
 * Emotion and sampling knobs for a synthesis request. Every field is
 * optional and the object is forwarded verbatim to IndexTTS2.
 */
export interface TtsParams {
  emo_alpha?: number;
  emo_vector?: number[];
  use_emo_text?: boolean;
  emo_text?: string;
  temperature?: number;
  top_p?: number;
  top_k?: number;
}
|
|
25
|
+
|
|
26
|
+
/**
 * States a TTS job can report. `TtsClient.synthesize` stops polling on
 * `done`, `failed` and `cancelled`; the other three mean "keep waiting".
 */
export type TtsJobStatus =
  | "queued"
  | "warming"
  | "processing"
  | "done"
  | "failed"
  | "cancelled";
|
|
33
|
+
|
|
34
|
+
/** Outcome of a finished synthesis job, including what it cost. */
export interface TtsResult {
  /** COS object key under which the synthesized audio is stored. */
  key: string;
  /** Presigned download URL for the audio (valid ~1h); `null` when unsigned. */
  url: string | null;
  /** Number of input-text tokens the charge was computed from. */
  tokenCount: number;
  /** Length of the synthesized audio, in seconds. */
  audioDurationSec: number;
  /** Amount taken from the wallet for this synthesis. */
  amountCharged: number;
  /** Wallet balance remaining after the deduction. */
  balanceAfter: number;
}
|
|
48
|
+
|
|
49
|
+
/** Client-side (camelCase) snapshot of a TTS job. */
export interface TtsJob {
  jobId: string;
  status: TtsJobStatus;
  /** Queue position, 1-based, while the job is `queued`; 0 in every other state. */
  position: number;
  /** Synthesis result; filled in once `status === "done"`. */
  result: TtsResult | null;
  /** Error key reported by the server when `status === "failed"`. */
  error: string | null;
}
|
|
59
|
+
|
|
60
|
+
/** Billing parameters returned by `TtsClient.getPricing`. */
export interface TtsPricing {
  /** Cost of a single input-text token. */
  pricePerToken: number;
  /** Floor applied to the charge of any one synthesis. */
  minCharge: number;
}
|
|
66
|
+
|
|
67
|
+
/** Construction options for `TtsClient`. One of `apiKey` / `accessToken` is mandatory. */
export interface TtsClientOptions {
  /** Convbased API key. May be omitted only when `accessToken` is given. */
  apiKey?: string;
  /** Short-lived JWT access token, usable instead of `apiKey`. */
  accessToken?: string;
  /**
   * GraphQL endpoint to talk to. When omitted, the production Convbased
   * endpoint (`https://api.weights.chat/api/v1/graphql`) is used; set it for
   * self-hosted deployments.
   */
  graphqlUrl?: string;
  /** Optional logger; when omitted only warn/error go to `console`. */
  logger?: Partial<Pick<Console, "debug" | "info" | "warn" | "error">>;
}
|
|
81
|
+
|
|
82
|
+
/** Input for `TtsClient.submit`. */
export interface SubmitTtsOptions {
  /** COS key of a reference voice that was uploaded beforehand (see `uploadReferenceAudio`). */
  referenceKey: string;
  /** The text to turn into speech. */
  text: string;
  /** Emotion / sampling controls; optional. */
  params?: TtsParams;
}
|
|
90
|
+
|
|
91
|
+
/** Input for the one-call `TtsClient.synthesize` helper. */
export interface SynthesizeOptions {
  /** COS key of a reference voice uploaded earlier. Supply this or `referenceAudio`. */
  referenceKey?: string;
  /** Reference-voice `Blob`/`File` that is uploaded first. Supply this or `referenceKey`. */
  referenceAudio?: Blob;
  /** The text to turn into speech. */
  text: string;
  /** Emotion / sampling controls; optional. */
  params?: TtsParams;
  /** Milliseconds between status polls. Default 1500. */
  pollIntervalMs?: number;
  /** Stop waiting after this many milliseconds. Default 300_000 (5 min). */
  timeoutMs?: number;
  /** Aborts the wait (and cancels the job if it is still queued). */
  signal?: AbortSignal;
  /** Invoked with the newest job snapshot on every poll (status, queue position…). */
  onJob?: (job: TtsJob) => void;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Job shape as it arrives over GraphQL (snake_case). `toJob` converts it to
 * the public camelCase `TtsJob`.
 */
interface TtsJobWire {
  job_id: string;
  status: TtsJobStatus;
  position: number;
  result: {
    key: string;
    url: string | null;
    token_count: number;
    audio_duration_sec: number;
    amount_charged: number;
    balance_after: number;
  } | null;
  error: string | null;
}
|
|
124
|
+
|
|
125
|
+
/**
 * GraphQL selection set shared by every job-returning operation; it must list
 * the same fields as `TtsJobWire`.
 */
const JOB_FIELDS = /* GraphQL */ `
  job_id
  status
  position
  result {
    key
    url
    token_count
    audio_duration_sec
    amount_charged
    balance_after
  }
  error
`;
|
|
139
|
+
|
|
140
|
+
/**
 * GraphQL client for Convbased text-to-speech. Jobs are asynchronous: `submit`
 * enqueues, `getJob` reads the current state, `cancel` stops a queued job, and
 * `synthesize` wraps the whole submit-and-poll cycle in one call.
 */
export class TtsClient {
  private readonly graphqlUrl: string;
  private readonly apiKey?: string;
  private readonly accessToken?: string;
  private readonly logger: Pick<Console, "debug" | "info" | "warn" | "error">;

  /**
   * @throws Error when neither `apiKey` nor `accessToken` is supplied.
   */
  constructor(options: TtsClientOptions) {
    if (!options.apiKey && !options.accessToken) {
      throw new Error("TtsClient requires either `apiKey` or `accessToken`");
    }
    this.graphqlUrl = options.graphqlUrl ?? DEFAULT_GRAPHQL_URL;
    this.apiKey = options.apiKey;
    this.accessToken = options.accessToken;
    const provided = options.logger ?? {};
    // Bind to the caller's logger object so class-based loggers keep their
    // `this`; debug/info default to no-ops, warn/error default to `console`.
    this.logger = {
      debug: provided.debug?.bind(provided) ?? (() => {}),
      info: provided.info?.bind(provided) ?? (() => {}),
      warn: provided.warn?.bind(provided) ?? console.warn.bind(console),
      error: provided.error?.bind(provided) ?? console.error.bind(console),
    };
  }

  /** Endpoint and credentials shared by every GraphQL / upload call. */
  private endpoint() {
    return {
      graphqlUrl: this.graphqlUrl,
      apiKey: this.apiKey,
      accessToken: this.accessToken,
    };
  }

  /** Upload a reference-voice `Blob`/`File` and resolve its COS key. */
  async uploadReferenceAudio(
    file: Blob,
    opts?: { filename?: string; contentType?: string; signal?: AbortSignal }
  ): Promise<{ key: string }> {
    return uploadAudio({
      ...this.endpoint(),
      file,
      filename: opts?.filename,
      contentType: opts?.contentType,
      signal: opts?.signal,
    });
  }

  /** Current billing rule: `cost = max(tokens * pricePerToken, minCharge)`. */
  async getPricing(signal?: AbortSignal): Promise<TtsPricing> {
    const data = await graphqlRequest<{
      ttsPricing: { price_per_token: number; min_charge: number };
    }>({
      ...this.endpoint(),
      signal,
      query: /* GraphQL */ `
        query {
          ttsPricing {
            price_per_token
            min_charge
          }
        }
      `,
    });
    return {
      pricePerToken: data.ttsPricing.price_per_token,
      minCharge: data.ttsPricing.min_charge,
    };
  }

  /** Enqueue a synthesis job; resolves immediately with the queued job. */
  async submit(opts: SubmitTtsOptions, signal?: AbortSignal): Promise<TtsJob> {
    const data = await graphqlRequest<{ submitTts: TtsJobWire }>({
      ...this.endpoint(),
      signal,
      query: /* GraphQL */ `
        mutation SubmitTts($input: SynthesizeTtsInput!) {
          submitTts(input: $input) {
            ${JOB_FIELDS}
          }
        }
      `,
      variables: {
        input: {
          reference_key: opts.referenceKey,
          text: opts.text,
          params: opts.params ?? null,
        },
      },
    });
    return toJob(data.submitTts);
  }

  /** Read the current status/result of a job. */
  async getJob(jobId: string, signal?: AbortSignal): Promise<TtsJob> {
    const data = await graphqlRequest<{ ttsJob: TtsJobWire }>({
      ...this.endpoint(),
      signal,
      query: /* GraphQL */ `
        query TtsJob($jobId: String!) {
          ttsJob(jobId: $jobId) {
            ${JOB_FIELDS}
          }
        }
      `,
      variables: { jobId },
    });
    return toJob(data.ttsJob);
  }

  /** Cancel a job. Only effective while it is still `queued`. */
  async cancel(jobId: string, signal?: AbortSignal): Promise<TtsJob> {
    const data = await graphqlRequest<{ cancelTtsJob: TtsJobWire }>({
      ...this.endpoint(),
      signal,
      query: /* GraphQL */ `
        mutation CancelTtsJob($jobId: String!) {
          cancelTtsJob(jobId: $jobId) {
            ${JOB_FIELDS}
          }
        }
      `,
      variables: { jobId },
    });
    return toJob(data.cancelTtsJob);
  }

  /**
   * One-call synthesis: (optionally upload the reference voice,) submit, then
   * poll until the job finishes.
   *
   * @returns The `TtsResult` of the finished job.
   * @throws Error when neither `referenceKey` nor `referenceAudio` is usable,
   *   when the job ends `failed` or `cancelled`, when it is `done` without a
   *   result, or when `timeoutMs` elapses (the job is NOT cancelled on timeout).
   * @throws DOMException (`AbortError`) when `signal` aborts; in that case a
   *   best-effort cancel is sent for the job.
   */
  async synthesize(opts: SynthesizeOptions): Promise<TtsResult> {
    const pollIntervalMs = opts.pollIntervalMs ?? 1500;
    const timeoutMs = opts.timeoutMs ?? 300_000;
    const deadline = Date.now() + timeoutMs;

    // Truthiness, not `??`: an empty-string key counts as "no key" and must
    // fall through to the upload instead of being submitted as-is.
    let referenceKey = opts.referenceKey;
    if (!referenceKey) {
      if (!opts.referenceAudio) {
        throw new Error(
          "synthesize() requires either `referenceKey` or `referenceAudio`"
        );
      }
      const uploaded = await this.uploadReferenceAudio(opts.referenceAudio, {
        signal: opts.signal,
      });
      referenceKey = uploaded.key;
    }

    const submitted = await this.submit(
      { referenceKey, text: opts.text, params: opts.params },
      opts.signal
    );
    opts.onJob?.(submitted);

    let job = submitted;
    try {
      while (job.status !== "done") {
        if (opts.signal?.aborted) {
          throw new DOMException("Aborted", "AbortError");
        }
        if (job.status === "failed") {
          throw new Error(job.error || "TTS job failed");
        }
        if (job.status === "cancelled") {
          throw new Error("TTS job was cancelled");
        }
        if (Date.now() > deadline) {
          throw new Error(
            `Timed out waiting for TTS job ${job.jobId} after ${timeoutMs}ms`
          );
        }
        await delay(pollIntervalMs, opts.signal);
        job = await this.getJob(job.jobId, opts.signal);
        opts.onJob?.(job);
      }
    } catch (err) {
      // Best-effort: stop a still-queued job so we don't pay for a result
      // nobody is waiting for. Sent without the (already aborted) signal.
      if (opts.signal?.aborted) {
        void this.cancel(job.jobId).catch((cancelErr: unknown) => {
          this.logger.debug(
            `Best-effort cancel of TTS job ${job.jobId} failed`,
            cancelErr
          );
        });
      }
      throw err;
    }

    if (!job.result) {
      throw new Error("TTS job is done but carried no result");
    }
    return job.result;
  }
}
|
|
328
|
+
|
|
329
|
+
/**
 * Convert a snake_case wire job into the public camelCase `TtsJob`. A missing
 * (falsy) wire result maps to `null`.
 */
function toJob(wire: TtsJobWire): TtsJob {
  const raw = wire.result;
  let result: TtsJob["result"] = null;
  if (raw) {
    result = {
      key: raw.key,
      url: raw.url,
      tokenCount: raw.token_count,
      audioDurationSec: raw.audio_duration_sec,
      amountCharged: raw.amount_charged,
      balanceAfter: raw.balance_after,
    };
  }
  return {
    jobId: wire.job_id,
    status: wire.status,
    position: wire.position,
    result,
    error: wire.error,
  };
}
|
|
347
|
+
|
|
348
|
+
/**
 * Resolve after `ms` milliseconds, or reject with an `AbortError`
 * `DOMException` as soon as `signal` aborts (immediately if it already has).
 * The timer and the abort listener are each cleaned up by whichever fires first.
 */
function delay(ms: number, signal?: AbortSignal): Promise<void> {
  const abortError = (): DOMException => new DOMException("Aborted", "AbortError");

  if (signal?.aborted) {
    return Promise.reject(abortError());
  }

  return new Promise<void>((resolve, reject) => {
    function handleAbort(): void {
      clearTimeout(handle);
      reject(abortError());
    }
    const handle = setTimeout(() => {
      signal?.removeEventListener("abort", handleAbort);
      resolve();
    }, ms);
    signal?.addEventListener("abort", handleAbort, { once: true });
  });
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
// Wire-protocol types mirrored from ServerAPI / signaling.
|
|
2
|
+
|
|
3
|
+
/** One ICE server entry: its URLs plus optional credentials. */
export interface RTCServersConfig {
  urls: string[];
  username?: string;
  credential?: string;
}
|
|
8
|
+
|
|
9
|
+
/**
 * Preferences sent with a live-session `offer` (and partially with `config`).
 * Only `model_id` and `sample_rate` are required; the index signature lets
 * additional keys through untyped.
 */
export interface RTCPreferences {
  model_id: string;
  sample_rate: number;
  pitch?: number;
  rms_mix_rate?: number;
  f0_threshold?: number;
  block_time?: number;
  crossfade_time?: number;
  extra_time?: number;
  f0_autotune?: boolean;
  f0_autotune_strength?: number;
  proposed_pitch?: boolean;
  proposed_pitch_threshold?: number;
  enable_limiter?: boolean;
  limiter_threshold?: number;
  enable_lookahead?: boolean;
  lookahead_time?: number;
  formant?: number;
  index_rate?: number;
  protect?: number;
  threshold?: number;
  [key: string]: unknown;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Per-task settings for offline file inference (voice-to-voice). They travel
 * verbatim to the inference node with `task_start`. This is a separate shape
 * from the live `RTCPreferences`: file inference additionally exposes
 * `f0_method`, `use_pv`, etc.
 */
export interface FileInferencePreferences {
  pitch?: number;
  f0_method?: "rmvpe" | "fcpe";
  f0_threshold?: number;
  index_rate?: number;
  protect?: number;
  f0_autotune?: boolean;
  f0_autotune_strength?: number;
  proposed_pitch?: boolean;
  proposed_pitch_threshold?: number;
  sample_rate?: number;
  formant?: number;
  block_time?: number;
  crossfade_time?: number;
  extra_time?: number;
  use_pv?: boolean;
  rms_mix_rate?: number;
  threshold?: number;
  enable_limiter?: boolean;
  limiter_threshold?: number;
  enable_lookahead?: boolean;
  lookahead_time?: number;
  [key: string]: unknown;
}
|
|
62
|
+
|
|
63
|
+
/** Messages the client sends over the signaling socket, discriminated by `type`. */
export type OutgoingMessage =
  | { type: "offer"; sdp?: string; preferences: RTCPreferences }
  | { type: "ice_candidate"; candidate: RTCIceCandidateInit }
  | { type: "config"; preferences: Partial<RTCPreferences> }
  | {
      type: "task_start";
      task_id: string;
      audio_key: string;
      generate_name?: string;
      format?: string;
      preferences?: FileInferencePreferences;
    }
  | { type: "task_stop"; task_id?: string }
  | { type: "exit" }
  | { type: "ping" }
  | { type: "pong" };
|
|
79
|
+
|
|
80
|
+
/** Numeric status codes carried in signaling messages, grouped by thousands. */
export enum RTCStatusCode {
  // 2xxx
  ERROR = 2000,
  GPU_INSUFFICIENT = 2001,
  DUPLICATE_CONNECTION = 2002,
  MODEL_NOT_FOUND = 2003,
  UNPAID_SERVICE = 2004,
  REQUEST_TOO_FAST = 2005,

  // 3xxx
  CONNECTED = 3000,
  REQUEST_RECEIVED = 3001,
  TRACK_READY = 3002,
  RESPONSE_SENT = 3003,
  LOADING_MODEL = 3004,
  SERVICE_READY = 3009,

  // Task lifecycle codes for file inference (voice-to-voice).
  TASK_PROGRESS = 3010,
  TASK_FINISHED = 3011,
  TASK_ACK = 3012,

  // 4xxx / 5xxx
  SHUTDOWN = 4000,
  SERVER_CLOSED = 5000,
}
|
|
103
|
+
|
|
104
|
+
/** Final outcome reported in a `task_finished` message. */
export type TaskStatus =
  | "success"
  | "failure"
  | "cancelled";
|
|
105
|
+
|
|
106
|
+
/**
 * Messages received over the signaling socket. Known shapes are listed
 * explicitly; the trailing `Record<string, unknown>` also admits any other
 * object.
 */
export type IncomingMessage =
  | {
      type: "message" | "shutdown" | "error";
      message?: string;
      code?: number;
    }
  | { type: "answer"; sdp: string }
  | { type: "ice_candidate"; candidate: RTCIceCandidateInit }
  | {
      type: "task_ack";
      task_id: string;
      status: "queued" | "started";
      queue_position?: number;
      code?: number;
    }
  | { type: "task_progress"; task_id: string; progress: number; code?: number }
  | {
      type: "task_finished";
      task_id: string;
      status: TaskStatus;
      result_key?: string;
      download_url?: string;
      error?: string;
      code?: number;
    }
  | { type: "ping" }
  | { type: "pong" }
  | Record<string, unknown>;
|
|
134
|
+
|
|
135
|
+
/** Connection lifecycle states exposed by the client. */
export type ConnectionState =
  | "idle"
  | "signaling"
  | "negotiating"
  | "connecting"
  | "connected"
  | "closing"
  | "closed"
  | "error";
|
|
144
|
+
|
|
145
|
+
/** Construction options for the real-time Convbased client. */
export interface ConvbasedClientOptions {
  /** API key issued in the Convbased Web console. May be omitted only when `accessToken` is given. */
  apiKey?: string;
  /** Optional JWT access token; `apiKey` takes precedence when both are supplied. */
  accessToken?: string;
  /**
   * Signaling WebSocket URL. When omitted, the production Convbased endpoint
   * (`wss://api.weights.chat/api/signaling/ws`) is used; override only for
   * self-hosted deployments. A URL ending in `/ws` is used unchanged, while a
   * bare host has `/signaling/ws` appended.
   */
  signalingUrl?: string;
  /**
   * GraphQL endpoint from which TURN credentials are fetched. When omitted,
   * the production Convbased endpoint
   * (`https://api.weights.chat/api/v1/graphql`) is used. `false` switches the
   * auto-fetch off entirely (falling back to `iceServers` if provided, else
   * public STUN).
   */
  graphqlUrl?: string | false;
  /** Fixed ICE servers. When omitted and `graphqlUrl` is set, the SDK fetches `rtcServers`. */
  iceServers?: RTCServersConfig[];
  /** `relay` forces TURN-only, which helps when STUN is blocked. */
  iceTransportPolicy?: RTCIceTransportPolicy;
  /** Opus bitrate, in kbps. Default 64. */
  bitrate?: number;
  /** Whether to send stereo. Default false. */
  stereo?: boolean;
  /** Milliseconds to wait for `SERVICE_READY` after the offer is sent. Default 120_000. */
  signalingTimeoutMs?: number;
  /** Milliseconds to wait for the initial WebSocket open. Default 20_000. */
  connectTimeoutMs?: number;
  /** Optional logger; when omitted only warn/error go to `console`. */
  logger?: Partial<Pick<Console, "debug" | "info" | "warn" | "error">>;
}
|
|
179
|
+
|
|
180
|
+
/** Per-connection options. */
export interface ConnectOptions {
  /** ID of the model the inference node should load. Required. */
  modelId: string;
  /** Microphone constraints, or an existing `MediaStream` to bypass getUserMedia altogether. */
  audio?: MediaStream | MediaTrackConstraints | boolean;
  /** Extra RVC preferences passed on to the node. */
  preferences?: Partial<Omit<RTCPreferences, "model_id" | "sample_rate">>;
  /** Sample rate advertised to the node. Default: the AudioContext's `sampleRate`, else 48000. */
  sampleRate?: number;
}
|
|
190
|
+
|
|
191
|
+
/**
 * Connection-quality figures: round-trip time (milliseconds, per the field
 * name), jitter, and the count of lost packets.
 */
export interface ConnectionStats {
  rttMs: number;
  jitter: number;
  packetsLost: number;
}
|
|
196
|
+
|
|
197
|
+
/** Event payload for a server message: optional code and text, plus the raw message. */
export interface ServerMessageEvent {
  code?: number;
  message?: string;
  raw: IncomingMessage;
}
|
package/src/upload.ts
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
// Audio upload helpers shared by TTS (reference voice) and file inference
|
|
2
|
+
// (source audio). Two steps mirror Convbased-Web: ask the GraphQL service for
|
|
3
|
+
// a presigned PUT (`requestAudioUpload`), then PUT the bytes straight to object
|
|
4
|
+
// storage. The returned COS `key` is what you hand to `submitTts` /
|
|
5
|
+
// `startTask`.
|
|
6
|
+
|
|
7
|
+
import { graphqlRequest, type GraphQLAuth } from "./graphql.js";
|
|
8
|
+
|
|
9
|
+
/** A single HTTP header as a name/value pair. */
export interface HeaderKV {
  name: string;
  value: string;
}
|
|
13
|
+
|
|
14
|
+
/** Presigned upload target returned by the `requestAudioUpload` mutation. */
export interface PresignedUpload {
  /** COS object key; hand it back to the service (e.g. as `reference_key` / `audio_key`). */
  key: string;
  /** Presigned URL the bytes are uploaded to. */
  upload_url: string;
  /** HTTP method for the upload (always `PUT` today). */
  method: string;
  /** Headers that must accompany the upload request (notably `Content-Type`). */
  headers: HeaderKV[];
  /** Lifetime of the presigned URL, in seconds. */
  expires_in: number;
  bucket?: string | null;
  region?: string | null;
  /** CDN base URL of the bucket, if one is configured. */
  url?: string | null;
}
|
|
30
|
+
|
|
31
|
+
/** GraphQL mutation that asks the service for a presigned audio upload target. */
const REQUEST_AUDIO_UPLOAD = /* GraphQL */ `
  mutation RequestAudioUpload($input: RequestUploadInput!) {
    requestAudioUpload(input: $input) {
      key
      upload_url
      method
      expires_in
      headers {
        name
        value
      }
      bucket
      region
      url
    }
  }
`;
|
|
48
|
+
|
|
49
|
+
/**
 * Run the `requestAudioUpload` mutation and return the presigned upload
 * target for an audio file described by `filename`, `contentType` and `size`.
 */
export async function requestAudioUpload(
  args: GraphQLAuth & {
    graphqlUrl: string;
    filename: string;
    contentType: string;
    size: number;
    signal?: AbortSignal;
  }
): Promise<PresignedUpload> {
  // The wire input uses snake_case for the content type.
  const input = {
    filename: args.filename,
    content_type: args.contentType,
    size: args.size,
  };
  const response = await graphqlRequest<{ requestAudioUpload: PresignedUpload }>({
    graphqlUrl: args.graphqlUrl,
    apiKey: args.apiKey,
    accessToken: args.accessToken,
    signal: args.signal,
    query: REQUEST_AUDIO_UPLOAD,
    variables: { input },
  });
  return response.requestAudioUpload;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Send `body` to a presigned upload target using the method and headers the
 * target prescribes (method falls back to `PUT` when empty; header entries
 * without a name are skipped).
 *
 * @throws Error carrying the HTTP status when the response is not `ok`.
 */
export async function putToPresigned(
  presigned: PresignedUpload,
  body: Blob | ArrayBuffer | ArrayBufferView,
  signal?: AbortSignal
): Promise<void> {
  const headers = (presigned.headers ?? []).reduce<Record<string, string>>(
    (acc, entry) => {
      if (entry?.name) acc[entry.name] = entry.value;
      return acc;
    },
    {}
  );

  const response = await fetch(presigned.upload_url, {
    method: presigned.method || "PUT",
    headers,
    body: body as BodyInit,
    signal,
  });

  if (response.ok) return;
  throw new Error(
    `Audio upload failed: HTTP ${response.status} ${response.statusText}`
  );
}
|
|
98
|
+
|
|
99
|
+
/**
 * Upload an audio `Blob`/`File` end-to-end (presign + PUT) and resolve the COS
 * `key`.
 *
 * The filename is the first non-empty value of `args.filename`, the file's own
 * `name` (present on `File`, absent on a bare `Blob`), and `"audio.wav"`. The
 * content type is resolved the same way from `args.contentType`, the blob's
 * `type`, and `"application/octet-stream"`. Empty strings count as missing for
 * both, so the presign request never carries a blank filename.
 *
 * @returns An object holding the COS key of the uploaded audio.
 * @throws Whatever `requestAudioUpload` or `putToPresigned` reject with.
 */
export async function uploadAudio(
  args: GraphQLAuth & {
    graphqlUrl: string;
    file: Blob;
    filename?: string;
    contentType?: string;
    signal?: AbortSignal;
  }
): Promise<{ key: string }> {
  // A bare Blob has no `name`; `Partial<File>` states that honestly.
  const nameFromFile = (args.file as Partial<File>).name;
  const filename = args.filename || nameFromFile || "audio.wav";
  const contentType =
    args.contentType || args.file.type || "application/octet-stream";

  const presigned = await requestAudioUpload({
    graphqlUrl: args.graphqlUrl,
    apiKey: args.apiKey,
    accessToken: args.accessToken,
    signal: args.signal,
    filename,
    contentType,
    size: args.file.size,
  });

  await putToPresigned(presigned, args.file, args.signal);
  return { key: presigned.key };
}
|