@polytts/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +230 -0
- package/dist/index.d.ts +381 -0
- package/dist/index.js +817 -0
- package/package.json +50 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DengQing dengqing0821@gmail.com
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# @polytts/core
|
|
2
|
+
|
|
3
|
+
[@polytts/core on npm](https://www.npmjs.com/package/@polytts/core)
|
|
4
|
+
|
|
5
|
+
Core runtime and adapter contracts for [`polytts`](https://github.com/Dunqing/polytts).
|
|
6
|
+
|
|
7
|
+
Use this package when you need custom adapters, custom catalogs, or direct runtime orchestration. Most app code should start with `polytts`, `@polytts/browser`, or `@polytts/node` instead.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install @polytts/core
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```ts
|
|
18
|
+
import { createTTSRuntime } from "@polytts/core";
|
|
19
|
+
import { officialAdapters } from "@polytts/browser-adapters";
|
|
20
|
+
import { officialCatalog } from "@polytts/presets";
|
|
21
|
+
|
|
22
|
+
const runtime = createTTSRuntime({
|
|
23
|
+
adapters: officialAdapters,
|
|
24
|
+
catalogs: [officialCatalog],
|
|
25
|
+
initialModelId: "browser-speech",
|
|
26
|
+
});
|
|
27
|
+
|
|
28
|
+
await runtime.prepare("browser-speech");
|
|
29
|
+
await runtime.speak("Hello from the core runtime.");
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Custom models
|
|
33
|
+
|
|
34
|
+
There are two ways to extend `polytts` with new models.
|
|
35
|
+
|
|
36
|
+
### Add a model to an existing family
|
|
37
|
+
|
|
38
|
+
If the model fits one of the built-in adapters (Piper, Kokoro, KittenTTS, Supertonic), add a catalog entry and keep using the official adapter. This is the normal path for another Piper bundle, KittenTTS checkpoint, or any ONNX export that matches an existing adapter contract.
|
|
39
|
+
|
|
40
|
+
```ts
|
|
41
|
+
import { createStaticCatalog, type ModelSpec } from "@polytts/core";
|
|
42
|
+
import { createBrowserTTSRuntime } from "@polytts/browser";
|
|
43
|
+
|
|
44
|
+
const myPiperVoice: ModelSpec = {
|
|
45
|
+
id: "piper-en_US-custom-medium",
|
|
46
|
+
adapterId: "piper",
|
|
47
|
+
name: "Piper Custom",
|
|
48
|
+
family: "piper",
|
|
49
|
+
revision: "v1.0.0",
|
|
50
|
+
license: "mit",
|
|
51
|
+
languages: ["en-US"],
|
|
52
|
+
voiceMode: "per-voice-model",
|
|
53
|
+
distribution: {
|
|
54
|
+
kind: "managed-assets",
|
|
55
|
+
sizeBytes: 63_000_000,
|
|
56
|
+
assets: [
|
|
57
|
+
{
|
|
58
|
+
name: "en_US-custom-medium.onnx",
|
|
59
|
+
url: "https://example.com/en_US-custom-medium.onnx",
|
|
60
|
+
size: 63_000_000,
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
name: "en_US-custom-medium.onnx.json",
|
|
64
|
+
url: "https://example.com/en_US-custom-medium.onnx.json",
|
|
65
|
+
size: 8_192,
|
|
66
|
+
},
|
|
67
|
+
],
|
|
68
|
+
},
|
|
69
|
+
voices: [
|
|
70
|
+
{
|
|
71
|
+
id: "en_US-custom-medium",
|
|
72
|
+
name: "Custom",
|
|
73
|
+
language: "en-US",
|
|
74
|
+
},
|
|
75
|
+
],
|
|
76
|
+
defaultVoiceId: "en_US-custom-medium",
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
const runtime = createBrowserTTSRuntime({
|
|
80
|
+
extraCatalogs: [createStaticCatalog([myPiperVoice])],
|
|
81
|
+
});
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
#### Guidelines
|
|
85
|
+
|
|
86
|
+
- Use the built-in adapter id (`piper`, `kokoro`, `kitten`, or `supertonic`)
|
|
87
|
+
- Keep `family` stable if the model should appear in the same grouped family in the simple API
|
|
88
|
+
- Use `voiceMode: "per-voice-model"` when each downloadable bundle is a separate voice model
|
|
89
|
+
- Use `voiceMode: "multi"` when one model exposes multiple voices internally
|
|
90
|
+
- For `per-voice-model` families such as Piper, switch variants by model id rather than assuming one shared multi-voice model
|
|
91
|
+
|
|
92
|
+
### Add a new runtime family
|
|
93
|
+
|
|
94
|
+
If the model needs a different inference path, create a new adapter and register it alongside the built-in ones.
|
|
95
|
+
|
|
96
|
+
#### Model instance
|
|
97
|
+
|
|
98
|
+
Every adapter creates model instances. There are two kinds:
|
|
99
|
+
|
|
100
|
+
- **`SynthesizingModelInstance`** (`kind: "synthesizing"`) — returns audio data via `generate()`. Optionally supports `stream()` for incremental chunks. This is the most common type.
|
|
101
|
+
- **`SpeakingModelInstance`** (`kind: "speaking"`) — plays audio directly via `speak()` (e.g. the Web Speech API). No audio data is returned.
|
|
102
|
+
|
|
103
|
+
```ts
|
|
104
|
+
import { type ModelSpec, type SynthesizingModelInstance } from "@polytts/core";
|
|
105
|
+
|
|
106
|
+
class MyModel implements SynthesizingModelInstance {
|
|
107
|
+
readonly kind = "synthesizing" as const;
|
|
108
|
+
readonly modelId: string;
|
|
109
|
+
readonly adapterId: string;
|
|
110
|
+
|
|
111
|
+
constructor(private readonly spec: ModelSpec) {
|
|
112
|
+
this.modelId = spec.id;
|
|
113
|
+
this.adapterId = spec.adapterId;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
async load(_signal: AbortSignal): Promise<void> {
|
|
117
|
+
// Initialize your runtime here (load ONNX, WASM, etc.).
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
async generate(
|
|
121
|
+
text: string,
|
|
122
|
+
_voiceId: string,
|
|
123
|
+
_signal: AbortSignal,
|
|
124
|
+
): Promise<{ sampleRate: number; channels: Float32Array[] }> {
|
|
125
|
+
// Return PCM audio data for the synthesized speech.
|
|
126
|
+
throw new Error(`Not implemented for: ${text}`);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
listVoices() {
|
|
130
|
+
return this.spec.voices ?? [];
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
dispose(): void {}
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
#### Adapter
|
|
138
|
+
|
|
139
|
+
The adapter tells the runtime how to create model instances and whether it can run on the current platform.
|
|
140
|
+
|
|
141
|
+
```ts
|
|
142
|
+
import {
|
|
143
|
+
createStaticCatalog,
|
|
144
|
+
createTTSRuntime,
|
|
145
|
+
type TTSAdapter,
|
|
146
|
+
type SynthesizingModelInstance,
|
|
147
|
+
} from "@polytts/core";
|
|
148
|
+
import { officialAdapters, officialCatalog } from "@polytts/browser";
|
|
149
|
+
|
|
150
|
+
const myAdapter: TTSAdapter<SynthesizingModelInstance> = {
|
|
151
|
+
id: "my-runtime",
|
|
152
|
+
name: "My Runtime",
|
|
153
|
+
isSupported: (_spec) => typeof Worker !== "undefined",
|
|
154
|
+
createModel(spec) {
|
|
155
|
+
return new MyModel(spec);
|
|
156
|
+
},
|
|
157
|
+
};
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
#### Catalog
|
|
161
|
+
|
|
162
|
+
Register model metadata via a catalog. Use `distribution` to describe how assets are delivered:
|
|
163
|
+
|
|
164
|
+
| `distribution.kind` | Meaning |
|
|
165
|
+
| ------------------- | --------------------------------------------------- |
|
|
166
|
+
| `"managed-assets"` | Runtime downloads and caches `assets` automatically |
|
|
167
|
+
| `"adapter-managed"` | Adapter owns download logic via custom `install()` |
|
|
168
|
+
| `"none"` (or omit) | No downloadable assets needed |
|
|
169
|
+
|
|
170
|
+
```ts
|
|
171
|
+
const myCatalog = createStaticCatalog([
|
|
172
|
+
{
|
|
173
|
+
id: "my-runtime-v1",
|
|
174
|
+
adapterId: "my-runtime",
|
|
175
|
+
name: "My Runtime V1",
|
|
176
|
+
family: "my-runtime",
|
|
177
|
+
revision: "v1",
|
|
178
|
+
license: "custom",
|
|
179
|
+
languages: ["en-US"],
|
|
180
|
+
voiceMode: "single",
|
|
181
|
+
distribution: { kind: "none" },
|
|
182
|
+
},
|
|
183
|
+
]);
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
#### Wire it up
|
|
187
|
+
|
|
188
|
+
```ts
|
|
189
|
+
const runtime = createTTSRuntime({
|
|
190
|
+
adapters: [...officialAdapters, myAdapter],
|
|
191
|
+
catalogs: [officialCatalog, myCatalog],
|
|
192
|
+
});
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### ModelSpec reference
|
|
196
|
+
|
|
197
|
+
Required fields for every `ModelSpec`:
|
|
198
|
+
|
|
199
|
+
| Field | Type | Description |
|
|
200
|
+
| ----------- | ------------------------------------------ | ----------------------------------------- |
|
|
201
|
+
| `id` | `string` | Unique model identifier |
|
|
202
|
+
| `adapterId` | `string` | Which adapter runs this model |
|
|
203
|
+
| `name` | `string` | Human-readable display name |
|
|
204
|
+
| `family` | `string` | Grouping key (e.g. `"piper"`, `"kokoro"`) |
|
|
205
|
+
| `revision` | `string` | Version string for cache invalidation |
|
|
206
|
+
| `license` | `string` | SPDX license identifier or descriptive license string |
|
|
207
|
+
| `languages` | `string[]` | BCP-47 language codes |
|
|
208
|
+
| `voiceMode` | `"single" \| "multi" \| "per-voice-model"` | How the model handles voices |
|
|
209
|
+
|
|
210
|
+
Optional fields:
|
|
211
|
+
|
|
212
|
+
| Field | Type | Description |
|
|
213
|
+
| ---------------- | ------------------------- | ------------------------------------------ |
|
|
214
|
+
| `distribution` | `ModelDistribution` | Asset delivery strategy (see above) |
|
|
215
|
+
| `voices` | `Voice[]` | Pre-declared voice list |
|
|
216
|
+
| `defaultVoiceId` | `string` | Default voice to select |
|
|
217
|
+
| `requirements` | `ModelRequirements` | Runtime needs (`wasm`, `webgpu`, `worker`) |
|
|
218
|
+
| `config` | `Record<string, unknown>` | Adapter-specific configuration |
|
|
219
|
+
| `description` | `string` | Model description |
|
|
220
|
+
| `homepage` | `string` | Project URL |
|
|
221
|
+
| `tags` | `string[]` | Searchable tags |
|
|
222
|
+
|
|
223
|
+
## Exports
|
|
224
|
+
|
|
225
|
+
Key exports from `@polytts/core`:
|
|
226
|
+
|
|
227
|
+
- `createTTSRuntime` — create a low-level runtime
|
|
228
|
+
- `createStaticCatalog` — create a catalog from a model array
|
|
229
|
+
- `MemoryAssetStore` — in-memory asset store (useful for testing)
|
|
230
|
+
- Types: `TTSRuntime`, `TTSAdapter`, `TTSModelInstance`, `SynthesizingModelInstance`, `SpeakingModelInstance`, `ModelSpec`, `ModelCatalog`, `AssetStore`, `AudioData`, `Voice`
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
//#region src/types.d.ts
|
|
2
|
+
/** Unique identifier for a TTS model. */
|
|
3
|
+
type ModelId = string;
|
|
4
|
+
/** Unique identifier for a TTS adapter (engine backend). */
|
|
5
|
+
type AdapterId = string;
|
|
6
|
+
/** Unique identifier for a voice within a model. */
|
|
7
|
+
type VoiceId = string;
|
|
8
|
+
/** Default speaking speed multiplier (1x). */
|
|
9
|
+
declare const DEFAULT_SPEAK_SPEED = 1;
|
|
10
|
+
/** Minimum allowed speaking speed multiplier (0.5x). */
|
|
11
|
+
declare const MIN_SPEAK_SPEED = 0.5;
|
|
12
|
+
/** Maximum allowed speaking speed multiplier (2x). */
|
|
13
|
+
declare const MAX_SPEAK_SPEED = 2;
|
|
14
|
+
/**
|
|
15
|
+
* Clamps a speed value to the valid range, returning the default if the input is not a finite
|
|
16
|
+
* number.
|
|
17
|
+
*/
|
|
18
|
+
declare function normalizeSpeakSpeed(speed?: number): number;
|
|
19
|
+
/** A voice available for speech synthesis, associated with a language and optional gender. */
|
|
20
|
+
interface Voice {
|
|
21
|
+
id: VoiceId;
|
|
22
|
+
name: string;
|
|
23
|
+
language: string;
|
|
24
|
+
gender?: "male" | "female" | "neutral";
|
|
25
|
+
/** True if this voice was created via voice cloning rather than bundled with the model. */
|
|
26
|
+
isCloned?: boolean;
|
|
27
|
+
}
|
|
28
|
+
/** Raw audio output from synthesis, represented as per-channel float sample arrays. */
|
|
29
|
+
interface AudioData {
|
|
30
|
+
sampleRate: number;
|
|
31
|
+
channels: Float32Array[];
|
|
32
|
+
}
|
|
33
|
+
/** Alias for {@link AudioData}; use when accepting audio data from external sources. */
|
|
34
|
+
type AudioDataLike = AudioData;
|
|
35
|
+
/** A downloadable file (weights, config, etc.) required by a model. */
|
|
36
|
+
interface ModelAsset {
|
|
37
|
+
name: string;
|
|
38
|
+
url: string;
|
|
39
|
+
size: number;
|
|
40
|
+
sha256?: string;
|
|
41
|
+
}
|
|
42
|
+
/** Runtime capabilities a model needs from the browser environment. */
|
|
43
|
+
interface ModelRequirements {
|
|
44
|
+
wasm?: boolean;
|
|
45
|
+
webgpu?: boolean;
|
|
46
|
+
worker?: boolean;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* How a model handles voices: one fixed voice, multiple built-in voices, or a separate model per
|
|
50
|
+
* voice.
|
|
51
|
+
*/
|
|
52
|
+
type ModelVoiceMode = "single" | "multi" | "per-voice-model";
|
|
53
|
+
/**
|
|
54
|
+
* How model assets are distributed: no assets needed, managed by the core asset store, or managed
|
|
55
|
+
* externally by the adapter.
|
|
56
|
+
*/
|
|
57
|
+
type ModelDistributionKind = "none" | "managed-assets" | "adapter-managed";
|
|
58
|
+
/** Describes how a model's assets are packaged and delivered. */
|
|
59
|
+
interface ModelDistribution {
|
|
60
|
+
kind: ModelDistributionKind;
|
|
61
|
+
/** Total download size of all assets in bytes. */
|
|
62
|
+
sizeBytes?: number;
|
|
63
|
+
assets?: ModelAsset[];
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Full specification of a TTS model, including its identity, voices, licensing, distribution info,
|
|
67
|
+
* and runtime requirements.
|
|
68
|
+
*/
|
|
69
|
+
interface ModelSpec {
|
|
70
|
+
id: ModelId;
|
|
71
|
+
/** Identifier of the adapter (engine backend) that runs this model. */
|
|
72
|
+
adapterId: AdapterId;
|
|
73
|
+
name: string;
|
|
74
|
+
/** Model family grouping (e.g., "piper", "kokoro"). */
|
|
75
|
+
family: string;
|
|
76
|
+
/** Version string used to detect asset updates and cache invalidation. */
|
|
77
|
+
revision: string;
|
|
78
|
+
/** SPDX license identifier or descriptive license string. */
|
|
79
|
+
license: string;
|
|
80
|
+
/** BCP-47 language codes this model supports. */
|
|
81
|
+
languages: string[];
|
|
82
|
+
/** Whether the model uses a single fixed voice, multiple built-in voices, or one model per voice. */
|
|
83
|
+
voiceMode: ModelVoiceMode;
|
|
84
|
+
voices?: Voice[];
|
|
85
|
+
defaultVoiceId?: VoiceId;
|
|
86
|
+
description?: string;
|
|
87
|
+
/** URL to a remote manifest for dynamic model metadata updates. */
|
|
88
|
+
manifestUrl?: string;
|
|
89
|
+
homepage?: string;
|
|
90
|
+
/** Browser capabilities (WASM, WebGPU, Worker) the model needs at runtime. */
|
|
91
|
+
requirements?: ModelRequirements;
|
|
92
|
+
tags?: string[];
|
|
93
|
+
/** Adapter-specific configuration passed through to the model instance. */
|
|
94
|
+
config?: Record<string, unknown>;
|
|
95
|
+
distribution?: ModelDistribution;
|
|
96
|
+
}
|
|
97
|
+
/** Resolves a model's distribution info, defaulting to `{ kind: "none" }` when unset. */
|
|
98
|
+
declare function resolveModelDistribution(model: Pick<ModelSpec, "distribution">): ModelDistribution;
|
|
99
|
+
/** Returns the list of downloadable assets for a model. */
|
|
100
|
+
declare function getModelAssets(model: Pick<ModelSpec, "distribution">): ModelAsset[];
|
|
101
|
+
/** Returns the total download size in bytes for a model. */
|
|
102
|
+
declare function getModelSizeBytes(model: Pick<ModelSpec, "distribution">): number;
|
|
103
|
+
/** A collection of model specifications, optionally identified by a catalog id. */
|
|
104
|
+
interface ModelCatalog {
|
|
105
|
+
id?: string;
|
|
106
|
+
models: ModelSpec[];
|
|
107
|
+
}
|
|
108
|
+
/** A model catalog provided either directly or via a factory function. */
|
|
109
|
+
type CatalogSource = ModelCatalog | (() => ModelCatalog);
|
|
110
|
+
/** Current status of a model's installation lifecycle. */
|
|
111
|
+
type InstallStatus = "not-applicable" | "idle" | "installing" | "installed" | "error" | "unknown" | "ready";
|
|
112
|
+
/** High-level phase of the TTS runtime lifecycle. */
|
|
113
|
+
type RuntimePhase = "idle" | "installing" | "loading" | "speaking" | "error";
|
|
114
|
+
/** Tracks the installation state of a specific model, including progress and error info. */
|
|
115
|
+
interface InstallState {
|
|
116
|
+
modelId: ModelId;
|
|
117
|
+
adapterId: AdapterId;
|
|
118
|
+
revision: string;
|
|
119
|
+
/** Whether assets are managed by the core store, externally by the adapter, or not needed. */
|
|
120
|
+
kind: "not-applicable" | "managed" | "external";
|
|
121
|
+
/** True once all required assets are available locally. */
|
|
122
|
+
installed: boolean;
|
|
123
|
+
status: InstallStatus;
|
|
124
|
+
/** Download progress from 0 to 1, or null when not actively downloading. */
|
|
125
|
+
progress: number | null;
|
|
126
|
+
/** Human-readable error message if the install failed. */
|
|
127
|
+
error: string | null;
|
|
128
|
+
}
|
|
129
|
+
/** Diagnostic information about a loaded model's runtime environment (backend, worker usage, etc.). */
|
|
130
|
+
interface ModelRuntimeInfo {
|
|
131
|
+
/** Execution backend in use (e.g., "webgpu", "wasm"). */
|
|
132
|
+
backend: string | null;
|
|
133
|
+
/** Runtime environment label (e.g., "onnx", "sherpa"). */
|
|
134
|
+
runtime: string | null;
|
|
135
|
+
/** Whether the model is running inside a Web Worker. */
|
|
136
|
+
worker: boolean | null;
|
|
137
|
+
/** Free-form diagnostic string for debugging. */
|
|
138
|
+
detail?: string | null;
|
|
139
|
+
}
|
|
140
|
+
/**
|
|
141
|
+
* Returns the install state for a model, inferring a default from its distribution if none is
|
|
142
|
+
* provided.
|
|
143
|
+
*/
|
|
144
|
+
declare function resolveInstallState(model: Pick<ModelSpec, "id" | "adapterId" | "revision" | "distribution">, installState?: InstallState): InstallState;
|
|
145
|
+
/** Returns true if the model is ready to use (either installed or installation is not applicable). */
|
|
146
|
+
declare function isInstallStateAvailable(installState: InstallState | null | undefined): boolean;
|
|
147
|
+
/** Type guard that checks whether the install state is managed by the core asset store. */
|
|
148
|
+
declare function isManagedInstallState(installState: InstallState | null | undefined): installState is InstallState & {
|
|
149
|
+
kind: "managed";
|
|
150
|
+
};
|
|
151
|
+
/** Type guard that checks whether the install state is managed externally by the adapter. */
|
|
152
|
+
declare function isExternalInstallState(installState: InstallState | null | undefined): installState is InstallState & {
|
|
153
|
+
kind: "external";
|
|
154
|
+
};
|
|
155
|
+
/** Options for a speak or synthesize call, including model/voice selection and speed. */
|
|
156
|
+
interface SpeakOptions {
|
|
157
|
+
modelId?: ModelId;
|
|
158
|
+
voiceId?: VoiceId;
|
|
159
|
+
/** Playback speed multiplier (clamped to 0.5–2x). */
|
|
160
|
+
speed?: number;
|
|
161
|
+
/** Models to try in order if the primary model fails or is unavailable. */
|
|
162
|
+
fallbackModelIds?: ModelId[];
|
|
163
|
+
}
|
|
164
|
+
/** Options for preparing (loading) a model before speaking. */
|
|
165
|
+
interface PrepareOptions {
|
|
166
|
+
voiceId?: VoiceId;
|
|
167
|
+
onProgress?: (progress: number) => void;
|
|
168
|
+
}
|
|
169
|
+
/** Observable snapshot of the entire TTS runtime state, used by UI bindings and subscribers. */
|
|
170
|
+
interface RuntimeState {
|
|
171
|
+
/** All registered model specs from loaded catalogs. */
|
|
172
|
+
models: ModelSpec[];
|
|
173
|
+
/** Model IDs that pass the adapter's `isSupported` check on this platform. */
|
|
174
|
+
supportedModelIds: ModelId[];
|
|
175
|
+
/** Currently selected model for speech synthesis. */
|
|
176
|
+
activeModelId: ModelId | null;
|
|
177
|
+
/** Currently selected voice within the active model. */
|
|
178
|
+
activeVoiceId: VoiceId | null;
|
|
179
|
+
/** Voices available for the active model. */
|
|
180
|
+
voices: Voice[];
|
|
181
|
+
/** True while a model is being loaded or prepared. */
|
|
182
|
+
isPreparing: boolean;
|
|
183
|
+
isSpeaking: boolean;
|
|
184
|
+
/** High-level lifecycle phase (idle, installing, loading, speaking, error). */
|
|
185
|
+
phase: RuntimePhase;
|
|
186
|
+
/** Model that the current phase activity relates to. */
|
|
187
|
+
phaseModelId: ModelId | null;
|
|
188
|
+
/** Progress (0–1) of the current phase activity, or null when indeterminate. */
|
|
189
|
+
phaseProgress: number | null;
|
|
190
|
+
error: string | null;
|
|
191
|
+
/** Per-model installation states keyed by model ID. */
|
|
192
|
+
installStates: Record<ModelId, InstallState>;
|
|
193
|
+
/** True once install states have been loaded from the asset store on startup. */
|
|
194
|
+
installStateHydrated: boolean;
|
|
195
|
+
/** Per-model runtime diagnostics (backend, worker status), keyed by model ID. */
|
|
196
|
+
runtimeInfoByModel: Record<ModelId, ModelRuntimeInfo | null>;
|
|
197
|
+
}
|
|
198
|
+
/** Composite key that uniquely identifies a versioned bundle of model assets in the asset store. */
|
|
199
|
+
interface AssetBundleKey {
|
|
200
|
+
adapterId: AdapterId;
|
|
201
|
+
modelId: ModelId;
|
|
202
|
+
revision: string;
|
|
203
|
+
}
|
|
204
|
+
/** Persistent storage backend for model asset bundles (e.g., IndexedDB, filesystem). */
|
|
205
|
+
interface AssetStore {
|
|
206
|
+
/** Write a single asset to temporary staging before the bundle is activated. */
|
|
207
|
+
stageAsset(bundle: AssetBundleKey, assetName: string, data: ArrayBuffer): Promise<void>;
|
|
208
|
+
/** Promote staged assets into the active bundle, making them available via `getAsset`. */
|
|
209
|
+
activateBundle(bundle: AssetBundleKey, assetNames: string[]): Promise<void>;
|
|
210
|
+
/** Check whether all required assets for a bundle are present in the store. */
|
|
211
|
+
isInstalled(bundle: AssetBundleKey, requiredAssetNames?: string[]): Promise<boolean>;
|
|
212
|
+
/** Retrieve a single asset's data from an activated bundle. */
|
|
213
|
+
getAsset(bundle: AssetBundleKey, assetName: string): Promise<ArrayBuffer | null>;
|
|
214
|
+
/** Delete an entire bundle and all its assets from the store. */
|
|
215
|
+
removeBundle(bundle: AssetBundleKey): Promise<void>;
|
|
216
|
+
}
|
|
217
|
+
/** Services provided to adapters by the runtime (asset storage, fetch, abort helpers). */
|
|
218
|
+
interface TTSAdapterContext {
|
|
219
|
+
/** Persistent storage for downloading and caching model assets. */
|
|
220
|
+
assetStore: AssetStore;
|
|
221
|
+
/** Fetch implementation provided by the runtime (allows custom proxying or caching). */
|
|
222
|
+
fetch: typeof fetch;
|
|
223
|
+
/** Create a platform-appropriate `AbortError` for cancellation. */
|
|
224
|
+
createAbortError(): Error;
|
|
225
|
+
}
|
|
226
|
+
/**
|
|
227
|
+
* Declares which operations an adapter supports (install, speak, synthesize, stream, dynamic
|
|
228
|
+
* voices).
|
|
229
|
+
*/
|
|
230
|
+
interface TTSAdapterCapabilities {
|
|
231
|
+
/** Whether the adapter manages its own asset installation. */
|
|
232
|
+
install: boolean;
|
|
233
|
+
/** Whether the adapter can play audio directly (without returning AudioData). */
|
|
234
|
+
speak: boolean;
|
|
235
|
+
/** Whether the adapter can return a complete AudioData buffer. */
|
|
236
|
+
synthesize: boolean;
|
|
237
|
+
/** Whether the adapter supports streaming AudioData chunks incrementally. */
|
|
238
|
+
stream: boolean;
|
|
239
|
+
/** Whether the voice list can change at runtime (e.g., cloned voices). */
|
|
240
|
+
dynamicVoices: boolean;
|
|
241
|
+
}
|
|
242
|
+
/** Platform audio output interface used by the runtime to play synthesized speech. */
|
|
243
|
+
interface RuntimeAudioPlayer {
|
|
244
|
+
/** Pre-initialize audio context to avoid first-play latency. */
|
|
245
|
+
warmup?(): void;
|
|
246
|
+
play(audio: AudioData, signal: AbortSignal, speed?: number): Promise<void>;
|
|
247
|
+
/** Play audio chunks as they arrive from a streaming source. */
|
|
248
|
+
playStream?(audio: AsyncIterable<AudioData>, signal: AbortSignal, speed?: number): Promise<void>;
|
|
249
|
+
stop(): void;
|
|
250
|
+
dispose(): void;
|
|
251
|
+
}
|
|
252
|
+
/** Discriminant for model instance types. */
|
|
253
|
+
type ModelInstanceKind = "speaking" | "synthesizing";
|
|
254
|
+
/**
|
|
255
|
+
* A loaded, ready-to-use model instance created by an adapter. The base interface provides
|
|
256
|
+
* lifecycle methods common to all models. Synthesis capabilities are declared via
|
|
257
|
+
* {@link SpeakingModelInstance} (direct audio playback) or {@link SynthesizingModelInstance}
|
|
258
|
+
* (generates audio data).
|
|
259
|
+
*/
|
|
260
|
+
interface TTSModelInstance {
|
|
261
|
+
/** Discriminant that identifies the model's capability type. */
|
|
262
|
+
readonly kind: ModelInstanceKind;
|
|
263
|
+
readonly modelId: ModelId;
|
|
264
|
+
readonly adapterId: AdapterId;
|
|
265
|
+
/** Load model weights and initialize the backend; reports progress via callback. */
|
|
266
|
+
load(signal: AbortSignal, onProgress?: (progress: number) => void): Promise<void>;
|
|
267
|
+
listVoices(): Voice[] | Promise<Voice[]>;
|
|
268
|
+
/** Cancel any in-flight generation without disposing the instance. */
|
|
269
|
+
abortActiveGeneration?(): void;
|
|
270
|
+
/** Return diagnostic info about the active backend and runtime environment. */
|
|
271
|
+
getRuntimeInfo?(): ModelRuntimeInfo | null;
|
|
272
|
+
dispose(): void;
|
|
273
|
+
}
|
|
274
|
+
/** A model that plays audio directly through the adapter's own audio output (e.g. Web Speech API). */
|
|
275
|
+
interface SpeakingModelInstance extends TTSModelInstance {
|
|
276
|
+
readonly kind: "speaking";
|
|
277
|
+
speak(text: string, voiceId: VoiceId, signal: AbortSignal, speed?: number): Promise<void>;
|
|
278
|
+
}
|
|
279
|
+
/** A model that synthesizes audio data, optionally with streaming support. */
|
|
280
|
+
interface SynthesizingModelInstance extends TTSModelInstance {
|
|
281
|
+
readonly kind: "synthesizing";
|
|
282
|
+
generate(text: string, voiceId: VoiceId, signal: AbortSignal, speed?: number): Promise<AudioData>;
|
|
283
|
+
/** Yield AudioData chunks as they are synthesized for low-latency playback. */
|
|
284
|
+
stream?(text: string, voiceId: VoiceId, signal: AbortSignal, speed?: number): AsyncIterable<AudioData>;
|
|
285
|
+
}
|
|
286
|
+
/**
|
|
287
|
+
* Backend engine that can create model instances and optionally manage installation. Each adapter
|
|
288
|
+
* corresponds to a specific TTS engine (e.g., Piper, Kokoro, Web Speech API).
|
|
289
|
+
*/
|
|
290
|
+
interface TTSAdapter<T extends TTSModelInstance = TTSModelInstance> {
|
|
291
|
+
readonly id: AdapterId;
|
|
292
|
+
readonly name: string;
|
|
293
|
+
/** Declares which operations this adapter supports; unset fields use defaults. */
|
|
294
|
+
readonly capabilities?: Partial<TTSAdapterCapabilities>;
|
|
295
|
+
/** Return false if the model cannot run on the current platform/browser. */
|
|
296
|
+
isSupported?(spec: ModelSpec): boolean;
|
|
297
|
+
/** Instantiate a model from its spec; may be async if initialization requires I/O. */
|
|
298
|
+
createModel(spec: ModelSpec, context: TTSAdapterContext): Promise<T> | T;
|
|
299
|
+
/** Download and store model assets; only called when `capabilities.install` is true. */
|
|
300
|
+
install?(spec: ModelSpec, context: TTSAdapterContext, signal: AbortSignal, onProgress?: (progress: number) => void): Promise<void>;
|
|
301
|
+
/** Remove previously installed model assets. */
|
|
302
|
+
uninstall?(spec: ModelSpec, context: TTSAdapterContext): Promise<void>;
|
|
303
|
+
}
|
|
304
|
+
/** Configuration for creating a {@link TTSRuntime} instance. */
|
|
305
|
+
interface TTSRuntimeOptions {
|
|
306
|
+
/** Engine backends to register (at least one required). */
|
|
307
|
+
adapters: TTSAdapter[];
|
|
308
|
+
/** Model catalogs to load; can be static objects or factory functions. */
|
|
309
|
+
catalogs: CatalogSource[];
|
|
310
|
+
/** Persistent storage for model assets; defaults to an in-memory store if omitted. */
|
|
311
|
+
assetStore?: AssetStore;
|
|
312
|
+
/** Platform audio player; pass null to disable direct playback. */
|
|
313
|
+
audioPlayer?: RuntimeAudioPlayer | null;
|
|
314
|
+
/** Custom fetch implementation for asset downloads. */
|
|
315
|
+
fetch?: typeof fetch;
|
|
316
|
+
/** Model to select on startup before any user interaction. */
|
|
317
|
+
initialModelId?: ModelId;
|
|
318
|
+
/** Voice to select on startup before any user interaction. */
|
|
319
|
+
initialVoiceId?: VoiceId;
|
|
320
|
+
}
|
|
321
|
+
/**
|
|
322
|
+
* The main public API for polytts. Manages models, voices, installation, and speech synthesis.
|
|
323
|
+
* Subscribe to state changes for reactive UI updates.
|
|
324
|
+
*/
|
|
325
|
+
interface TTSRuntime {
|
|
326
|
+
getState(): RuntimeState;
|
|
327
|
+
subscribe(listener: (state: RuntimeState) => void): () => void;
|
|
328
|
+
listModels(): ModelSpec[];
|
|
329
|
+
getModel(modelId: ModelId): ModelSpec | null;
|
|
330
|
+
listVoices(modelId?: ModelId): Promise<Voice[]>;
|
|
331
|
+
install(modelId: ModelId, onProgress?: (progress: number) => void): Promise<void>;
|
|
332
|
+
uninstall(modelId: ModelId): Promise<void>;
|
|
333
|
+
prepare(modelId: ModelId, options?: PrepareOptions): Promise<void>;
|
|
334
|
+
speak(text: string, options?: SpeakOptions): Promise<void>;
|
|
335
|
+
synthesizeStream(text: string, options?: SpeakOptions): AsyncIterable<AudioData>;
|
|
336
|
+
synthesize(text: string, options?: SpeakOptions): Promise<AudioData>;
|
|
337
|
+
stop(): void;
|
|
338
|
+
dispose(): void;
|
|
339
|
+
}
|
|
340
|
+
/** Resolves an adapter's full capabilities, using sensible defaults for any unspecified fields. */
|
|
341
|
+
declare function resolveAdapterCapabilities(adapter: Pick<TTSAdapter, "capabilities" | "install">): TTSAdapterCapabilities;
|
|
342
|
+
//#endregion
|
|
343
|
+
//#region src/audio.d.ts
|
|
344
|
+
/** Creates an AudioData object from a sample rate and an array of channel buffers. */
|
|
345
|
+
declare function createAudioData(sampleRate: number, channels: Float32Array[]): AudioData;
|
|
346
|
+
/** Wraps a single-channel PCM Float32Array into an AudioData object. */
|
|
347
|
+
declare function pcmToAudioData(data: Float32Array, sampleRate: number): AudioData;
|
|
348
|
+
/** Creates a silent AudioData object, useful as a placeholder or for padding. */
|
|
349
|
+
declare function createSilentAudioData(sampleRate?: number, frameCount?: number): AudioData;
|
|
350
|
+
//#endregion
|
|
351
|
+
//#region src/catalog.d.ts
|
|
352
|
+
/** Creates a model catalog from a static array of model specs. */
|
|
353
|
+
declare function createStaticCatalog(models: ModelSpec[]): ModelCatalog;
|
|
354
|
+
/** Resolves a catalog source (static object or factory function) into a ModelCatalog. */
|
|
355
|
+
declare function resolveCatalogSource(source: CatalogSource): ModelCatalog;
|
|
356
|
+
/** Merges multiple catalog sources into a single catalog, throwing on duplicate model IDs. */
|
|
357
|
+
declare function mergeCatalogs(...sources: CatalogSource[]): ModelCatalog;
|
|
358
|
+
/** Validates that every model in the catalog references a registered adapter. */
|
|
359
|
+
declare function validateCatalog(catalog: ModelCatalog, adapters: Map<string, TTSAdapter>): void;
|
|
360
|
+
//#endregion
|
|
361
|
+
//#region src/runtime.d.ts
|
|
362
|
+
/** Creates a new TTS runtime instance, the main entry point for text-to-speech functionality. */
|
|
363
|
+
declare function createTTSRuntime(options: TTSRuntimeOptions): TTSRuntime;
|
|
364
|
+
//#endregion
|
|
365
|
+
//#region src/storage/memory-asset-store.d.ts
|
|
366
|
+
/**
|
|
367
|
+
* In-memory implementation of AssetStore, primarily useful for testing and environments without
|
|
368
|
+
* persistent storage.
|
|
369
|
+
*/
|
|
370
|
+
declare class MemoryAssetStore implements AssetStore {
|
|
371
|
+
private readonly staging;
|
|
372
|
+
private readonly active;
|
|
373
|
+
private readonly meta;
|
|
374
|
+
stageAsset(bundle: AssetBundleKey, assetName: string, data: ArrayBuffer): Promise<void>;
|
|
375
|
+
activateBundle(bundle: AssetBundleKey, assetNames: string[]): Promise<void>;
|
|
376
|
+
isInstalled(bundle: AssetBundleKey, requiredAssetNames?: string[]): Promise<boolean>;
|
|
377
|
+
getAsset(bundle: AssetBundleKey, assetName: string): Promise<ArrayBuffer | null>;
|
|
378
|
+
removeBundle(bundle: AssetBundleKey): Promise<void>;
|
|
379
|
+
}
|
|
380
|
+
//#endregion
|
|
381
|
+
export { AdapterId, AssetBundleKey, AssetStore, AudioData, AudioDataLike, CatalogSource, DEFAULT_SPEAK_SPEED, InstallState, InstallStatus, MAX_SPEAK_SPEED, MIN_SPEAK_SPEED, MemoryAssetStore, ModelAsset, ModelCatalog, ModelDistribution, ModelDistributionKind, ModelId, ModelInstanceKind, ModelRequirements, ModelRuntimeInfo, ModelSpec, ModelVoiceMode, PrepareOptions, RuntimeAudioPlayer, RuntimePhase, RuntimeState, SpeakOptions, SpeakingModelInstance, SynthesizingModelInstance, TTSAdapter, TTSAdapterCapabilities, TTSAdapterContext, TTSModelInstance, TTSRuntime, TTSRuntimeOptions, Voice, VoiceId, createAudioData, createSilentAudioData, createStaticCatalog, createTTSRuntime, getModelAssets, getModelSizeBytes, isExternalInstallState, isInstallStateAvailable, isManagedInstallState, mergeCatalogs, normalizeSpeakSpeed, pcmToAudioData, resolveAdapterCapabilities, resolveCatalogSource, resolveInstallState, resolveModelDistribution, validateCatalog };
|