sinapsis-speech 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/tags.py +15 -0
  2. sinapsis_elevenlabs/src/sinapsis_elevenlabs/helpers/voice_utils.py +50 -14
  3. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/__init__.py +2 -0
  4. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_base.py +40 -54
  5. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_sts.py +60 -17
  6. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_tts.py +12 -8
  7. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_clone.py +89 -11
  8. sinapsis_elevenlabs/src/sinapsis_elevenlabs/templates/elevenlabs_voice_generation.py +7 -1
  9. sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/__init__.py +0 -0
  10. sinapsis_f5_tts/src/sinapsis_f5_tts/helpers/tags.py +10 -0
  11. sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py +13 -1
  12. sinapsis_kokoro/src/sinapsis_kokoro/__init__.py +0 -0
  13. sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py +10 -0
  14. sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py +14 -3
  15. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/__init__.py +0 -0
  16. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/__init__.py +0 -0
  17. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py +10 -0
  18. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py +20 -0
  19. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py +312 -0
  20. sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py +69 -0
  21. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/__init__.py +0 -0
  22. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/__init__.py +0 -0
  23. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py +11 -0
  24. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py +20 -0
  25. sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/parakeet_tdt.py +289 -0
  26. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/METADATA +68 -5
  27. sinapsis_speech-0.4.1.dist-info/RECORD +44 -0
  28. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/WHEEL +1 -1
  29. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/top_level.txt +2 -0
  30. sinapsis_zonos/src/sinapsis_zonos/helpers/tags.py +11 -0
  31. sinapsis_zonos/src/sinapsis_zonos/helpers/zonos_tts_utils.py +1 -1
  32. sinapsis_zonos/src/sinapsis_zonos/templates/zonos_tts.py +13 -13
  33. sinapsis_speech-0.3.5.dist-info/RECORD +0 -27
  34. {sinapsis_speech-0.3.5.dist-info → sinapsis_speech-0.4.1.dist-info}/licenses/LICENSE +0 -0
sinapsis_f5_tts/src/sinapsis_f5_tts/templates/f5_tts_inference.py
@@ -6,6 +6,7 @@ from typing import Any, Literal
 
 import numpy as np
 import soundfile as sf
+import torch
 from pydantic import Field
 from pydantic.dataclasses import dataclass
 from sinapsis_core.data_containers.data_packet import (
@@ -15,6 +16,8 @@ from sinapsis_core.data_containers.data_packet import (
 from sinapsis_core.template_base import Template
 from sinapsis_core.template_base.base_models import OutputTypes, TemplateAttributes, UIPropertiesMetadata
 
+from sinapsis_f5_tts.helpers.tags import Tags
+
 
 @dataclass
 class F5CliKeys:
@@ -146,7 +149,11 @@ class F5TTSInference(Template):
     """
 
     AttributesBaseModel = F5TTSInferenceAttributes
-    UIProperties = UIPropertiesMetadata(category="F5TTS", output_type=OutputTypes.AUDIO)
+    UIProperties = UIPropertiesMetadata(
+        category="F5TTS",
+        output_type=OutputTypes.AUDIO,
+        tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.F5TTS, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
+    )
 
     def _add_attribute_to_command(self, cli_command: list[str], field_name: str, field: Any) -> None:
         """
@@ -357,3 +364,8 @@ class F5TTSInference(Template):
         )
 
         return container
+
+    def reset_state(self, template_name: str | None = None) -> None:
+        if "cuda" in self.attributes.device:
+            torch.cuda.empty_cache()
+        super().reset_state(template_name)
sinapsis_kokoro/src/sinapsis_kokoro/__init__.py (file without changes)
sinapsis_kokoro/src/sinapsis_kokoro/helpers/tags.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class Tags(Enum):
+    AUDIO = "audio"
+    AUDIO_GENERATION = "audio_generation"
+    KOKORO = "kokoro"
+    SPEECH = "speech"
+    TEXT_TO_SPEECH = "text_to_speech"
sinapsis_kokoro/src/sinapsis_kokoro/templates/kokoro_tts.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from typing import Generator
+from typing import Generator, Literal
 from urllib.error import HTTPError
 
 import torch
@@ -15,6 +15,7 @@ from sinapsis_core.template_base.template import Template
 from sinapsis_core.utils.logging_utils import make_loguru
 
 from sinapsis_kokoro.helpers.kokoro_utils import KokoroKeys, kokoro_voices
+from sinapsis_kokoro.helpers.tags import Tags
 
 
 class KokoroTTS(Template):
@@ -39,7 +40,11 @@ class KokoroTTS(Template):
           voice: af_heart
     """
 
-    UIProperties = UIPropertiesMetadata(category="Kokoro", output_type=OutputTypes.AUDIO)
+    UIProperties = UIPropertiesMetadata(
+        category="Kokoro",
+        output_type=OutputTypes.AUDIO,
+        tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.KOKORO, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
+    )
 
     class AttributesBaseModel(TemplateAttributes):
         """
@@ -56,6 +61,7 @@ class KokoroTTS(Template):
            https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
         """
 
+        device: Literal["cpu", "cuda"] = "cpu"
         speed: int | float = 1
         split_pattern: str = r"\n+"
         voice: kokoro_voices = KokoroKeys.default_voice
@@ -73,7 +79,7 @@ class KokoroTTS(Template):
         Returns:
            KPipeline: The initialized TTS pipeline for generating speech.
         """
-        return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id)
+        return KPipeline(lang_code=self.attributes.voice[0], repo_id=KokoroKeys.repo_id, device=self.attributes.device)
 
     def _create_audio_packet(
         self,
@@ -151,3 +157,8 @@ class KokoroTTS(Template):
         self.generate_speech(container)
 
         return container
+
+    def reset_state(self, template_name: str | None = None) -> None:
+        if "cuda" in self.attributes.device:
+            torch.cuda.empty_cache()
+        super().reset_state(template_name)
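For context, the new device attribute is forwarded straight to Kokoro's KPipeline. A minimal stand-alone sketch of the same call outside the template (the voice name, repo id, and output filenames are illustrative assumptions, and the 24 kHz rate follows Kokoro's published examples):

    # Sketch only: run Kokoro's KPipeline on an explicit device, as the updated template now does.
    # Values below are illustrative, not taken from this diff.
    import soundfile as sf
    from kokoro import KPipeline

    pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M", device="cpu")  # or "cuda"
    for index, (_graphemes, _phonemes, audio) in enumerate(
        pipeline("Hello from Kokoro.", voice="af_heart", speed=1, split_pattern=r"\n+")
    ):
        sf.write(f"kokoro_chunk_{index}.wav", audio, 24000)  # Kokoro generates 24 kHz audio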
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/helpers/tags.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class Tags(Enum):
+    AUDIO = "audio"
+    AUDIO_GENERATION = "audio_generation"
+    ORPHEUS_CPP = "orpheus_cpp"
+    SPEECH = "speech"
+    TEXT_TO_SPEECH = "text_to_speech"
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/__init__.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+import importlib
+from typing import Callable
+
+_root_lib_path = "sinapsis_orpheus_cpp.templates"
+
+_template_lookup = {
+    "OrpheusTTS": f"{_root_lib_path}.orpheus_tts",
+}
+
+
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+__all__ = list(_template_lookup.keys())
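This registry uses a module-level __getattr__ (PEP 562), so importing the package does not pull in llama_cpp or orpheus_cpp; the heavy module loads only when the template name is first resolved. A minimal sketch of what a consumer triggers:

    # Resolving the name below goes through the module-level __getattr__ above,
    # which imports sinapsis_orpheus_cpp.templates.orpheus_tts on first access.
    from sinapsis_orpheus_cpp.templates import OrpheusTTS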
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/templates/orpheus_tts.py
@@ -0,0 +1,312 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import torch
+from llama_cpp import Llama
+from orpheus_cpp import OrpheusCpp
+from orpheus_cpp.model import TTSOptions
+from pydantic import TypeAdapter
+from sinapsis_core.data_containers.data_packet import (
+    AudioPacket,
+    DataContainer,
+)
+from sinapsis_core.template_base import Template
+from sinapsis_core.template_base.base_models import (
+    OutputTypes,
+    TemplateAttributes,
+    TemplateAttributeType,
+    UIPropertiesMetadata,
+)
+from sinapsis_core.utils.env_var_keys import SINAPSIS_CACHE_DIR
+
+from sinapsis_orpheus_cpp.helpers.tags import Tags
+from sinapsis_orpheus_cpp.thirdparty.helpers import download_model, setup_snac_session
+
+
+class OrpheusTTSAttributes(TemplateAttributes):
+    """Attributes configuration for Orpheus TTS Template.
+
+    This class defines all configurable parameters for the Orpheus TTS model,
+    including model configuration, GPU settings, and audio generation parameters.
+
+    Attributes:
+        n_gpu_layers (int): Number of model layers to offload to GPU.
+            -1 means use all available layers on GPU for maximum performance.
+            0 means use CPU only. Default: -1.
+        n_threads (int): Number of CPU threads to use for model inference.
+            0 means auto-detect optimal thread count. Default: 0.
+        n_ctx (int): Context window size (maximum number of tokens).
+            0 means use the model's maximum trained context size.
+            Larger values require more GPU/RAM memory. Default: 8192.
+        model_id (str): Hugging Face model repository ID.
+            Must be a valid repository containing GGUF model files.
+            Required parameter with no default.
+        model_variant (str | None): Specific GGUF file to download from the repository.
+            If None, will auto-detect based on model_id naming convention.
+            Use this to specify exact quantization (e.g., "model-q4_k_m.gguf").
+            Default: None.
+        cache_dir (str): Directory to store downloaded models and cache files.
+            Default: SINAPSIS_CACHE_DIR environment variable.
+        verbose (bool): Enable verbose logging for model operations.
+            Shows detailed model loading and inference information. Default: False.
+        voice_id (str): Voice identifier for speech synthesis.
+            Must be a valid voice supported by the Orpheus model.
+            Available voices depend on the specific model variant.
+            Required parameter with no default.
+        batch_size (int): Batch size for model inference.
+            Higher values may improve throughput but require more memory.
+            Default: 1.
+        max_tokens (int): Maximum number of tokens to generate for speech.
+            Controls the length of generated audio sequences. Default: 2048.
+        temperature (float): Sampling temperature for token generation.
+            Higher values (>1.0) make output more random, lower values (<1.0)
+            make it more deterministic. Default: 0.8.
+        top_p (float): Nucleus sampling probability threshold.
+            Only tokens with cumulative probability <= top_p are considered.
+            Range: 0.0-1.0. Default: 0.95.
+        top_k (int): Top-k sampling parameter.
+            Only the top k most likely tokens are considered for sampling.
+            Default: 40.
+        min_p (float): Minimum probability threshold for token selection.
+            Tokens with probability below this threshold are filtered out.
+            Range: 0.0-1.0. Default: 0.05.
+        pre_buffer_size (float): Duration in seconds of audio to generate
+            before yielding the first chunk during streaming.
+            Larger values provide smoother audio but higher latency.
+            Default: 1.5.
+    """
+
+    n_gpu_layers: int = -1
+    n_threads: int = 0
+    n_ctx: int = 8192
+    model_id: str
+    model_variant: str | None = None
+    cache_dir: str = SINAPSIS_CACHE_DIR
+    verbose: bool = False
+    voice_id: str
+    batch_size: int = 1
+    max_tokens: int = 2048
+    temperature: float = 0.8
+    top_p: float = 0.95
+    top_k: int = 40
+    min_p: float = 0.05
+    pre_buffer_size: float = 1.5
+
+
+class OrpheusTTS(Template):
+    """Text-to-Speech template using Orpheus model for speech synthesis.
+
+    This template converts text input into high-quality speech audio using
+    the Orpheus neural TTS model. It handles model downloading, initialization,
+    and audio generation with configurable voice parameters.
+
+    Usage example:
+
+    agent:
+      name: my_test_agent
+      templates:
+      - template_name: InputTemplate
+        class_name: InputTemplate
+        attributes: {}
+      - template_name: OrpheusTTS
+        class_name: OrpheusTTS
+        template_input: InputTemplate
+        attributes:
+          n_gpu_layers: -1
+          n_threads: 0
+          n_ctx: 8192
+          model_id: '`replace_me:<class ''str''>`'
+          model_variant: null
+          cache_dir: ~/sinapsis
+          verbose: false
+          voice_id: '`replace_me:<class ''str''>`'
+          batch_size: 1
+          max_tokens: 2048
+          temperature: 0.8
+          top_p: 0.95
+          top_k: 40
+          min_p: 0.05
+          pre_buffer_size: 1.5
+
+    """
+
+    AttributesBaseModel = OrpheusTTSAttributes
+    UIProperties = UIPropertiesMetadata(
+        category="TTS",
+        output_type=OutputTypes.AUDIO,
+        tags=[Tags.AUDIO, Tags.AUDIO_GENERATION, Tags.ORPHEUS_CPP, Tags.SPEECH, Tags.TEXT_TO_SPEECH],
+    )
+
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        super().__init__(attributes)
+        self._engine: OrpheusCpp
+        self._llm_available: bool = False
+        self._initialize_engine()
+
+    def _initialize_engine(self) -> None:
+        """Initialize the OrpheusCpp engine with downloaded model.
+
+        Creates a new OrpheusCpp instance without calling its constructor
+        to avoid default parameter conflicts, then manually configures
+        both the LLM and SNAC session components.
+
+        Raises:
+            ValueError: If model download fails.
+            RuntimeError: If engine initialization fails.
+        """
+        self._engine = OrpheusCpp.__new__(OrpheusCpp)
+        model_file = download_model(
+            model_id=self.attributes.model_id,
+            model_variant=self.attributes.model_variant,
+            cache_dir=self.attributes.cache_dir,
+        )
+        if model_file:
+            self._setup_llm(model_file)
+            self._setup_snac_session()
+
+    def _setup_llm(self, model_file: str) -> None:
+        """Setup the Large Language Model component with specified parameters.
+
+        Initializes the Llama model with custom configuration parameters.
+        Implements graceful error handling for Out-of-Memory conditions
+        by setting the LLM as unavailable instead of crashing.
+
+        Args:
+            model_file (str): Path to the downloaded GGUF model file.
+
+        Raises:
+            ValueError: For non-OOM related model initialization errors.
+
+        Note:
+            If a "Failed to create llama_context" error occurs (typically OOM),
+            the method logs the error and disables TTS functionality instead
+            of terminating the program.
+        """
+        try:
+            self._engine._llm = Llama(
+                model_path=model_file,
+                n_ctx=self.attributes.n_ctx,
+                verbose=self.attributes.verbose,
+                n_gpu_layers=self.attributes.n_gpu_layers,
+                n_threads=self.attributes.n_threads,
+                batch_size=self.attributes.batch_size,
+            )
+            self._llm_available = True
+        except ValueError as e:
+            if "Failed to create llama_context" in str(e):
+                error_msg = (
+                    f"Failed to create llama_context - Out of Memory (OOM) issue. "
+                    f"Current n_ctx: {self.attributes.n_ctx}, n_gpu_layers: {self.attributes.n_gpu_layers}. "
+                    f"Try reducing n_ctx or "
+                    f"reduce n_gpu_layers if using GPU. "
+                )
+                self.logger.error(error_msg)
+                self._engine._llm = None
+                self._llm_available = False
+            else:
+                raise
+
+    def _setup_snac_session(self) -> None:
+        """
+        Initializes the SNAC (Streaming Neural Audio Codec) session required
+        for converting model tokens to audio waveforms. Only sets up the session
+        if the LLM was successfully initialized.
+
+        Note:
+            SNAC session is only created when LLM is available to avoid
+            unnecessary resource allocation when TTS is disabled.
+        """
+        if self._llm_available:
+            self._engine._snac_session = setup_snac_session(self.attributes.cache_dir)
+        else:
+            self._engine._snac_session = None
+
+    def _create_tts_options(self) -> TTSOptions:
+        """
+        Dynamically builds a TTSOptions dictionary by filtering template attributes
+        to include only those that are valid TTSOptions parameters.
+
+        Returns:
+            TTSOptions: Dictionary containing TTS generation parameters.
+        """
+        tts_option_fields = TypeAdapter(TTSOptions)
+        attributes_dict = self.attributes.model_dump()
+        return tts_option_fields.validate_python(attributes_dict)
+
+    def generate_speech(self, text: str) -> tuple[int, np.ndarray] | None:
+        """
+        Converts text to speech using the Orpheus TTS model with configured
+        voice and generation parameters.
+
+        Args:
+            text (str): Input text to convert to speech.
+
+        Returns:
+            tuple[int, np.ndarray] | None: Tuple of (sample_rate, audio_array)
+                if generation succeeds, None if LLM is unavailable.
+
+        Note:
+            Returns None when LLM is not available (e.g., due to OOM errors)
+            instead of raising an exception, allowing graceful degradation.
+        """
+        if not self._llm_available:
+            return None
+        return self._engine.tts(text, options=self._create_tts_options())
+
+    def create_audio_packet(self, text: str, source: str | None = None) -> AudioPacket | None:
+        """
+        Generates speech from text and wraps the result in a
+        `AudioPacket` for data pipeline compatibility.
+
+        Args:
+            text (str): Input text to convert to speech.
+            source (str | None): Optional source identifier for traceability.
+
+        Returns:
+            AudioPacket | None: Audio packet containing generated speech,
+                or None if speech generation fails or is unavailable.
+        """
+        speech_result = self.generate_speech(text)
+        if speech_result is None:
+            return None
+
+        sample_rate, audio_data = speech_result
+        return AudioPacket(
+            content=audio_data,
+            source=source,
+            sample_rate=sample_rate,
+        )
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        """
+        Processes all text packets in the input container and generates
+        corresponding audio packets using the Orpheus TTS model.
+
+        Args:
+            container (DataContainer): Input container with text packets to process.
+
+        Returns:
+            DataContainer: Updated container with generated audio packets added.
+
+        Note:
+            When LLM is unavailable (due to initialization failures), the method
+            logs a warning and returns the container without modifications rather
+            than raising an exception.
+        """
+        if not container.texts:
+            return container
+
+        if not self._llm_available:
+            return container
+
+        for text_packet in container.texts:
+            audio_packet = self.create_audio_packet(text=text_packet.content, source=text_packet.source)
+            if audio_packet is not None:
+                container.audios.append(audio_packet)
+
+        return container
+
+    def reset_state(self, template_name: str | None = None) -> None:
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        super().reset_state(template_name)
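The _create_tts_options helper above leans on pydantic's TypeAdapter to narrow the full attribute dump down to the keys TTSOptions declares. A minimal sketch of that filtering trick with a stand-in TypedDict (DemoOptions is hypothetical and only mimics the shape of orpheus_cpp's TTSOptions; the values are illustrative):

    # Sketch of the TypeAdapter filtering used by _create_tts_options.
    # DemoOptions is a hypothetical stand-in, not orpheus_cpp's real TTSOptions.
    from typing import TypedDict

    from pydantic import TypeAdapter


    class DemoOptions(TypedDict, total=False):
        voice_id: str
        max_tokens: int
        temperature: float


    attributes = {"voice_id": "tara", "max_tokens": 2048, "temperature": 0.8, "cache_dir": "~/sinapsis"}
    options = TypeAdapter(DemoOptions).validate_python(attributes)
    print(options)  # {'voice_id': 'tara', 'max_tokens': 2048, 'temperature': 0.8} - extra keys are dropped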
sinapsis_orpheus_cpp/src/sinapsis_orpheus_cpp/thirdparty/helpers.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""
+The methods declared in this file are inspired in the following source:
+
+https://github.com/freddyaboulton/orpheus-cpp
+
+which is Licensed under the MIT License.
+
+"""
+
+import onnxruntime
+from huggingface_hub import hf_hub_download
+from sinapsis_core.utils.logging_utils import sinapsis_logger
+
+
+def download_model(cache_dir: str, model_id: str, model_variant: str | None = None) -> str | None:
+    """
+    Download a model from Hugging Face Hub.
+
+    Args:
+        model_id: The model ID on Hugging Face Hub.
+        model_variant: The specific model variant file to download.
+        cache_dir: Directory to store downloaded models.
+
+    Returns:
+        Path to the downloaded model file or None if download fails.
+    """
+    if model_variant:
+        filename = model_variant
+    elif model_id.endswith(("-GGUF", "-gguf")):
+        filename = model_id.split("/")[-1].lower().replace("-gguf", ".gguf")
+    else:
+        filename = f"{model_id.split('/')[-1]}.gguf"
+
+    sinapsis_logger.info(f"Downloading model {model_id} with filename {filename}")
+
+    model_file = hf_hub_download(
+        repo_id=model_id,
+        filename=filename,
+        cache_dir=cache_dir,
+    )
+
+    sinapsis_logger.info(f"Successfully downloaded model to {model_file}")
+    return model_file
+
+
+def setup_snac_session(cache_dir: str) -> onnxruntime.InferenceSession:
+    """
+    Download and setup the SNAC ONNX session for audio processing.
+
+    Args:
+        cache_dir: Directory to store downloaded models.
+
+    Returns:
+        Configured ONNX inference session.
+    """
+    repo_id = "onnx-community/snac_24khz-ONNX"
+    snac_model_file = "decoder_model.onnx"
+    snac_model_path = hf_hub_download(
+        repo_id,
+        subfolder="onnx",
+        filename=snac_model_file,
+        cache_dir=cache_dir,
+    )
+
+    return onnxruntime.InferenceSession(
+        snac_model_path,
+        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+    )
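These two helpers are wired together by OrpheusTTS._initialize_engine above. A minimal sketch of the same flow, with a hypothetical GGUF repository id and cache path:

    # Sketch only: combine the helpers the way OrpheusTTS._initialize_engine does.
    # "your-org/orpheus-tts-GGUF" and "~/sinapsis" are hypothetical placeholders.
    from sinapsis_orpheus_cpp.thirdparty.helpers import download_model, setup_snac_session

    cache_dir = "~/sinapsis"
    model_file = download_model(cache_dir=cache_dir, model_id="your-org/orpheus-tts-GGUF")
    # For a "-GGUF" repo id, download_model derives the filename "orpheus-tts.gguf".
    snac_session = setup_snac_session(cache_dir)  # ONNX session for the 24 kHz SNAC decoder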
sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/helpers/tags.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+from enum import Enum
+
+
+class Tags(Enum):
+    AUDIO = "audio"
+    SPEECH = "speech"
+    SPEECH_RECOGNITION = "speech_recognition"
+    PARAKEET_TDT = "parakeet_tdt"
+    SPEECH_TO_TEXT = "speech_to_text"
+    TRANSCRIPTION = "transcription"
sinapsis_parakeet_tdt/src/sinapsis_parakeet_tdt/templates/__init__.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+import importlib
+from typing import Callable
+
+_root_lib_path = "sinapsis_parakeet_tdt.templates"
+
+_template_lookup = {
+    "ParakeetTDTInference": f"{_root_lib_path}.parakeet_tdt",
+}
+
+
+def __getattr__(name: str) -> Callable:
+    if name in _template_lookup:
+        module = importlib.import_module(_template_lookup[name])
+        return getattr(module, name)
+
+    raise AttributeError(f"template `{name}` not found in {_root_lib_path}")
+
+
+__all__ = list(_template_lookup.keys())