vocal-core 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual Environment
25
+ .venv/
26
+ venv/
27
+ ENV/
28
+ env/
29
+
30
+ # UV
31
+ uv.lock
32
+
33
+ # IDE
34
+ .vscode/
35
+ .idea/
36
+ *.swp
37
+ *.swo
38
+ *~
39
+ .DS_Store
40
+
41
+ # Testing
42
+ .pytest_cache/
43
+ .coverage
44
+ htmlcov/
45
+ .tox/
46
+
47
+ # Jupyter
48
+ .ipynb_checkpoints
49
+
50
+ # Model cache
51
+ .cache/
52
+ models/
53
+ *.ckpt
54
+ *.pth
55
+ *.pt
56
+ *.safetensors
57
+
58
+ # Audio files (test data)
59
+ *.mp3
60
+ *.wav
61
+ *.m4a
62
+ *.ogg
63
+ *.flac
64
+
65
+ # Logs
66
+ *.log
67
+ logs/
68
+
69
+ # Environment variables
70
+ .env
71
+ .env.local
72
+
73
+ # OS
74
+ Thumbs.db
@@ -0,0 +1,3 @@
1
+ include ../../../README.md
2
+ include ../../../LICENSE
3
+ recursive-include vocal_core *.py
@@ -0,0 +1,29 @@
1
+ Metadata-Version: 2.4
2
+ Name: vocal-core
3
+ Version: 0.3.0
4
+ Summary: Core model registry and adapters for Vocal - Ollama for Voice Models
5
+ Project-URL: Homepage, https://github.com/niradler/vocal
6
+ Project-URL: Documentation, https://github.com/niradler/vocal#readme
7
+ Project-URL: Repository, https://github.com/niradler/vocal
8
+ Project-URL: Issues, https://github.com/niradler/vocal/issues
9
+ Author: Vocal Contributors
10
+ License: SSPL-1.0
11
+ Keywords: ai,model-registry,speech-to-text,stt,tts,whisper
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: Other/Proprietary License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.11
21
+ Requires-Dist: faster-whisper>=1.0.0
22
+ Requires-Dist: huggingface-hub>=0.20.0
23
+ Requires-Dist: numpy>=1.24.0
24
+ Requires-Dist: pydantic-settings>=2.1.0
25
+ Requires-Dist: pydantic>=2.5.0
26
+ Requires-Dist: pyttsx3>=2.90
27
+ Provides-Extra: piper
28
+ Requires-Dist: piper-tts>=1.2.0; extra == 'piper'
29
+ Requires-Dist: scipy>=1.11.0; extra == 'piper'
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "vocal-core"
3
+ version = "0.3.0"
4
+ description = "Core model registry and adapters for Vocal - Ollama for Voice Models"
5
+ requires-python = ">=3.11"
6
+ license = { text = "SSPL-1.0" }
7
+ authors = [
8
+ { name = "Vocal Contributors" }
9
+ ]
10
+ keywords = ["speech-to-text", "tts", "stt", "whisper", "model-registry", "ai"]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "License :: Other/Proprietary License",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Programming Language :: Python :: 3.13",
18
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
19
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
20
+ ]
21
+ dependencies = [
22
+ "faster-whisper>=1.0.0",
23
+ "huggingface-hub>=0.20.0",
24
+ "pydantic>=2.5.0",
25
+ "pydantic-settings>=2.1.0",
26
+ "pyttsx3>=2.90",
27
+ "numpy>=1.24.0",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/niradler/vocal"
32
+ Documentation = "https://github.com/niradler/vocal#readme"
33
+ Repository = "https://github.com/niradler/vocal"
34
+ Issues = "https://github.com/niradler/vocal/issues"
35
+
36
+ [project.optional-dependencies]
37
+ piper = [
38
+ "piper-tts>=1.2.0",
39
+ "scipy>=1.11.0",
40
+ ]
41
+
42
+ [build-system]
43
+ requires = ["hatchling"]
44
+ build-backend = "hatchling.build"
@@ -0,0 +1,44 @@
1
+ import pytest
2
+
3
+ from vocal_core import ModelRegistry
4
+
5
+
6
@pytest.mark.asyncio
async def test_model_registry_init():
    """A default-constructed ModelRegistry exists and its storage path is present on disk."""
    fresh_registry = ModelRegistry()

    # Check the object first so a bad constructor fails as an assertion,
    # not as an attribute error on the next line.
    assert fresh_registry is not None

    storage_dir = fresh_registry.storage_path
    assert storage_dir.exists()
12
+
13
+
14
@pytest.mark.asyncio
async def test_list_models():
    """Listing with task="stt" returns a non-empty, STT-only list that includes a Whisper model."""
    stt_models = await ModelRegistry().list_models(task="stt")

    assert len(stt_models) > 0

    # Every returned entry must carry the requested task.
    for entry in stt_models:
        assert entry.task.value == "stt"

    # At least one model id mentions "whisper" (case-insensitive).
    assert any("whisper" in entry.id.lower() for entry in stt_models)
25
+
26
+
27
@pytest.mark.asyncio
async def test_get_model_info():
    """Looking up "openai/whisper-tiny" returns a matching STT entry from the huggingface provider."""
    found = await ModelRegistry().get_model("openai/whisper-tiny")

    assert found is not None

    reported_id = found.id
    assert reported_id == "openai/whisper-tiny"

    reported_task = found.task.value
    assert reported_task == "stt"

    reported_provider = found.provider.value
    assert reported_provider == "huggingface"
37
+
38
+
39
@pytest.mark.asyncio
async def test_model_registry_providers():
    """The registry's provider mapping has a non-None "huggingface" entry."""
    provider_name = "huggingface"
    configured = ModelRegistry().providers

    assert provider_name in configured
    assert configured[provider_name] is not None
@@ -0,0 +1,35 @@
1
+ from .adapters import (
2
+ BaseAdapter,
3
+ FasterWhisperAdapter,
4
+ STTAdapter,
5
+ TranscriptionResult,
6
+ TranscriptionSegment,
7
+ TranscriptionWord,
8
+ )
9
+ from .registry import (
10
+ ModelBackend,
11
+ ModelInfo,
12
+ ModelProvider,
13
+ ModelRegistry,
14
+ ModelStatus,
15
+ ModelTask,
16
+ format_bytes,
17
+ )
18
+
19
+ __version__ = "0.3.0"
20
+
21
+ __all__ = [
22
+ "ModelRegistry",
23
+ "ModelInfo",
24
+ "ModelStatus",
25
+ "ModelBackend",
26
+ "ModelProvider",
27
+ "ModelTask",
28
+ "format_bytes",
29
+ "BaseAdapter",
30
+ "STTAdapter",
31
+ "TranscriptionResult",
32
+ "TranscriptionSegment",
33
+ "TranscriptionWord",
34
+ "FasterWhisperAdapter",
35
+ ]
@@ -0,0 +1,29 @@
1
+ from .base import BaseAdapter
2
+ from .stt import (
3
+ FasterWhisperAdapter,
4
+ STTAdapter,
5
+ TranscriptionResult,
6
+ TranscriptionSegment,
7
+ TranscriptionWord,
8
+ )
9
+ from .tts import (
10
+ PiperTTSAdapter,
11
+ SimpleTTSAdapter,
12
+ TTSAdapter,
13
+ TTSResult,
14
+ Voice,
15
+ )
16
+
17
+ __all__ = [
18
+ "BaseAdapter",
19
+ "STTAdapter",
20
+ "TranscriptionResult",
21
+ "TranscriptionSegment",
22
+ "TranscriptionWord",
23
+ "FasterWhisperAdapter",
24
+ "TTSAdapter",
25
+ "TTSResult",
26
+ "Voice",
27
+ "SimpleTTSAdapter",
28
+ "PiperTTSAdapter",
29
+ ]
@@ -0,0 +1,34 @@
1
+ from abc import ABC, abstractmethod
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+
6
class BaseAdapter(ABC):
    """Abstract contract shared by all model adapters (STT, TTS, etc.)"""

    @abstractmethod
    async def load_model(
        self,
        model_path: Path,
        device: str = "auto",
        **kwargs,
    ) -> None:
        """
        Bring a model into memory from a local path.

        Args:
            model_path: Path to model files
            device: Device to load model on ('cpu', 'cuda', 'auto')
            **kwargs: Additional backend-specific parameters
        """
        ...

    @abstractmethod
    async def unload_model(self) -> None:
        """Release the currently loaded model from memory."""
        ...

    @abstractmethod
    def is_loaded(self) -> bool:
        """Report whether a model is currently loaded."""
        ...

    @abstractmethod
    def get_model_info(self) -> dict[str, Any]:
        """Return a dictionary describing the loaded model."""
        ...
@@ -0,0 +1,15 @@
1
+ from .base import (
2
+ STTAdapter,
3
+ TranscriptionResult,
4
+ TranscriptionSegment,
5
+ TranscriptionWord,
6
+ )
7
+ from .faster_whisper import FasterWhisperAdapter
8
+
9
+ __all__ = [
10
+ "STTAdapter",
11
+ "TranscriptionResult",
12
+ "TranscriptionSegment",
13
+ "TranscriptionWord",
14
+ "FasterWhisperAdapter",
15
+ ]
@@ -0,0 +1,70 @@
1
+ from abc import abstractmethod
2
+ from pathlib import Path
3
+ from typing import BinaryIO
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from ..base import BaseAdapter
8
+
9
+
10
class TranscriptionSegment(BaseModel):
    """A segment of transcribed text with timing"""

    # Required fields: an integer identifier, the segment's start/end
    # timestamps, and the text recognized in that span.
    # NOTE(review): start/end are presumably seconds from the start of the
    # audio — confirm against the producing backend.
    id: int
    start: float
    end: float
    text: str
    # Optional decoding metadata; each field is None unless the backend
    # supplies a value.
    tokens: list[int] | None = None
    temperature: float | None = None
    avg_logprob: float | None = None
    compression_ratio: float | None = None
    no_speech_prob: float | None = None
22
+
23
+
24
class TranscriptionWord(BaseModel):
    """Word-level timestamp"""

    # The word text together with its start/end timestamps.
    # NOTE(review): start/end are presumably seconds — confirm against the
    # producing backend.
    word: str
    start: float
    end: float
    # Optional per-word probability; None when the backend does not report one.
    probability: float | None = None
31
+
32
+
33
class TranscriptionResult(BaseModel):
    """Transcription result"""

    # Full transcribed text, the language reported for it, and the audio
    # duration.
    # NOTE(review): duration is presumably in seconds and language an ISO
    # 639-1 code — confirm against the adapters that build this model.
    text: str
    language: str
    duration: float
    # Optional detail: per-segment and per-word entries; None when absent.
    segments: list[TranscriptionSegment] | None = None
    words: list[TranscriptionWord] | None = None
41
+
42
+
43
class STTAdapter(BaseAdapter):
    """Abstract contract that Speech-to-Text adapters implement on top of BaseAdapter"""

    @abstractmethod
    async def transcribe(
        self,
        audio: str | Path | BinaryIO,
        language: str | None = None,
        task: str = "transcribe",
        temperature: float = 0.0,
        word_timestamps: bool = False,
        **kwargs,
    ) -> TranscriptionResult:
        """
        Turn spoken audio into text.

        Args:
            audio: Audio file path or file-like object
            language: Language code (ISO 639-1) or None for auto-detect
            task: Task type ('transcribe' or 'translate')
            temperature: Sampling temperature (0.0 = greedy)
            word_timestamps: Whether to include word-level timestamps
            **kwargs: Additional backend-specific parameters

        Returns:
            TranscriptionResult with text and metadata
        """
        ...
@@ -0,0 +1,218 @@
1
+ import logging
2
+ import tempfile
3
+ from pathlib import Path
4
+ from typing import Any, BinaryIO
5
+
6
+ from faster_whisper import WhisperModel
7
+
8
+ from ...utils import optimize_inference_settings
9
+ from .base import (
10
+ STTAdapter,
11
+ TranscriptionResult,
12
+ TranscriptionSegment,
13
+ TranscriptionWord,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class FasterWhisperAdapter(STTAdapter):
    """
    faster-whisper implementation of the STT adapter interface.

    Device and compute type are resolved through ``optimize_inference_settings``
    when a model is loaded; an explicit ``compute_type`` argument overrides the
    resolved value.
    """

    # Size hints looked up (in this order) inside the model path. "base" is
    # assumed when none of them occurs in the path.
    _SIZE_HINTS = ("tiny", "small", "medium", "large")

    def __init__(self):
        """Create an adapter with no model loaded and automatic device/compute selection."""
        self.model: WhisperModel | None = None
        self.model_path: Path | None = None
        self.device: str = "auto"
        self.compute_type: str = "auto"

    async def load_model(
        self,
        model_path: Path,
        device: str = "auto",
        compute_type: str = "auto",
        **kwargs,
    ) -> None:
        """
        Load a Whisper model, unloading any previously loaded one first.

        Args:
            model_path: Path to model files
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'int8_float16', 'float16', 'float32', 'auto')
            **kwargs: Additional faster-whisper parameters; these override the
                values derived from the optimized settings

        Raises:
            Exception: Whatever ``WhisperModel`` raises while loading; the
                adapter is left in the unloaded state in that case.
        """
        if self.model is not None:
            await self.unload_model()

        self.model_path = model_path
        path_text = str(model_path)

        # Guess the model size from the path so the settings helper can tune for it.
        model_size = next((hint for hint in self._SIZE_HINTS if hint in path_text), "base")
        settings = optimize_inference_settings(device, model_size)

        self.device = settings["device"]
        self.compute_type = settings["compute_type"] if compute_type == "auto" else compute_type

        load_kwargs: dict[str, Any] = {
            "device": self.device,
            "compute_type": self.compute_type,
        }
        if self.device == "cpu" and "num_workers" in settings:
            load_kwargs["num_workers"] = settings["num_workers"]
            load_kwargs["cpu_threads"] = settings.get("cpu_threads", 0)

        # Caller-supplied options win over the derived defaults.
        load_kwargs.update(kwargs)

        logger.info(
            "Loading model from %s on %s with compute_type=%s",
            model_path,
            self.device,
            self.compute_type,
        )

        try:
            self.model = WhisperModel(path_text, **load_kwargs)
        except Exception:
            # Keep the reported state consistent: no model means no model path.
            self.model_path = None
            raise

        logger.info("Model loaded successfully on %s", self.device)

    async def unload_model(self) -> None:
        """
        Drop the reference to the loaded model, if any.

        When the adapter was running on CUDA, a best-effort
        ``torch.cuda.empty_cache()`` is attempted; any failure (including torch
        not being importable) is logged as a warning and otherwise ignored.
        """
        if self.model is None:
            return

        self.model = None
        self.model_path = None

        if self.device == "cuda":
            try:
                import torch

                torch.cuda.empty_cache()
                logger.info("GPU memory cleared")
            except Exception as e:
                logger.warning("Failed to clear GPU cache: %s", e)

    def is_loaded(self) -> bool:
        """Return True when a model is currently held by this adapter."""
        return self.model is not None

    def get_model_info(self) -> dict[str, Any]:
        """
        Describe the adapter state.

        Returns:
            A dict with ``model_path``, ``device``, ``compute_type`` and
            ``loaded``. On CUDA, ``gpu_name``, ``vram_allocated_gb`` and
            ``vram_total_gb`` are added when torch can report them.
        """
        info: dict[str, Any] = {
            "model_path": str(self.model_path) if self.model_path else None,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self.is_loaded(),
        }

        if self.device == "cuda":
            try:
                import torch

                if torch.cuda.is_available():
                    info["gpu_name"] = torch.cuda.get_device_name(0)
                    info["vram_allocated_gb"] = torch.cuda.memory_allocated(0) / (1024**3)
                    info["vram_total_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            except Exception as e:
                # GPU statistics are optional extras; record why they are missing
                # instead of discarding the error silently.
                logger.debug("Could not collect GPU statistics: %s", e)

        return info

    async def transcribe(
        self,
        audio: str | Path | BinaryIO,
        language: str | None = None,
        task: str = "transcribe",
        temperature: float = 0.0,
        word_timestamps: bool = False,
        beam_size: int = 5,
        vad_filter: bool = True,
        **kwargs,
    ) -> TranscriptionResult:
        """
        Transcribe audio with the loaded model.

        Args:
            audio: Audio file path or file-like object
            language: Language code or None for auto-detect
            task: 'transcribe' or 'translate'
            temperature: Sampling temperature (0.0 for greedy)
            word_timestamps: Enable word-level timestamps
            beam_size: Beam size (5 is a good balance, 1 for faster greedy decoding)
            vad_filter: Enable Voice Activity Detection for better performance
            **kwargs: Additional faster-whisper parameters; these override the
                named options above

        Returns:
            TranscriptionResult with text and metadata. ``text`` is the
            whitespace-trimmed segment texts joined by single spaces.

        Raises:
            RuntimeError: If no model has been loaded.
        """
        if not self.is_loaded():
            raise RuntimeError("Model not loaded. Call load_model() first.")

        temp_path: Path | None = None
        try:
            if isinstance(audio, (str, Path)):
                audio_path = str(audio)
            else:
                # Spill the stream to disk so the backend receives a path. The
                # context manager closes the handle even when audio.read() fails,
                # so the cleanup below can always remove the file.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".audio") as spill:
                    temp_path = Path(spill.name)
                    spill.write(audio.read())
                audio_path = str(temp_path)

            transcribe_kwargs: dict[str, Any] = {
                "language": language,
                "task": task,
                "temperature": temperature,
                "word_timestamps": word_timestamps,
                "beam_size": beam_size,
                "vad_filter": vad_filter,
            }
            transcribe_kwargs.update(kwargs)

            segments, info = self.model.transcribe(audio_path, **transcribe_kwargs)

            segments_list: list[TranscriptionSegment] = []
            words_list: list[TranscriptionWord] = []
            text_parts: list[str] = []

            # The segments are consumed inside the try block so the temporary
            # audio file still exists while the backend reads from it.
            for idx, segment in enumerate(segments):
                text_parts.append(segment.text)
                segments_list.append(
                    TranscriptionSegment(
                        id=idx,
                        start=segment.start,
                        end=segment.end,
                        text=segment.text,
                        avg_logprob=segment.avg_logprob,
                        no_speech_prob=segment.no_speech_prob,
                    )
                )

                if word_timestamps:
                    # `words` can be missing or None; treat both as "no words".
                    for word in getattr(segment, "words", None) or ():
                        words_list.append(
                            TranscriptionWord(
                                word=word.word,
                                start=word.start,
                                end=word.end,
                                probability=getattr(word, "probability", None),
                            )
                        )

            # Trim each piece before joining: segment texts that already carry
            # their own leading space would otherwise produce doubled spaces.
            trimmed_parts = (part.strip() for part in text_parts)

            return TranscriptionResult(
                text=" ".join(part for part in trimmed_parts if part),
                language=info.language,
                duration=getattr(info, "duration", 0.0),
                segments=segments_list or None,
                words=words_list or None,
            )

        finally:
            if temp_path is not None:
                temp_path.unlink(missing_ok=True)
@@ -0,0 +1,4 @@
1
+ from .base import TTSAdapter, TTSResult, Voice
2
+ from .piper import PiperTTSAdapter, SimpleTTSAdapter
3
+
4
+ __all__ = ["TTSAdapter", "TTSResult", "Voice", "SimpleTTSAdapter", "PiperTTSAdapter"]
@@ -0,0 +1,63 @@
1
+ from abc import abstractmethod
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from ..base import BaseAdapter
6
+
7
+
8
class Voice(BaseModel):
    """Voice configuration for TTS"""

    # Required fields: the voice's identifier, display name, and language.
    # NOTE(review): the expected format of `language` (e.g. ISO code vs. locale)
    # is not visible here — confirm against the adapters that build Voice.
    id: str
    name: str
    language: str
    # Optional gender label; None when not specified.
    gender: str | None = None
15
+
16
+
17
class TTSResult(BaseModel):
    """Text-to-Speech result"""

    # Raw audio bytes plus the metadata describing them.
    # NOTE(review): sample_rate is presumably in Hz, duration in seconds, and
    # format a container/codec name such as "wav" — confirm against the
    # adapters that build TTSResult.
    audio_data: bytes
    sample_rate: int
    duration: float
    format: str
24
+
25
+
26
class TTSAdapter(BaseAdapter):
    """Abstract contract that Text-to-Speech adapters implement on top of BaseAdapter"""

    @abstractmethod
    async def synthesize(
        self,
        text: str,
        voice: str | None = None,
        speed: float = 1.0,
        pitch: float = 1.0,
        output_format: str = "wav",
        **kwargs,
    ) -> TTSResult:
        """
        Turn text into spoken audio.

        Args:
            text: Text to convert to speech
            voice: Voice ID to use (None for default)
            speed: Speech speed multiplier (1.0 = normal)
            pitch: Voice pitch multiplier (1.0 = normal)
            output_format: Output audio format (wav, mp3, etc.)
            **kwargs: Additional backend-specific parameters

        Returns:
            TTSResult with audio data and metadata
        """
        ...

    @abstractmethod
    async def get_voices(self) -> list[Voice]:
        """
        Enumerate the voices this adapter offers.

        Returns:
            List of Voice objects
        """
        ...