vocal-core 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vocal_core-0.3.0/.gitignore +74 -0
- vocal_core-0.3.0/MANIFEST.in +3 -0
- vocal_core-0.3.0/PKG-INFO +29 -0
- vocal_core-0.3.0/pyproject.toml +44 -0
- vocal_core-0.3.0/tests/test_registry.py +44 -0
- vocal_core-0.3.0/vocal_core/__init__.py +35 -0
- vocal_core-0.3.0/vocal_core/adapters/__init__.py +29 -0
- vocal_core-0.3.0/vocal_core/adapters/base.py +34 -0
- vocal_core-0.3.0/vocal_core/adapters/stt/__init__.py +15 -0
- vocal_core-0.3.0/vocal_core/adapters/stt/base.py +70 -0
- vocal_core-0.3.0/vocal_core/adapters/stt/faster_whisper.py +218 -0
- vocal_core-0.3.0/vocal_core/adapters/tts/__init__.py +4 -0
- vocal_core-0.3.0/vocal_core/adapters/tts/base.py +63 -0
- vocal_core-0.3.0/vocal_core/adapters/tts/piper.py +368 -0
- vocal_core-0.3.0/vocal_core/registry/__init__.py +19 -0
- vocal_core-0.3.0/vocal_core/registry/base.py +170 -0
- vocal_core-0.3.0/vocal_core/registry/model_info.py +88 -0
- vocal_core-0.3.0/vocal_core/registry/providers/__init__.py +7 -0
- vocal_core-0.3.0/vocal_core/registry/providers/base.py +69 -0
- vocal_core-0.3.0/vocal_core/registry/providers/huggingface.py +351 -0
- vocal_core-0.3.0/vocal_core/storage/__init__.py +3 -0
- vocal_core-0.3.0/vocal_core/utils/__init__.py +17 -0
- vocal_core-0.3.0/vocal_core/utils/device.py +160 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
MANIFEST
|
|
23
|
+
|
|
24
|
+
# Virtual Environment
|
|
25
|
+
.venv/
|
|
26
|
+
venv/
|
|
27
|
+
ENV/
|
|
28
|
+
env/
|
|
29
|
+
|
|
30
|
+
# UV
|
|
31
|
+
uv.lock
|
|
32
|
+
|
|
33
|
+
# IDE
|
|
34
|
+
.vscode/
|
|
35
|
+
.idea/
|
|
36
|
+
*.swp
|
|
37
|
+
*.swo
|
|
38
|
+
*~
|
|
39
|
+
.DS_Store
|
|
40
|
+
|
|
41
|
+
# Testing
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
.coverage
|
|
44
|
+
htmlcov/
|
|
45
|
+
.tox/
|
|
46
|
+
|
|
47
|
+
# Jupyter
|
|
48
|
+
.ipynb_checkpoints
|
|
49
|
+
|
|
50
|
+
# Model cache
|
|
51
|
+
.cache/
|
|
52
|
+
models/
|
|
53
|
+
*.ckpt
|
|
54
|
+
*.pth
|
|
55
|
+
*.pt
|
|
56
|
+
*.safetensors
|
|
57
|
+
|
|
58
|
+
# Audio files (test data)
|
|
59
|
+
*.mp3
|
|
60
|
+
*.wav
|
|
61
|
+
*.m4a
|
|
62
|
+
*.ogg
|
|
63
|
+
*.flac
|
|
64
|
+
|
|
65
|
+
# Logs
|
|
66
|
+
*.log
|
|
67
|
+
logs/
|
|
68
|
+
|
|
69
|
+
# Environment variables
|
|
70
|
+
.env
|
|
71
|
+
.env.local
|
|
72
|
+
|
|
73
|
+
# OS
|
|
74
|
+
Thumbs.db
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vocal-core
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Core model registry and adapters for Vocal - Ollama for Voice Models
|
|
5
|
+
Project-URL: Homepage, https://github.com/niradler/vocal
|
|
6
|
+
Project-URL: Documentation, https://github.com/niradler/vocal#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/niradler/vocal
|
|
8
|
+
Project-URL: Issues, https://github.com/niradler/vocal/issues
|
|
9
|
+
Author: Vocal Contributors
|
|
10
|
+
License: SSPL-1.0
|
|
11
|
+
Keywords: ai,model-registry,speech-to-text,stt,tts,whisper
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: Other/Proprietary License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: faster-whisper>=1.0.0
|
|
22
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
23
|
+
Requires-Dist: numpy>=1.24.0
|
|
24
|
+
Requires-Dist: pydantic-settings>=2.1.0
|
|
25
|
+
Requires-Dist: pydantic>=2.5.0
|
|
26
|
+
Requires-Dist: pyttsx3>=2.90
|
|
27
|
+
Provides-Extra: piper
|
|
28
|
+
Requires-Dist: piper-tts>=1.2.0; extra == 'piper'
|
|
29
|
+
Requires-Dist: scipy>=1.11.0; extra == 'piper'
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vocal-core"
|
|
3
|
+
version = "0.3.0"
|
|
4
|
+
description = "Core model registry and adapters for Vocal - Ollama for Voice Models"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
license = { text = "SSPL-1.0" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Vocal Contributors" }
|
|
9
|
+
]
|
|
10
|
+
keywords = ["speech-to-text", "tts", "stt", "whisper", "model-registry", "ai"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"License :: Other/Proprietary License",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"faster-whisper>=1.0.0",
|
|
23
|
+
"huggingface-hub>=0.20.0",
|
|
24
|
+
"pydantic>=2.5.0",
|
|
25
|
+
"pydantic-settings>=2.1.0",
|
|
26
|
+
"pyttsx3>=2.90",
|
|
27
|
+
"numpy>=1.24.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/niradler/vocal"
|
|
32
|
+
Documentation = "https://github.com/niradler/vocal#readme"
|
|
33
|
+
Repository = "https://github.com/niradler/vocal"
|
|
34
|
+
Issues = "https://github.com/niradler/vocal/issues"
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
piper = [
|
|
38
|
+
"piper-tts>=1.2.0",
|
|
39
|
+
"scipy>=1.11.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["hatchling"]
|
|
44
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import pytest

from vocal_core import ModelRegistry


@pytest.mark.asyncio
async def test_model_registry_init():
    """The registry constructs and materialises its storage directory."""
    reg = ModelRegistry()
    assert reg is not None
    assert reg.storage_path.exists()


@pytest.mark.asyncio
async def test_list_models():
    """The HuggingFace provider yields at least one STT model."""
    reg = ModelRegistry()
    models = await reg.list_models(task="stt")

    assert models
    assert all(entry.task.value == "stt" for entry in models)

    # At least one Whisper variant should be present in the catalog.
    assert any("whisper" in entry.id.lower() for entry in models)


@pytest.mark.asyncio
async def test_get_model_info():
    """Looking up a known model id returns its full metadata."""
    reg = ModelRegistry()
    info = await reg.get_model("openai/whisper-tiny")

    assert info is not None
    assert info.id == "openai/whisper-tiny"
    assert info.task.value == "stt"
    assert info.provider.value == "huggingface"


@pytest.mark.asyncio
async def test_model_registry_providers():
    """A freshly built registry is wired with the HuggingFace provider."""
    reg = ModelRegistry()
    assert "huggingface" in reg.providers
    assert reg.providers["huggingface"] is not None
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Public API for the ``vocal_core`` package.

Re-exports the registry types and the bundled adapters so callers can write
``from vocal_core import ModelRegistry`` instead of importing submodules.
"""

from .adapters import (
    BaseAdapter,
    FasterWhisperAdapter,
    STTAdapter,
    TranscriptionResult,
    TranscriptionSegment,
    TranscriptionWord,
)
from .registry import (
    ModelBackend,
    ModelInfo,
    ModelProvider,
    ModelRegistry,
    ModelStatus,
    ModelTask,
    format_bytes,
)

__version__ = "0.3.0"

__all__ = [
    "ModelRegistry",
    "ModelInfo",
    "ModelStatus",
    "ModelBackend",
    "ModelProvider",
    "ModelTask",
    "format_bytes",
    "BaseAdapter",
    "STTAdapter",
    "TranscriptionResult",
    "TranscriptionSegment",
    "TranscriptionWord",
    "FasterWhisperAdapter",
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Adapter layer: common base class plus the STT and TTS implementations."""

from .base import BaseAdapter
from .stt import (
    FasterWhisperAdapter,
    STTAdapter,
    TranscriptionResult,
    TranscriptionSegment,
    TranscriptionWord,
)
from .tts import (
    PiperTTSAdapter,
    SimpleTTSAdapter,
    TTSAdapter,
    TTSResult,
    Voice,
)

__all__ = [
    "BaseAdapter",
    "STTAdapter",
    "TranscriptionResult",
    "TranscriptionSegment",
    "TranscriptionWord",
    "FasterWhisperAdapter",
    "TTSAdapter",
    "TTSResult",
    "Voice",
    "SimpleTTSAdapter",
    "PiperTTSAdapter",
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any


class BaseAdapter(ABC):
    """Common contract shared by every model adapter (STT, TTS, ...).

    Concrete adapters implement loading and unloading of a model plus
    introspection of its state; task-specific subclasses layer inference
    methods on top of this lifecycle interface.
    """

    @abstractmethod
    async def load_model(self, model_path: Path, device: str = "auto", **kwargs) -> None:
        """Load a model from *model_path* onto *device*.

        Args:
            model_path: Path to model files
            device: Target device ('cpu', 'cuda', 'auto')
            **kwargs: Backend-specific options
        """
        ...

    @abstractmethod
    async def unload_model(self) -> None:
        """Release the model and any memory it holds."""
        ...

    @abstractmethod
    def is_loaded(self) -> bool:
        """Return True while a model is resident in memory."""
        ...

    @abstractmethod
    def get_model_info(self) -> dict[str, Any]:
        """Return metadata describing the currently loaded model."""
        ...
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Speech-to-text adapters: abstract interface, result models, and backends."""

from .base import (
    STTAdapter,
    TranscriptionResult,
    TranscriptionSegment,
    TranscriptionWord,
)
from .faster_whisper import FasterWhisperAdapter

__all__ = [
    "STTAdapter",
    "TranscriptionResult",
    "TranscriptionSegment",
    "TranscriptionWord",
    "FasterWhisperAdapter",
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from abc import abstractmethod
from pathlib import Path
from typing import BinaryIO

from pydantic import BaseModel

from ..base import BaseAdapter


class TranscriptionSegment(BaseModel):
    """A segment of transcribed text with timing"""

    id: int  # sequential index of the segment within the transcription
    start: float  # segment start time, in seconds
    end: float  # segment end time, in seconds
    text: str  # transcribed text for this segment
    tokens: list[int] | None = None  # raw decoder token ids, when the backend provides them
    temperature: float | None = None  # sampling temperature the backend used for this segment
    avg_logprob: float | None = None  # mean log-probability of the decoded tokens (confidence proxy)
    compression_ratio: float | None = None  # gzip compression ratio of the text (repetition heuristic)
    no_speech_prob: float | None = None  # backend's probability that the segment contains no speech


class TranscriptionWord(BaseModel):
    """Word-level timestamp"""

    word: str  # the word text
    start: float  # word start time, in seconds
    end: float  # word end time, in seconds
    probability: float | None = None  # backend confidence for this word, when available


class TranscriptionResult(BaseModel):
    """Transcription result"""

    text: str  # full transcription text
    language: str  # detected or requested language code
    duration: float  # audio duration in seconds (0.0 when the backend does not report it)
    segments: list[TranscriptionSegment] | None = None  # per-segment detail; None when empty
    words: list[TranscriptionWord] | None = None  # word timestamps; None unless requested and produced


class STTAdapter(BaseAdapter):
    """Base interface for Speech-to-Text adapters"""

    @abstractmethod
    async def transcribe(
        self,
        audio: str | Path | BinaryIO,
        language: str | None = None,
        task: str = "transcribe",
        temperature: float = 0.0,
        word_timestamps: bool = False,
        **kwargs,
    ) -> TranscriptionResult:
        """
        Transcribe audio to text

        Args:
            audio: Audio file path or file-like object
            language: Language code (ISO 639-1) or None for auto-detect
            task: Task type ('transcribe' or 'translate')
            temperature: Sampling temperature (0.0 = greedy)
            word_timestamps: Whether to include word-level timestamps
            **kwargs: Additional backend-specific parameters

        Returns:
            TranscriptionResult with text and metadata
        """
        pass
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import logging
import tempfile
from pathlib import Path
from typing import Any, BinaryIO

from faster_whisper import WhisperModel

from ...utils import optimize_inference_settings
from .base import (
    STTAdapter,
    TranscriptionResult,
    TranscriptionSegment,
    TranscriptionWord,
)

logger = logging.getLogger(__name__)


class FasterWhisperAdapter(STTAdapter):
    """
    faster-whisper implementation with GPU optimization

    Automatically detects and uses GPU when available for 4x+ faster inference.
    Falls back to optimized CPU inference when GPU is not available.
    """

    def __init__(self):
        # All state is populated by load_model(); model is None while unloaded.
        self.model: WhisperModel | None = None
        self.model_path: Path | None = None
        self.device: str = "auto"
        self.compute_type: str = "auto"

    @staticmethod
    def _infer_model_size(model_path: Path) -> str:
        """Best-effort guess of the Whisper model size from its path.

        Matches size tokens anywhere in the path string, since model caches
        often carry the size in a parent directory name. Falls back to "base"
        when no known token is found.
        """
        path_str = str(model_path)
        for size in ("tiny", "small", "medium", "large"):
            if size in path_str:
                return size
        return "base"

    async def load_model(
        self,
        model_path: Path,
        device: str = "auto",
        compute_type: str = "auto",
        **kwargs,
    ) -> None:
        """
        Load Whisper model with optimal settings

        Args:
            model_path: Path to model files
            device: Device to use ('cpu', 'cuda', 'auto')
            compute_type: Compute type ('int8', 'int8_float16', 'float16', 'float32', 'auto')
            **kwargs: Additional faster-whisper parameters
        """
        # Replace any previously loaded model so we never hold two at once.
        if self.model is not None:
            await self.unload_model()

        self.model_path = model_path

        model_size = self._infer_model_size(model_path)

        settings = optimize_inference_settings(device, model_size)

        self.device = settings["device"]
        # An explicitly requested compute_type wins over the auto-detected one.
        self.compute_type = settings["compute_type"] if compute_type == "auto" else compute_type

        load_kwargs = {
            "device": self.device,
            "compute_type": self.compute_type,
        }

        # CPU-only tuning knobs; num_workers/cpu_threads do not apply on GPU.
        if self.device == "cpu" and "num_workers" in settings:
            load_kwargs["num_workers"] = settings["num_workers"]
            load_kwargs["cpu_threads"] = settings.get("cpu_threads", 0)

        # Caller-supplied kwargs override the computed defaults.
        load_kwargs.update(kwargs)

        logger.info(
            "Loading model from %s on %s with compute_type=%s",
            model_path,
            self.device,
            self.compute_type,
        )

        self.model = WhisperModel(str(model_path), **load_kwargs)

        logger.info("Model loaded successfully on %s", self.device)

    async def unload_model(self) -> None:
        """Unload model from memory and free GPU/CPU resources"""
        if self.model is not None:
            # Dropping the reference lets the backend free its buffers.
            self.model = None
            self.model_path = None

        if self.device == "cuda":
            # Best effort: torch may not be installed (faster-whisper does not
            # require it), so any failure here is only logged.
            try:
                import torch

                torch.cuda.empty_cache()
                logger.info("GPU memory cleared")
            except Exception as e:
                logger.warning("Failed to clear GPU cache: %s", e)

    def is_loaded(self) -> bool:
        """Check if model is loaded"""
        return self.model is not None

    def get_model_info(self) -> dict[str, Any]:
        """Get model information including device and optimization details"""
        info = {
            "model_path": str(self.model_path) if self.model_path else None,
            "device": self.device,
            "compute_type": self.compute_type,
            "loaded": self.is_loaded(),
        }

        if self.device == "cuda":
            # GPU stats are purely informational; skip them if torch is absent
            # or CUDA queries fail.
            try:
                import torch

                if torch.cuda.is_available():
                    info["gpu_name"] = torch.cuda.get_device_name(0)
                    info["vram_allocated_gb"] = torch.cuda.memory_allocated(0) / (1024**3)
                    info["vram_total_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            except Exception:
                pass

        return info

    async def transcribe(
        self,
        audio: str | Path | BinaryIO,
        language: str | None = None,
        task: str = "transcribe",
        temperature: float = 0.0,
        word_timestamps: bool = False,
        beam_size: int = 5,
        vad_filter: bool = True,
        **kwargs,
    ) -> TranscriptionResult:
        """
        Transcribe audio with optimized settings

        Args:
            audio: Audio file path or file-like object
            language: Language code or None for auto-detect
            task: 'transcribe' or 'translate'
            temperature: Sampling temperature (0.0 for greedy)
            word_timestamps: Enable word-level timestamps
            beam_size: Beam size (5 is a good balance, 1 for faster greedy decoding)
            vad_filter: Enable Voice Activity Detection for better performance
            **kwargs: Additional faster-whisper parameters

        Returns:
            TranscriptionResult with text and metadata

        Raises:
            RuntimeError: If no model has been loaded yet.
        """
        if not self.is_loaded():
            raise RuntimeError("Model not loaded. Call load_model() first.")

        temp_file = None
        try:
            if isinstance(audio, (str, Path)):
                audio_path = str(audio)
            else:
                # faster-whisper wants a path, so spill file-like input to disk.
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".audio")
                temp_file.write(audio.read())
                temp_file.close()
                audio_path = temp_file.name

            transcribe_kwargs = {
                "language": language,
                "task": task,
                "temperature": temperature,
                "word_timestamps": word_timestamps,
                "beam_size": beam_size,
                "vad_filter": vad_filter,
            }
            # Caller-supplied kwargs override the defaults above.
            transcribe_kwargs.update(kwargs)

            # transcribe() returns a lazy generator; the actual decoding runs
            # while we iterate below, inside this try block.
            segments, info = self.model.transcribe(audio_path, **transcribe_kwargs)

            segments_list = []
            words_list = []
            full_text = []

            for idx, segment in enumerate(segments):
                full_text.append(segment.text)

                segments_list.append(
                    TranscriptionSegment(
                        id=idx,
                        start=segment.start,
                        end=segment.end,
                        text=segment.text,
                        avg_logprob=segment.avg_logprob,
                        no_speech_prob=segment.no_speech_prob,
                    )
                )

                # FIX: segment.words can be None even though the attribute
                # always exists, so the previous hasattr() check could still
                # raise TypeError when iterating. Guard on truthiness instead.
                if word_timestamps and getattr(segment, "words", None):
                    for word in segment.words:
                        words_list.append(
                            TranscriptionWord(
                                word=word.word,
                                start=word.start,
                                end=word.end,
                                probability=getattr(word, "probability", None),
                            )
                        )

            # NOTE(review): faster-whisper segment texts typically start with a
            # leading space, so joining with " " may double spaces; preserved
            # as-is to keep output identical — confirm before changing.
            return TranscriptionResult(
                text=" ".join(full_text).strip(),
                language=info.language,
                duration=getattr(info, "duration", 0.0),
                segments=segments_list if segments_list else None,
                words=words_list if words_list else None,
            )

        finally:
            # Always remove the spill file, even if transcription failed.
            if temp_file and Path(temp_file.name).exists():
                Path(temp_file.name).unlink()
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from abc import abstractmethod

from pydantic import BaseModel

from ..base import BaseAdapter


class Voice(BaseModel):
    """Voice configuration for TTS"""

    id: str  # backend-specific voice identifier
    name: str  # human-readable voice name
    language: str  # language code of the voice
    gender: str | None = None  # voice gender, when the backend reports one


class TTSResult(BaseModel):
    """Text-to-Speech result"""

    audio_data: bytes  # raw synthesized audio payload
    sample_rate: int  # sample rate of the audio, in Hz
    duration: float  # audio duration, in seconds
    format: str  # container/encoding of audio_data (e.g. 'wav')


class TTSAdapter(BaseAdapter):
    """Base interface for Text-to-Speech adapters"""

    @abstractmethod
    async def synthesize(
        self,
        text: str,
        voice: str | None = None,
        speed: float = 1.0,
        pitch: float = 1.0,
        output_format: str = "wav",
        **kwargs,
    ) -> TTSResult:
        """
        Synthesize text to speech

        Args:
            text: Text to convert to speech
            voice: Voice ID to use (None for default)
            speed: Speech speed multiplier (1.0 = normal)
            pitch: Voice pitch multiplier (1.0 = normal)
            output_format: Output audio format (wav, mp3, etc.)
            **kwargs: Additional backend-specific parameters

        Returns:
            TTSResult with audio data and metadata
        """
        pass

    @abstractmethod
    async def get_voices(self) -> list[Voice]:
        """
        Get list of available voices

        Returns:
            List of Voice objects
        """
        pass
|