videosdk-plugins-navana 0.0.48__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videosdk_plugins_navana-0.0.48/.gitignore +19 -0
- videosdk_plugins_navana-0.0.48/PKG-INFO +30 -0
- videosdk_plugins_navana-0.0.48/README.md +9 -0
- videosdk_plugins_navana-0.0.48/pyproject.toml +49 -0
- videosdk_plugins_navana-0.0.48/videosdk/plugins/navana/__init__.py +3 -0
- videosdk_plugins_navana-0.0.48/videosdk/plugins/navana/stt.py +158 -0
- videosdk_plugins_navana-0.0.48/videosdk/plugins/navana/version.py +1 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: videosdk-plugins-navana
|
|
3
|
+
Version: 0.0.48
|
|
4
|
+
Summary: VideoSDK Agent Framework plugin for Navana STT services
|
|
5
|
+
Author: videosdk
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Keywords: ai,audio,bodhi,indian-languages,navana,speech-to-text,stt,video,videosdk
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Topic :: Communications :: Conferencing
|
|
11
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
12
|
+
Classifier: Topic :: Multimedia :: Video
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Requires-Dist: aiohttp
|
|
16
|
+
Requires-Dist: bodhi-sdk
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: scipy>=1.11.0
|
|
19
|
+
Requires-Dist: videosdk-agents>=0.0.48
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# VideoSDK Navana Plugin
|
|
23
|
+
|
|
24
|
+
Agent Framework plugin for STT services from Navana Tech.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install videosdk-plugins-navana
|
|
30
|
+
```
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Build configuration for the videosdk-plugins-navana distribution.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "videosdk-plugins-navana"
# The version is not declared here; see [tool.hatch.version] below.
dynamic = ["version"]
description = "VideoSDK Agent Framework plugin for Navana STT services"
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.11"
authors = [{ name = "videosdk" }]
# Each keyword is listed once ("navana" previously appeared twice).
keywords = [
    "video",
    "audio",
    "ai",
    "videosdk",
    "bodhi",
    "navana",
    "stt",
    "speech-to-text",
    "indian-languages",
]
# Each classifier is listed once ("Intended Audience :: Developers" previously
# appeared twice).
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Communications :: Conferencing",
    "Topic :: Multimedia :: Sound/Audio",
    "Topic :: Multimedia :: Video",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    "videosdk-agents>=0.0.48",
    "aiohttp",
    "numpy",
    "bodhi-sdk",
    "scipy>=1.11.0",
]

# Hatch reads the package version from this file.
[tool.hatch.version]
path = "videosdk/plugins/navana/version.py"

[tool.hatch.build.targets.wheel]
packages = ["videosdk"]

[tool.hatch.build.targets.sdist]
include = ["/videosdk"]
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
import numpy as np
|
|
7
|
+
from videosdk.agents import STT as BaseSTT, STTResponse, SpeechData, SpeechEventType, global_event_emitter
|
|
8
|
+
from bodhi import BodhiClient, TranscriptionConfig, TranscriptionResponse, LiveTranscriptionEvents
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from scipy import signal
|
|
12
|
+
SCIPY_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
SCIPY_AVAILABLE = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NavanaSTT(BaseSTT):
    """
    VideoSDK Agent Framework STT plugin for Navana's Bodhi API.

    Audio handed to ``process_audio`` is interpreted as interleaved 16-bit
    stereo PCM at ``input_sample_rate``. It is averaged down to mono,
    resampled to ``target_sample_rate`` and streamed through the ``bodhi``
    SDK client. Bodhi transcript events are forwarded as INTERIM
    ``STTResponse`` objects; when an utterance-end event arrives, the most
    recent transcript text is re-emitted as FINAL.
    """

    # Bytes in one interleaved stereo frame: 2 channels x 2 bytes (int16).
    _BYTES_PER_STEREO_FRAME = 4

    def __init__(
        self,
        *,
        api_key: str | None = None,
        customer_id: str | None = None,
        model: str = "en-general-v2-8khz",
        language: str = "en",
        input_sample_rate: int = 48000,
    ) -> None:
        """Initialize the Navana STT plugin.

        Args:
            api_key: Navana API key. Falls back to the ``NAVANA_API_KEY``
                environment variable when omitted.
            customer_id: Navana customer ID. Falls back to the
                ``NAVANA_CUSTOMER_ID`` environment variable when omitted.
            model: Model name passed to the Bodhi transcription config.
                Defaults to "en-general-v2-8khz".
            language: Language tag attached to every emitted transcript.
                Defaults to "en".
            input_sample_rate: Sample rate (Hz) of the audio passed to
                ``process_audio``. Defaults to 48000.

        Raises:
            ImportError: If ``scipy`` could not be imported.
            ValueError: If the API key or customer ID is missing, or if
                ``input_sample_rate`` is not positive.
        """
        super().__init__()

        if not SCIPY_AVAILABLE:
            raise ImportError(
                "The 'scipy' library is not installed. Please install it with 'pip install scipy' to use the NavanaSTT plugin for audio resampling.")

        self.customer_id = customer_id or os.getenv("NAVANA_CUSTOMER_ID")
        self.api_key = api_key or os.getenv("NAVANA_API_KEY")

        if not self.api_key or not self.customer_id:
            raise ValueError(
                "Navana API key and Customer ID must be provided either through parameters or "
                "NAVANA_API_KEY/NAVANA_CUSTOMER_ID environment variables."
            )

        # The rate is used as a divisor when computing the resampled length,
        # so reject unusable values here instead of failing on every chunk.
        if input_sample_rate <= 0:
            raise ValueError("input_sample_rate must be a positive integer.")

        self.model = model
        self.language = language
        self.input_sample_rate = input_sample_rate
        # Fixed output rate sent to the service; note that the default model
        # name carries an "8khz" suffix.
        self.target_sample_rate = 8000

        self.client = BodhiClient(
            api_key=self.api_key, customer_id=self.customer_id)
        self._connection_started = False
        # Latest interim text, promoted to FINAL on utterance end.
        self._last_transcript_text = ""

        self._register_event_handlers()

    def _register_event_handlers(self):
        """Registers handlers for the Bodhi client's transcription events."""
        self.client.on(LiveTranscriptionEvents.Transcript, self._on_transcript)
        self.client.on(LiveTranscriptionEvents.UtteranceEnd,
                       self._on_utterance_end)
        self.client.on(LiveTranscriptionEvents.SpeechStarted,
                       self._on_speech_started)
        self.client.on(LiveTranscriptionEvents.Error, self._on_error)
        self.client.on(LiveTranscriptionEvents.Close, self._on_close)

    async def _on_transcript(self, response: TranscriptionResponse):
        """Handles interim results, updating the latest transcript buffer.

        Nothing happens when the response has no text or no transcript
        callback is set.
        """
        if response.text and self._transcript_callback:
            self._last_transcript_text = response.text
            event = STTResponse(
                event_type=SpeechEventType.INTERIM,
                # Only the text is read from the response; confidence is a
                # fixed value.
                data=SpeechData(text=response.text,
                                language=self.language, confidence=1.0)
            )
            await self._transcript_callback(event)

    async def _on_utterance_end(self, response: dict):
        """On utterance end, promotes the last known transcript to FINAL.

        The buffered text is cleared before the callback is awaited, so the
        same text is not promoted twice.
        """
        if self._last_transcript_text and self._transcript_callback:
            final_text = self._last_transcript_text
            self._last_transcript_text = ""
            event = STTResponse(
                event_type=SpeechEventType.FINAL,
                data=SpeechData(text=final_text,
                                language=self.language, confidence=1.0)
            )
            await self._transcript_callback(event)

    async def _on_speech_started(self, response: TranscriptionResponse):
        """Emits "speech_started" on the global event emitter."""
        global_event_emitter.emit("speech_started")

    async def _on_error(self, e: Exception):
        """Prints an SDK error and re-emits it as an "error" event."""
        error_message = f"Navana SDK Error: {str(e)}"
        print(error_message)
        self.emit("error", error_message)

    async def _on_close(self):
        """Marks the connection closed so the next audio chunk reconnects."""
        print("Navana SDK connection closed.")
        self._connection_started = False

    def _prepare_audio(self, audio_frames: bytes) -> bytes:
        """Convert interleaved int16 stereo PCM to mono PCM at the target rate.

        Args:
            audio_frames: Raw interleaved 16-bit stereo PCM bytes.

        Returns:
            int16 mono PCM bytes at ``target_sample_rate``, or ``b""`` when
            the chunk is too short to yield at least one output sample.
        """
        # Drop a trailing partial frame so a misaligned chunk is shortened
        # rather than raising inside frombuffer/reshape.
        usable_bytes = len(audio_frames) - (
            len(audio_frames) % self._BYTES_PER_STEREO_FRAME)
        if usable_bytes <= 0:
            return b""

        stereo_audio = np.frombuffer(
            audio_frames, dtype=np.int16, count=usable_bytes // 2
        ).reshape(-1, 2)
        mono_audio_float = stereo_audio.astype(np.float32).mean(axis=1)

        target_length = int(
            len(mono_audio_float) * self.target_sample_rate
            / self.input_sample_rate)
        if target_length <= 0:
            # Too few samples to produce any output at the target rate.
            return b""

        if target_length == len(mono_audio_float):
            # Same length in and out: nothing to resample.
            resampled_data = mono_audio_float
        else:
            resampled_data = signal.resample(mono_audio_float, target_length)

        # Resampling can overshoot the int16 range; clip first so the cast
        # saturates instead of wrapping around.
        int16_range = np.iinfo(np.int16)
        clipped = np.clip(resampled_data, int16_range.min, int16_range.max)
        return clipped.astype(np.int16).tobytes()

    async def _close_client_quietly(self) -> None:
        """Close the SDK connection if it appears open, reporting failures.

        Used from the error path of ``process_audio``. A failure while
        cleaning up is printed and emitted as an "error" event instead of
        being raised.
        """
        try:
            # ``_live_client`` is a private SDK attribute; read it defensively
            # in case it is absent.
            live_client = getattr(self.client, "_live_client", None)
            if live_client and not live_client.is_closed:
                await self.client.close_connection()
        except Exception as close_error:
            close_message = f"Navana SDK close error: {str(close_error)}"
            print(close_message)
            self.emit("error", close_message)

    async def process_audio(
        self,
        audio_frames: bytes,
        language: Optional[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Processes audio by converting stereo to mono, resampling, and sending to the STT service.

        The SDK connection is opened on the first call, and again after it
        has been closed. Any failure is printed, emitted as an "error" event,
        and followed by an attempt to close the connection; it is not raised.

        Args:
            audio_frames: Raw interleaved 16-bit stereo PCM bytes at
                ``input_sample_rate``.
            language: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        try:
            if not self._connection_started:
                config = TranscriptionConfig(
                    model=self.model,
                    sample_rate=self.target_sample_rate
                )
                await self.client.start_connection(config=config)
                self._connection_started = True

            audio_bytes = self._prepare_audio(audio_frames)
            if not audio_bytes:
                # Empty or too-short chunk: nothing to send.
                return

            await self.client.send_audio_stream(audio_bytes)

        except Exception as e:
            error_message = f"Audio processing error: {str(e)}"
            print(error_message)
            self.emit("error", error_message)
            self._connection_started = False
            await self._close_client_quietly()

    async def aclose(self) -> None:
        """Cleans up resources by closing the SDK connection.

        The base-class cleanup runs even if closing the SDK connection
        raises.
        """
        try:
            if self._connection_started:
                # Reset before awaiting so a repeated call does not close twice.
                self._connection_started = False
                await self.client.close_connection()
        finally:
            await super().aclose()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; pyproject.toml's [tool.hatch.version] points at this file.
__version__ = "0.0.48"
|