bhashini-client-sdk 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {bhashini_client_sdk-0.2.2/bhashini_client_sdk.egg-info → bhashini_client_sdk-0.2.3}/PKG-INFO +2 -1
  2. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/core.py +9 -0
  3. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/asr_service.py +100 -0
  4. bhashini_client_sdk-0.2.3/bhashini_client/services/asr_streaming_client.py +350 -0
  5. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/audio_input_utils.py +1 -1
  6. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/utils/request_handler.py +1 -0
  7. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3/bhashini_client_sdk.egg-info}/PKG-INFO +2 -1
  8. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client_sdk.egg-info/SOURCES.txt +3 -0
  9. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client_sdk.egg-info/requires.txt +1 -0
  10. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/setup.py +11 -10
  11. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_asr.py +63 -0
  12. bhashini_client_sdk-0.2.3/tests/test_asr_streaming_live.py +73 -0
  13. bhashini_client_sdk-0.2.3/tests/test_asr_streaming_mic_live.py +118 -0
  14. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/LICENSE +0 -0
  15. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/MANIFEST.in +0 -0
  16. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/PYPI_DESCRIPTION.md +0 -0
  17. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/README.md +0 -0
  18. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/__init__.py +0 -0
  19. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/config.py +0 -0
  20. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/models_info.py +0 -0
  21. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/__init__.py +0 -0
  22. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/audio_language_detection_service.py +0 -0
  23. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/denoiser_service.py +0 -0
  24. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/image_lang_detection_service.py +0 -0
  25. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/itn_service.py +0 -0
  26. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/language_detection_service.py +0 -0
  27. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/ner_service.py +0 -0
  28. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/nmt_service.py +0 -0
  29. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/ocr_service.py +0 -0
  30. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/service_utils.py +0 -0
  31. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/speaker_diarization_service.py +0 -0
  32. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/speaker_enrollment_service.py +0 -0
  33. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/speaker_verification_service.py +0 -0
  34. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/text_normalization_service.py +0 -0
  35. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/transliteration_service.py +0 -0
  36. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/tts_service.py +0 -0
  37. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/services/voice_cloning_service.py +0 -0
  38. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client/utils/__init__.py +0 -0
  39. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client_sdk.egg-info/dependency_links.txt +0 -0
  40. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/bhashini_client_sdk.egg-info/top_level.txt +0 -0
  41. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/pyproject.toml +0 -0
  42. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/setup.cfg +0 -0
  43. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_audio_input_utils.py +0 -0
  44. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_audio_language_detection.py +0 -0
  45. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_bhashini_client.py +0 -0
  46. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_denoiser.py +0 -0
  47. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_image_lang_detection.py +0 -0
  48. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_itn.py +0 -0
  49. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_language_detection.py +0 -0
  50. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_ner.py +0 -0
  51. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_nmt.py +0 -0
  52. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_ocr.py +0 -0
  53. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_speaker_diarization.py +0 -0
  54. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_speaker_enrollment.py +0 -0
  55. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_speaker_verification.py +0 -0
  56. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_text_language_detection.py +0 -0
  57. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_text_normalization.py +0 -0
  58. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_transliteration.py +0 -0
  59. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_tts.py +0 -0
  60. {bhashini_client_sdk-0.2.2 → bhashini_client_sdk-0.2.3}/tests/test_voice_cloning.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bhashini-client-sdk
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Python SDK for Bhashini inference APIs with unified support for ASR, NMT, TTS, OCR, NER, speaker services, normalization, and image-language workflows.
5
5
  Home-page: https://github.com/bhashini-dibd/Bhashini-client-python-library.git
6
6
  Author: Nidhi Jha
@@ -25,6 +25,7 @@ Requires-Python: >=3.8
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: requests>=2.25.0
28
+ Requires-Dist: websockets>=12.0
28
29
  Requires-Dist: openpyxl>=3.1.0
29
30
  Dynamic: author
30
31
  Dynamic: classifier
@@ -45,6 +45,15 @@ class BhashiniClient:
45
45
  def asr(self, audio_input, source_lang: str):
46
46
  return self.asr_service.transcribe(audio_input, source_lang)
47
47
 
48
def asr_streaming_config(self, source_lang: str, **kwargs):
    """Return the streaming start configuration for ``source_lang``.

    Extra keyword arguments are forwarded untouched to
    ``self.asr_service.get_streaming_config``.
    """
    start_config = self.asr_service.get_streaming_config(source_lang, **kwargs)
    return start_config
50
+
51
async def asr_stream(self, audio_chunks, source_lang: str, **kwargs):
    """Stream ``audio_chunks`` to the ASR service and await its result.

    Extra keyword arguments are forwarded untouched to
    ``self.asr_service.stream_transcribe``.
    """
    pending = self.asr_service.stream_transcribe(audio_chunks, source_lang, **kwargs)
    return await pending
53
+
54
def asr_stream_sync(self, audio_chunks, source_lang: str, **kwargs):
    """Blocking counterpart of ``asr_stream``.

    Delegates to ``self.asr_service.stream_transcribe_sync`` with the same
    positional arguments and any extra keyword arguments.
    """
    outcome = self.asr_service.stream_transcribe_sync(audio_chunks, source_lang, **kwargs)
    return outcome
56
+
48
57
  def nmt(self, text: str, source_lang: str, target_lang: str):
49
58
  return self.nmt_service.translate(text, source_lang, target_lang)
50
59
 
@@ -1,5 +1,8 @@
1
+ import asyncio
2
+
1
3
  from ..models_info import get_available_models, validate_model_request
2
4
  from .audio_input_utils import preprocess_audio_or_uri_input
5
+ from .asr_streaming_client import ASRStreamingClient
3
6
  from .service_utils import api_error, input_error, is_error_response
4
7
 
5
8
 
@@ -23,6 +26,95 @@ class ASRService:
23
26
  def get_supported_models(self, language=None):
24
27
  return get_available_models("asr", language=language)
25
28
 
29
def get_streaming_config(
    self,
    source_lang: str,
    *,
    service_id: str = None,
    post_processors=None,
    interim_results=True,
):
    """Build the streaming "start" configuration for ``source_lang``.

    A streaming client is obtained from ``self._get_streaming_client`` and its
    ``get_start_config`` result is returned unchanged.
    """
    client = self._get_streaming_client(source_lang=source_lang, service_id=service_id)
    config_options = {
        "source_lang": source_lang,
        "service_id": service_id,
        "post_processors": post_processors,
        "interim_results": interim_results,
    }
    return client.get_start_config(**config_options)
44
+
45
async def stream_transcribe(
    self,
    audio_chunks,
    source_lang: str,
    *,
    service_id: str = None,
    post_processors=None,
    interim_results=True,
    receive_timeout=5,
    chunk_delay_seconds=0,
    return_details=False,
    audio_sample_format="int16",
):
    """Stream ``audio_chunks`` through a streaming client and await the result.

    Every option is forwarded by keyword to the client's
    ``transcribe_audio_chunks`` coroutine; its return value is passed back
    unchanged.
    """
    client = self._get_streaming_client(source_lang=source_lang, service_id=service_id)
    stream_options = {
        "source_lang": source_lang,
        "service_id": service_id,
        "post_processors": post_processors,
        "interim_results": interim_results,
        "receive_timeout": receive_timeout,
        "chunk_delay_seconds": chunk_delay_seconds,
        "return_details": return_details,
        "audio_sample_format": audio_sample_format,
    }
    return await client.transcribe_audio_chunks(audio_chunks, **stream_options)
70
+
71
def stream_transcribe_sync(
    self,
    audio_chunks,
    source_lang: str,
    *,
    service_id: str = None,
    post_processors=None,
    interim_results=True,
    receive_timeout=5,
    chunk_delay_seconds=0,
    return_details=False,
    audio_sample_format="int16",
):
    """Run ``stream_transcribe`` to completion from synchronous code.

    Returns whatever ``stream_transcribe`` returns. When called while an
    event loop is already running in this thread, an input-error value is
    returned instead, because ``asyncio.run`` cannot be nested.
    """
    # Probe for a running loop first and leave the ``except`` block before
    # doing any real work. Running the stream inside the handler would attach
    # the probe's RuntimeError as ``__context__`` to every streaming failure.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False
    else:
        loop_is_running = True

    if loop_is_running:
        return input_error("stream_transcribe_sync cannot be used inside a running event loop. Use await stream_transcribe(...) instead.")

    return asyncio.run(
        self.stream_transcribe(
            audio_chunks,
            source_lang,
            service_id=service_id,
            post_processors=post_processors,
            interim_results=interim_results,
            receive_timeout=receive_timeout,
            chunk_delay_seconds=chunk_delay_seconds,
            return_details=return_details,
            audio_sample_format=audio_sample_format,
        )
    )
101
+
102
async def stream_microphone(
    self,
    source_lang: str,
    *,
    service_id: str = None,
    post_processors=None,
    interim_results=True,
):
    """Transcribe microphone input through a streaming client.

    The options are forwarded by keyword to the client's
    ``transcribe_microphone`` coroutine and its result is returned unchanged.
    """
    client = self._get_streaming_client(source_lang=source_lang, service_id=service_id)
    mic_options = {
        "source_lang": source_lang,
        "service_id": service_id,
        "post_processors": post_processors,
        "interim_results": interim_results,
    }
    return await client.transcribe_microphone(**mic_options)
117
+
26
118
  def transcribe(
27
119
  self,
28
120
  input_data,
@@ -217,6 +309,14 @@ class ASRService:
217
309
  return "ai4bharat/conformer-multilingual-indo_aryan-gpu--t4"
218
310
  return "bhashini/ai4bharat/conformer-multilingual-asr"
219
311
 
312
def _get_streaming_client(self, *, source_lang: str, service_id: str = None):
    """Create an ``ASRStreamingClient`` for one streaming request.

    The API key is read from ``self.handler.api_key`` when that attribute
    exists (``None`` otherwise). A falsy ``service_id`` is replaced by the
    built-in streaming service id.
    """
    resolved_service_id = service_id if service_id else "bhashini/ai4b/indic-conformer/grpc"
    return ASRStreamingClient(
        api_key=getattr(self.handler, "api_key", None),
        source_lang=source_lang,
        service_id=resolved_service_id,
    )
319
+
220
320
  def postprocess(self, response):
221
321
  try:
222
322
  return (
@@ -0,0 +1,350 @@
1
+ import asyncio
2
+ import audioop
3
+ from array import array
4
+ import inspect
5
+ import json
6
+ from urllib.parse import quote
7
+
8
+ from ..config import API_KEY
9
+ from .service_utils import api_error, input_error
10
+
11
+
12
+ DEFAULT_ASR_STREAM_URL = "wss://dhruva-api.bhashini.gov.in/ws/v1/asr/stream"
13
+ DEFAULT_STREAMING_SERVICE_ID = "bhashini/ai4b/indic-conformer/grpc"
14
+ DEFAULT_SAMPLE_RATE = 16000
15
+ DEFAULT_CHUNK_DURATION_MS = 200
16
+ VALID_AUDIO_SAMPLE_FORMATS = {"int16", "float32", "raw"}
17
+
18
+
19
def int16_pcm_to_float32_bytes(audio_chunk):
    """Convert 16-bit little-endian PCM bytes to packed float32 samples.

    Args:
        audio_chunk: A bytes-like object of signed 16-bit PCM samples.

    Returns:
        ``bytes`` holding one little-endian float32 per input sample, each
        equal to ``sample / 32768``.

    A trailing odd byte (half a sample) is ignored instead of raising
    ``ValueError``.
    """
    import struct  # local import: keeps this helper self-contained

    raw = bytes(audio_chunk)
    sample_count = len(raw) // 2
    # unpack_from reads only the whole samples and ignores any dangling byte.
    # The explicit "<" makes the result independent of host byte order.
    samples = struct.unpack_from(f"<{sample_count}h", raw)
    # int16 / 32768 always lies in [-1.0, 1.0), so no clamping is needed.
    # NOTE(review): little-endian output matches what the previous
    # native-order code produced on little-endian hosts - confirm against
    # the service's wire format.
    return struct.pack(f"<{sample_count}f", *(sample / 32768.0 for sample in samples))
24
+
25
+
26
def float_samples_to_bytes(audio_chunk):
    """Return ``audio_chunk`` as raw bytes.

    ``bytes`` and ``bytearray`` input is copied into ``bytes`` unchanged.
    Anything else is treated as an iterable of numbers and packed as a
    float32 (``"f"``) array.
    """
    already_bytes = isinstance(audio_chunk, (bytes, bytearray))
    if not already_bytes:
        packed = array("f", audio_chunk)
        return packed.tobytes()
    return bytes(audio_chunk)
30
+
31
+
32
class VADProcessor:
    """Energy-threshold voice-activity detector working on 16-bit PCM chunks.

    Each call to ``process_chunk`` is assumed to cover ``chunk_duration_ms``
    of audio. Speech starts once the normalized RMS stays above
    ``speech_start_rms`` for ``min_speech_ms``. It stops once the RMS stays
    below ``speech_end_rms`` for ``min_pause_ms``.
    """

    def __init__(
        self,
        *,
        speech_start_rms=0.020,
        speech_end_rms=0.010,
        min_speech_ms=200,
        min_pause_ms=800,
        chunk_duration_ms=DEFAULT_CHUNK_DURATION_MS,
    ):
        self.speech_start_rms = speech_start_rms
        self.speech_end_rms = speech_end_rms
        self.min_speech_ms = min_speech_ms
        self.min_pause_ms = min_pause_ms
        self.chunk_duration_ms = chunk_duration_ms
        self.is_speaking = False
        self.speech_run_ms = 0
        self.silence_run_ms = 0

    @staticmethod
    def _normalized_rms(audio_chunk):
        """Return the RMS of native-order int16 samples, scaled to [0, 1].

        This replaces ``audioop.rms(chunk, 2)``, which is unavailable from
        Python 3.13 on. The RMS is truncated to an integer before scaling,
        exactly as ``audioop`` did, so existing thresholds behave the same.
        A trailing odd byte is ignored.
        """
        import math  # local import: the module's import block is not touched here

        raw = bytes(audio_chunk)
        samples = array("h")
        samples.frombytes(raw[: len(raw) - len(raw) % 2])
        if not samples:
            return 0.0
        mean_square = sum(sample * sample for sample in samples) // len(samples)
        return math.isqrt(mean_square) / 32768.0

    def process_chunk(self, audio_chunk):
        """Advance the detector by one chunk and report the resulting state.

        Returns:
            ``"START"`` on the chunk that begins speech, ``"STOP"`` on the
            chunk that ends it, ``"CONTINUE"`` while speech is ongoing and
            ``"IDLE"`` otherwise (including for an empty chunk).
        """
        if not audio_chunk:
            return "IDLE"

        rms = self._normalized_rms(audio_chunk)

        if not self.is_speaking:
            if rms > self.speech_start_rms:
                self.speech_run_ms += self.chunk_duration_ms
                if self.speech_run_ms >= self.min_speech_ms:
                    self.is_speaking = True
                    self.speech_run_ms = 0
                    return "START"
            else:
                # One quiet chunk resets the run: speech must be continuous.
                self.speech_run_ms = 0
        else:
            if rms < self.speech_end_rms:
                self.silence_run_ms += self.chunk_duration_ms
                if self.silence_run_ms >= self.min_pause_ms:
                    self.is_speaking = False
                    self.silence_run_ms = 0
                    return "STOP"
            else:
                # Any loud chunk resets the pause counter.
                self.silence_run_ms = 0

        return "CONTINUE" if self.is_speaking else "IDLE"
74
+
75
+
76
class ASRStreamingClient:
    """WebSocket client for the streaming ASR endpoint.

    The client sends a JSON "start" message, streams binary audio chunks,
    sends a JSON "end" message and collects "transcript" messages. Failures
    are reported through the SDK's ``input_error`` / ``api_error`` return
    values rather than by raising.
    """

    def __init__(
        self,
        *,
        api_key=None,
        websocket_url=DEFAULT_ASR_STREAM_URL,
        service_id=DEFAULT_STREAMING_SERVICE_ID,
        source_lang="hi",
        sample_rate=DEFAULT_SAMPLE_RATE,
        chunk_duration_ms=DEFAULT_CHUNK_DURATION_MS,
    ):
        # Fall back to "" so a missing key is reported by get_websocket_url()
        # instead of failing here with AttributeError on ``None.strip()``.
        self.api_key = (api_key or API_KEY or "").strip()
        self.websocket_url = websocket_url
        self.service_id = service_id
        self.source_lang = source_lang
        self.sample_rate = sample_rate
        self.chunk_duration_ms = chunk_duration_ms

    @staticmethod
    def _is_input_error(value):
        """Return True when ``value`` is an ``"Input Error:"`` message string."""
        return isinstance(value, str) and value.startswith("Input Error:")

    def get_websocket_url(self):
        """Return the endpoint URL with the API key as a query parameter.

        Returns an input-error value when no API key is configured.
        """
        if not self.api_key:
            return input_error("BHASHINI_API_KEY is missing. Please set it before using ASR streaming.")
        return f"{self.websocket_url}?api_key={quote(self.api_key)}"

    def get_start_config(
        self,
        *,
        source_lang=None,
        service_id=None,
        audio_format="pcm",
        encoding="raw",
        sampling_rate=None,
        post_processors=None,
        interim_results=True,
        end_of_stream_policy="client_signal",
        profanity_filter=True,
    ):
        """Build the JSON-serializable "start" message for a streaming session.

        Args:
            source_lang: Language code; defaults to the client's ``source_lang``.
            service_id: Service id; defaults to the client's ``service_id``.
            sampling_rate: Must resolve to 16000; defaults to ``sample_rate``.
            post_processors: Post-processor names. ``None`` selects the
                default ``["itn", "punctuation"]``. An explicit empty list
                is sent as-is, which disables post-processing.

        Returns:
            The start-message dict, or an input-error value when the language
            is missing or the sampling rate is unsupported.
        """
        resolved_source_lang = (source_lang or self.source_lang or "").strip().lower()
        if not resolved_source_lang:
            return input_error("Source language is missing. Please provide a valid ASR source language code.")

        resolved_sampling_rate = sampling_rate or self.sample_rate
        if resolved_sampling_rate != DEFAULT_SAMPLE_RATE:
            return input_error("ASR streaming supports only 16000 Hz PCM audio chunks.")

        # Only ``None`` means "use the defaults". ``or`` would also replace
        # an explicit empty list, making it impossible to turn them off.
        resolved_post_processors = ["itn", "punctuation"] if post_processors is None else post_processors

        return {
            "type": "start",
            "controlConfig": {"dataTracking": False},
            "config": {
                "serviceId": service_id or self.service_id,
                "language": {"sourceLanguage": resolved_source_lang},
                "audioFormat": audio_format,
                "encoding": encoding,
                "samplingRate": resolved_sampling_rate,
                "transcriptionFormat": {"value": "transcript"},
                "profanityFilter": profanity_filter,
                "postProcessors": resolved_post_processors,
            },
            "streamingConfig": {
                "chunkDurationMs": self.chunk_duration_ms,
                "interimResults": interim_results,
                "endOfStreamPolicy": end_of_stream_policy,
            },
        }

    async def transcribe_audio_chunks(
        self,
        audio_chunks,
        *,
        source_lang=None,
        service_id=None,
        post_processors=None,
        interim_results=True,
        receive_timeout=5,
        chunk_delay_seconds=0,
        return_details=False,
        wait_for_ready=True,
        audio_sample_format="int16",
    ):
        """Stream ``audio_chunks`` over a WebSocket and return the transcript.

        Args:
            audio_chunks: A sync or async iterable of audio chunks.
            receive_timeout: Seconds to wait for each "ready" handshake
                message and for the transcripts after the last chunk.
            chunk_delay_seconds: Optional pause after each sent chunk.
            return_details: When true, return a dict with ``transcript``,
                ``messages`` and either ``final`` or ``error`` instead of a
                plain string.
            wait_for_ready: Wait for a "ready" message before sending audio.
            audio_sample_format: ``"int16"`` (converted to float32 bytes),
                ``"float32"`` or ``"raw"`` (bytes sent unchanged).

        Returns:
            The last final transcript (or the last interim one when no final
            transcript arrived), or an error value / error dict.
        """
        try:
            import websockets
        except ImportError:
            return input_error("The 'websockets' package is required for ASR streaming. Install it before using this feature.")

        if audio_sample_format not in VALID_AUDIO_SAMPLE_FORMATS:
            return input_error("audio_sample_format must be one of: int16, float32, raw.")

        websocket_url = self.get_websocket_url()
        if self._is_input_error(websocket_url):
            return websocket_url

        start_config = self.get_start_config(
            source_lang=source_lang,
            service_id=service_id,
            post_processors=post_processors,
            interim_results=interim_results,
        )
        if self._is_input_error(start_config):
            return start_config

        transcripts = []
        messages = []
        default_backend_error = "ASR streaming backend returned an error."

        def failure(error_text, exc=None):
            """Shape an error as the caller asked: details dict or api_error."""
            if not return_details:
                return api_error(error_text)
            details = {"transcript": None, "messages": messages, "error": error_text}
            if exc is not None:
                details["exception"] = f"{type(exc).__name__}: {exc}"
            return details

        def latest_backend_error():
            """Return the text of the most recent "error" message, if any."""
            for message in reversed(messages):
                if message.get("type") == "error":
                    return message.get("message") or default_backend_error
            return None

        async def wait_until_ready(websocket):
            """Read until "ready" (returns None) or "error" (returns it)."""
            while True:
                message = await asyncio.wait_for(websocket.recv(), timeout=receive_timeout)
                response = json.loads(message)
                messages.append(response)
                if response.get("type") == "ready":
                    return None
                if response.get("type") == "error":
                    return response

        async def receive_transcripts(websocket):
            """Collect transcript messages until a final one or the socket ends."""
            try:
                while True:
                    message = await websocket.recv()
                    response = json.loads(message)
                    messages.append(response)
                    if response.get("type") != "transcript":
                        continue
                    text = (response.get("output") or [{}])[0].get("source")
                    if text:
                        transcripts.append(
                            {
                                "source": text,
                                "isFinal": response.get("isFinal", False),
                                "raw": response,
                            }
                        )
                    if response.get("isFinal"):
                        break
            except Exception:
                # Deliberate best-effort: the service closing the socket after
                # a stream is expected. Whatever was collected is kept.
                return

        def prepare_chunk(chunk):
            """Convert one chunk to wire bytes, or None when it must be skipped."""
            if not isinstance(chunk, (bytes, bytearray)) or not chunk:
                # Non-bytes input is only accepted as float samples.
                if audio_sample_format == "float32" and chunk:
                    return float_samples_to_bytes(chunk)
                return None
            if audio_sample_format == "int16":
                return int16_pcm_to_float32_bytes(chunk)
            if audio_sample_format == "float32":
                return float_samples_to_bytes(chunk)
            return bytes(chunk)

        async def send_chunk(websocket, chunk):
            prepared_chunk = prepare_chunk(chunk)
            if not prepared_chunk:
                return
            await websocket.send(prepared_chunk)
            if chunk_delay_seconds:
                await asyncio.sleep(chunk_delay_seconds)

        receiver_task = None
        try:
            async with websockets.connect(websocket_url) as websocket:
                await websocket.send(json.dumps(start_config))
                if wait_for_ready:
                    early_error = await wait_until_ready(websocket)
                    if early_error:
                        return failure(early_error.get("message") or default_backend_error)

                receiver_task = asyncio.create_task(receive_transcripts(websocket))

                # Accept any async iterable, not only async-generator objects.
                if hasattr(audio_chunks, "__aiter__"):
                    async for chunk in audio_chunks:
                        await send_chunk(websocket, chunk)
                else:
                    for chunk in audio_chunks:
                        await send_chunk(websocket, chunk)

                try:
                    await websocket.send(json.dumps({"type": "end"}))
                except Exception:
                    # Best-effort: the server may already have closed the
                    # socket. Transcripts received so far are still used.
                    pass

                try:
                    await asyncio.wait_for(receiver_task, timeout=receive_timeout)
                except asyncio.TimeoutError:
                    receiver_task.cancel()
        except Exception as exc:
            backend_message = latest_backend_error()
            if backend_message is not None:
                return failure(backend_message)
            return failure("ASR streaming service did not return a valid response.", exc)
        finally:
            # Never leave the receiver running after a failed send or connect.
            if receiver_task is not None and not receiver_task.done():
                receiver_task.cancel()

        backend_message = latest_backend_error()
        if backend_message is not None:
            return failure(backend_message)
        if not transcripts:
            return failure("ASR streaming service returned an empty transcription.")

        final_transcripts = [item for item in transcripts if item["isFinal"]]
        transcript = (final_transcripts or transcripts)[-1]["source"]
        if return_details:
            return {
                "transcript": transcript,
                "messages": messages,
                "final": bool(final_transcripts),
            }
        return transcript

    async def transcribe_microphone(
        self,
        *,
        source_lang=None,
        service_id=None,
        post_processors=None,
        interim_results=True,
    ):
        """Capture one utterance from the default microphone and transcribe it.

        Audio is read with PyAudio in ``chunk_duration_ms`` chunks. Only
        chunks a ``VADProcessor`` marks as speech are streamed, and capture
        ends at the first detected end of speech.

        Returns:
            The result of ``transcribe_audio_chunks``, or an input-error value
            when ``pyaudio`` is not installed.
        """
        try:
            import pyaudio
        except ImportError:
            return input_error("The 'pyaudio' package is required for microphone ASR streaming.")

        vad = VADProcessor(chunk_duration_ms=self.chunk_duration_ms)
        samples_per_chunk = int(self.sample_rate * self.chunk_duration_ms / 1000)
        audio = pyaudio.PyAudio()
        stream = None
        try:
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=samples_per_chunk,
            )
            loop = asyncio.get_running_loop()

            def read_chunk():
                return stream.read(samples_per_chunk, exception_on_overflow=False)

            async def chunk_generator():
                while True:
                    # The device read blocks for about one chunk. Running it
                    # in the default executor keeps the event loop free to
                    # receive transcripts meanwhile.
                    chunk = await loop.run_in_executor(None, read_chunk)
                    state = vad.process_chunk(chunk)
                    if state in {"START", "CONTINUE"}:
                        yield chunk
                    if state == "STOP":
                        break

            return await self.transcribe_audio_chunks(
                chunk_generator(),
                source_lang=source_lang,
                service_id=service_id,
                post_processors=post_processors,
                interim_results=interim_results,
                chunk_delay_seconds=0,
            )
        finally:
            # Release the device here, not inside the generator: the generator
            # never starts when streaming fails early (for example a missing
            # API key), which would otherwise leak the open stream.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            audio.terminate()
@@ -189,7 +189,7 @@ def _inspect_and_prepare_wav(
189
189
  frame_count = audio_stream.getnframes()
190
190
  channels = audio_stream.getnchannels()
191
191
  sample_width = audio_stream.getsampwidth()
192
- frame_rate = audio_stream.getframerate()
192
+ frame_rate = audio_stream.getframerate()
193
193
  if frame_count == 0:
194
194
  return input_error(
195
195
  "The audio file does not contain any frames. Please provide an audio file with speech."
@@ -9,6 +9,7 @@ from ..config import API_KEY, BASE_URL, DEFAULT_HEADERS, DEFAULT_TIMEOUT
9
9
  class RequestHandler:
10
10
  def __init__(self, api_key: Optional[str] = None):
11
11
  auth_token = (api_key or API_KEY).strip()
12
+ self.api_key = auth_token
12
13
  self.headers = {**DEFAULT_HEADERS, "Authorization": auth_token}
13
14
 
14
15
  def post(self, payload: dict, url: Optional[str] = None, headers: Optional[dict] = None):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bhashini-client-sdk
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Python SDK for Bhashini inference APIs with unified support for ASR, NMT, TTS, OCR, NER, speaker services, normalization, and image-language workflows.
5
5
  Home-page: https://github.com/bhashini-dibd/Bhashini-client-python-library.git
6
6
  Author: Nidhi Jha
@@ -25,6 +25,7 @@ Requires-Python: >=3.8
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: requests>=2.25.0
28
+ Requires-Dist: websockets>=12.0
28
29
  Requires-Dist: openpyxl>=3.1.0
29
30
  Dynamic: author
30
31
  Dynamic: classifier
@@ -11,6 +11,7 @@ bhashini_client/core.py
11
11
  bhashini_client/models_info.py
12
12
  bhashini_client/services/__init__.py
13
13
  bhashini_client/services/asr_service.py
14
+ bhashini_client/services/asr_streaming_client.py
14
15
  bhashini_client/services/audio_input_utils.py
15
16
  bhashini_client/services/audio_language_detection_service.py
16
17
  bhashini_client/services/denoiser_service.py
@@ -36,6 +37,8 @@ bhashini_client_sdk.egg-info/dependency_links.txt
36
37
  bhashini_client_sdk.egg-info/requires.txt
37
38
  bhashini_client_sdk.egg-info/top_level.txt
38
39
  tests/test_asr.py
40
+ tests/test_asr_streaming_live.py
41
+ tests/test_asr_streaming_mic_live.py
39
42
  tests/test_audio_input_utils.py
40
43
  tests/test_audio_language_detection.py
41
44
  tests/test_bhashini_client.py
@@ -1,2 +1,3 @@
1
1
  requests>=2.25.0
2
+ websockets>=12.0
2
3
  openpyxl>=3.1.0
@@ -1,16 +1,16 @@
1
- from pathlib import Path
2
-
3
- from setuptools import find_packages, setup
4
-
5
-
6
- LONG_DESCRIPTION_PATH = Path(__file__).parent / "PYPI_DESCRIPTION.md"
1
+ from pathlib import Path
2
+
3
+ from setuptools import find_packages, setup
4
+
5
+
6
+ LONG_DESCRIPTION_PATH = Path(__file__).parent / "PYPI_DESCRIPTION.md"
7
7
 
8
8
 
9
9
  setup(
10
- name="bhashini-client-sdk",
11
- version="0.2.2",
12
- description="Python SDK for Bhashini inference APIs with unified support for ASR, NMT, TTS, OCR, NER, speaker services, normalization, and image-language workflows.",
13
- long_description=LONG_DESCRIPTION_PATH.read_text(encoding="utf-8"),
10
+ name="bhashini-client-sdk",
11
+ version="0.2.3",
12
+ description="Python SDK for Bhashini inference APIs with unified support for ASR, NMT, TTS, OCR, NER, speaker services, normalization, and image-language workflows.",
13
+ long_description=LONG_DESCRIPTION_PATH.read_text(encoding="utf-8"),
14
14
  long_description_content_type="text/markdown",
15
15
  author="Nidhi Jha",
16
16
  url="https://github.com/bhashini-dibd/Bhashini-client-python-library.git",
@@ -19,6 +19,7 @@ setup(
19
19
  include_package_data=True,
20
20
  install_requires=[
21
21
  "requests>=2.25.0",
22
+ "websockets>=12.0",
22
23
  "openpyxl>=3.1.0",
23
24
  ],
24
25
  python_requires=">=3.8",
@@ -2,6 +2,7 @@ import base64
2
2
  import wave
3
3
 
4
4
  from bhashini_client.services.asr_service import ASRService
5
+ from bhashini_client.services.asr_streaming_client import ASRStreamingClient, int16_pcm_to_float32_bytes
5
6
 
6
7
  from tests.assertions import INPUT_ERROR, assert_service_output
7
8
  from tests.result_logger import log_test_result
@@ -191,3 +192,65 @@ def test_asr_uri_payload_supports_flac():
191
192
  assert config["audioFormat"] == "flac"
192
193
  assert config["encoding"] == "FLAC"
193
194
  assert audio_payload["audioUri"] == "https://example.com/sample.flac"
195
+
196
+
197
def test_asr_streaming_config_uses_existing_api_key_and_defaults():
    """The default client yields the expected URL and start message."""
    client = ASRStreamingClient(api_key="demo-key", source_lang="hi")

    url = client.get_websocket_url()
    message = client.get_start_config()
    config = message["config"]
    streaming = message["streamingConfig"]

    assert url == "wss://dhruva-api.bhashini.gov.in/ws/v1/asr/stream?api_key=demo-key"
    assert message["type"] == "start"
    assert message["controlConfig"]["dataTracking"] is False

    expected_config_values = [
        ("serviceId", "bhashini/ai4b/indic-conformer/grpc"),
        ("language", None),  # checked separately below
        ("audioFormat", "pcm"),
        ("encoding", "raw"),
        ("samplingRate", 16000),
        ("transcriptionFormat", {"value": "transcript"}),
    ]
    for key, expected in expected_config_values:
        if key == "language":
            assert config["language"]["sourceLanguage"] == "hi"
        else:
            assert config[key] == expected

    assert config["profanityFilter"] is True
    assert config["postProcessors"] == ["itn", "punctuation"]
    assert streaming["chunkDurationMs"] == 200
    assert streaming["interimResults"] is True
216
+
217
+
218
def test_asr_streaming_path_is_explicit_and_does_not_call_http_handler():
    """The sync streaming entry point uses the streaming client, never HTTP."""
    recorded = {}

    class Handler:
        api_key = "demo-key"

        def post(self, payload):
            raise AssertionError("HTTP request-response path should not run for explicit streaming calls.")

    class FakeStreamingClient:
        async def transcribe_audio_chunks(self, audio_chunks, **kwargs):
            recorded["chunks"] = list(audio_chunks)
            recorded["kwargs"] = kwargs
            return "streaming transcription"

    service = ASRService(Handler())
    service._get_streaming_client = lambda **kwargs: FakeStreamingClient()

    pcm_chunk = b"\x01\x00" * 160
    actual_output = service.stream_transcribe_sync(
        [pcm_chunk],
        "hi",
        post_processors=["itn"],
        receive_timeout=1,
        audio_sample_format="int16",
    )

    assert actual_output == "streaming transcription"
    assert recorded["chunks"] == [b"\x01\x00" * 160]
    forwarded = recorded["kwargs"]
    assert forwarded["source_lang"] == "hi"
    assert forwarded["post_processors"] == ["itn"]
    assert forwarded["receive_timeout"] == 1
    assert forwarded["audio_sample_format"] == "int16"
250
+
251
+
252
def test_asr_streaming_converts_int16_chunks_to_float32_wire_bytes():
    """Three int16 samples become three float32 values scaled by 1/32768."""
    import struct

    # Little-endian int16 samples: 0, 16384 (0x4000), -16384 (0xC000).
    converted = int16_pcm_to_float32_bytes(b"\x00\x00\x00@\x00\xc0")

    assert len(converted) == 12
    # Check the values too: a length-only check would also pass for garbage.
    assert struct.unpack("<3f", converted) == (0.0, 0.5, -0.5)
@@ -0,0 +1,73 @@
1
+ import argparse
2
+ import audioop
3
+ import json
4
+ import wave
5
+
6
+ from bhashini_client import BhashiniClient
7
+ from bhashini_client.services.asr_streaming_client import VADProcessor
8
+
9
+
10
def wav_chunks(path, *, sample_rate=16000, chunk_ms=200, vad=False):
    """Yield fixed-duration 16-bit mono PCM chunks read from a WAV file.

    Stereo input is mixed down to mono, other sample widths are converted to
    16-bit and other frame rates are resampled to ``sample_rate``. With
    ``vad=True`` only chunks that a ``VADProcessor`` reports as ``START`` or
    ``CONTINUE`` are yielded.

    Raises:
        ValueError: if the file has more than two channels (or none).
    """
    with wave.open(path, "rb") as wav_file:
        channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frame_rate = wav_file.getframerate()
        pcm = wav_file.readframes(wav_file.getnframes())

    if channels == 2:
        pcm = audioop.tomono(pcm, sample_width, 0.5, 0.5)
        channels = 1
    elif channels != 1:
        raise ValueError(f"Streaming ASR expects mono or stereo audio, but received {channels} channels.")

    if sample_width != 2:
        pcm = audioop.lin2lin(pcm, sample_width, 2)
        sample_width = 2

    if frame_rate != sample_rate:
        print(f"AUDIO_RESAMPLE: received {frame_rate} Hz, converting to {sample_rate} Hz for ASR streaming.")
        pcm, _ = audioop.ratecv(pcm, sample_width, channels, frame_rate, sample_rate, None)

    chunk_size = int(sample_rate * chunk_ms / 1000) * sample_width
    detector = VADProcessor(chunk_duration_ms=chunk_ms) if vad else None
    speech_states = {"START", "CONTINUE"}

    offset = 0
    while offset < len(pcm):
        piece = pcm[offset:offset + chunk_size]
        offset += chunk_size
        if detector is not None and detector.process_chunk(piece) not in speech_states:
            continue
        yield piece
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser(description="Run a real ASR streaming test with a local WAV file.")
41
+ parser.add_argument("--audio", required=True, help="Path to a 16 kHz mono 16-bit PCM WAV file.")
42
+ parser.add_argument("--lang", default="hi", help="ASR source language code. Default: hi")
43
+ parser.add_argument("--chunk-ms", type=int, default=200, help="Chunk duration in milliseconds. Default: 200")
44
+ parser.add_argument("--timeout", type=int, default=10, help="Receive timeout after sending audio. Default: 10")
45
+ parser.add_argument("--details", action="store_true", help="Print raw backend messages along with the final transcript.")
46
+ parser.add_argument("--vad", action="store_true", help="Send only speech chunks using the same VAD style as the mentor script.")
47
+ parser.add_argument("--service-id", default=None, help="Optional ASR streaming service id override.")
48
+ parser.add_argument("--no-postprocessors", action="store_true", help="Send streaming audio without ITN/punctuation postprocessors.")
49
+ parser.add_argument("--audio-sample-format", default="int16", choices=["int16", "float32", "raw"], help="Input chunk format before SDK sends it. Default converts int16 WAV chunks to mentor-compatible float32 stream bytes.")
50
+ args = parser.parse_args()
51
+
52
+ client = BhashiniClient()
53
+ chunks = list(wav_chunks(args.audio, chunk_ms=args.chunk_ms, vad=args.vad))
54
+ print(f"STREAMING_CHUNKS_SENT: {len(chunks)}")
55
+ result = client.asr_stream_sync(
56
+ chunks,
57
+ args.lang,
58
+ service_id=args.service_id,
59
+ post_processors=[] if args.no_postprocessors else ["itn", "punctuation"],
60
+ receive_timeout=args.timeout,
61
+ chunk_delay_seconds=args.chunk_ms / 1000.0,
62
+ return_details=args.details,
63
+ audio_sample_format=args.audio_sample_format,
64
+ )
65
+ print("ASR_STREAMING_RESULT:")
66
+ if isinstance(result, (dict, list)):
67
+ print(json.dumps(result, indent=2, ensure_ascii=False))
68
+ else:
69
+ print(result)
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
@@ -0,0 +1,118 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+
5
+ from bhashini_client.services.asr_streaming_client import (
6
+ ASRStreamingClient,
7
+ VADProcessor,
8
+ int16_pcm_to_float32_bytes,
9
+ )
10
+
11
+
12
async def run_mic_stream(args):
    """Stream live microphone audio to the ASR websocket and print transcripts.

    Args:
        args: Parsed CLI namespace. The attributes read here are ``lang``,
            ``chunk_ms``, ``no_postprocessors``, ``vad``, ``max_seconds``
            and ``timeout``.

    Returns:
        None. Progress, transcripts and errors are printed to stdout. The
        function returns early (after printing a message) when ``pyaudio`` /
        ``websockets`` cannot be imported or when the streaming client
        reports an ``"Input Error:"`` string.
    """
    # Optional dependencies are imported lazily so the module can be imported
    # (and --help shown) on machines without microphone support.
    try:
        import pyaudio
        import websockets
    except ImportError as exc:
        missing = getattr(exc, "name", None) or str(exc)
        print(f"Missing dependency: {missing}")
        print("Install microphone dependencies first, for example: python -m pip install pyaudio websockets")
        return

    streaming_client = ASRStreamingClient(source_lang=args.lang, chunk_duration_ms=args.chunk_ms)
    websocket_url = streaming_client.get_websocket_url()
    # The client signals validation problems by returning an "Input Error:" string.
    if isinstance(websocket_url, str) and websocket_url.startswith("Input Error:"):
        print(websocket_url)
        return

    start_config = streaming_client.get_start_config(
        source_lang=args.lang,
        post_processors=[] if args.no_postprocessors else ["itn", "punctuation"],
    )
    if isinstance(start_config, str) and start_config.startswith("Input Error:"):
        print(start_config)
        return

    samples_per_chunk = int(streaming_client.sample_rate * args.chunk_ms / 1000)
    chunk_seconds = args.chunk_ms / 1000
    vad = VADProcessor(chunk_duration_ms=args.chunk_ms) if args.vad else None

    chunks_sent = 0
    # Counted separately from chunks_sent: with --vad, silent chunks are read
    # but not sent, and --max-seconds must still bound the recording time.
    chunks_recorded = 0
    stop_event = asyncio.Event()

    async def receive_messages(websocket):
        """Print backend messages; request a stop on final/error/end messages."""
        async for message in websocket:
            response = json.loads(message)
            message_type = response.get("type")
            if message_type == "ready":
                print(f"READY session={response.get('sessionId')} model={response.get('modelName')}")
            elif message_type == "transcript":
                text = (response.get("output") or [{}])[0].get("source", "")
                label = "FINAL" if response.get("isFinal") else "INTERIM"
                print(f"{label}: {text}")
                if response.get("isFinal"):
                    stop_event.set()
            elif message_type == "error":
                print("ERROR:")
                print(response.get("message") or response)
                stop_event.set()
            elif message_type == "end":
                print("END")
                stop_event.set()

    audio = pyaudio.PyAudio()
    stream = None
    try:
        # Opened inside the try block so a failing open (e.g. no input
        # device) still terminates the PyAudio instance in ``finally``.
        stream = audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=streaming_client.sample_rate,
            input=True,
            frames_per_buffer=samples_per_chunk,
        )

        print("MIC_STREAMING_STARTED")
        print("Speak now. Press Ctrl+C to stop.")

        async with websockets.connect(websocket_url) as websocket:
            await websocket.send(json.dumps(start_config))
            receiver_task = asyncio.create_task(receive_messages(websocket))

            # Also stop when the receiver has finished on its own (the server
            # closed the connection); sending further audio would only fail.
            while not stop_event.is_set() and not receiver_task.done():
                chunk = stream.read(samples_per_chunk, exception_on_overflow=False)
                chunks_recorded += 1

                should_send = True
                if vad:
                    state = vad.process_chunk(chunk)
                    should_send = state in {"START", "CONTINUE"}
                if should_send:
                    await websocket.send(int16_pcm_to_float32_bytes(chunk))
                    chunks_sent += 1

                # Yield to the event loop so the receiver task can run while
                # the next chunk is being captured.
                await asyncio.sleep(chunk_seconds)

                # A max_seconds of 0 disables the limit.
                if args.max_seconds and chunks_recorded * args.chunk_ms >= args.max_seconds * 1000:
                    stop_event.set()

            try:
                await websocket.send(json.dumps({"type": "end"}))
            except Exception:
                # Best effort: the connection may already be closed.
                pass

            try:
                await asyncio.wait_for(receiver_task, timeout=args.timeout)
            except asyncio.TimeoutError:
                receiver_task.cancel()
    except (KeyboardInterrupt, asyncio.CancelledError):
        # On Python 3.11+ asyncio.run() delivers Ctrl+C by cancelling the main
        # task, so the interrupt shows up here as CancelledError.
        print("Stopped by user.")
    finally:
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()
        print(f"MIC_CHUNKS_SENT: {chunks_sent}")
103
+
104
+
105
def _positive_int(value):
    """argparse ``type``: parse *value* as an integer strictly greater than zero."""
    number = int(value)
    if number <= 0:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {value!r}")
    return number


def _non_negative_int(value):
    """argparse ``type``: parse *value* as an integer greater than or equal to zero."""
    number = int(value)
    if number < 0:
        raise argparse.ArgumentTypeError(f"expected a non-negative integer, got {value!r}")
    return number


def main():
    """Parse command-line options and run the live microphone streaming demo.

    Numeric options are validated at parse time so that a bad value is
    reported as a usage error (exit status 2) instead of failing later inside
    the audio loop: ``--chunk-ms`` must be positive because the per-chunk
    sample count is derived from it, while ``--timeout`` and ``--max-seconds``
    must not be negative (``--max-seconds 0`` disables the recording limit).
    """
    parser = argparse.ArgumentParser(description="Live microphone ASR streaming demo.")
    parser.add_argument("--lang", default="hi", help="ASR source language code. Default: hi")
    parser.add_argument("--chunk-ms", type=_positive_int, default=200, help="Chunk duration in milliseconds. Default: 200")
    parser.add_argument("--timeout", type=_non_negative_int, default=10, help="Wait time for final backend messages. Default: 10")
    parser.add_argument("--max-seconds", type=_non_negative_int, default=8, help="Maximum seconds to record. Default: 8")
    parser.add_argument("--vad", action="store_true", help="Send only speech chunks using VAD.")
    parser.add_argument("--no-postprocessors", action="store_true", help="Disable ITN/punctuation postprocessors.")
    args = parser.parse_args()
    asyncio.run(run_mic_stream(args))
115
+
116
+
117
+ if __name__ == "__main__":  # start the microphone demo only when executed as a script, not on import
118
+ main()