speechmatics-rt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,51 @@
1
+ __version__ = "0.1.0"
2
+
3
+ from ._async_client import AsyncClient
4
+ from ._events import EventEmitter
5
+ from ._exceptions import AudioError
6
+ from ._exceptions import AuthenticationError
7
+ from ._exceptions import ConfigurationError
8
+ from ._exceptions import ConnectionError
9
+ from ._exceptions import EndOfTranscriptError
10
+ from ._exceptions import ForceEndSession
11
+ from ._exceptions import SessionError
12
+ from ._exceptions import TimeoutError
13
+ from ._exceptions import TranscriptionError
14
+ from ._exceptions import TransportError
15
+ from ._models import AudioEncoding
16
+ from ._models import AudioEventsConfig
17
+ from ._models import AudioFormat
18
+ from ._models import ClientMessageType
19
+ from ._models import ConnectionConfig
20
+ from ._models import OperatingPoint
21
+ from ._models import ServerMessageType
22
+ from ._models import SessionInfo
23
+ from ._models import TranscriptionConfig
24
+ from ._models import TranscriptResult
25
+ from ._models import TranslationConfig
26
+
27
+ __all__ = [
28
+ "AsyncClient",
29
+ "EventEmitter",
30
+ "AudioFormat",
31
+ "AudioEventsConfig",
32
+ "TranscriptionConfig",
33
+ "TranslationConfig",
34
+ "ConnectionConfig",
35
+ "SessionInfo",
36
+ "TranscriptResult",
37
+ "AudioEncoding",
38
+ "ClientMessageType",
39
+ "ServerMessageType",
40
+ "OperatingPoint",
41
+ "ConfigurationError",
42
+ "AuthenticationError",
43
+ "ConnectionError",
44
+ "TransportError",
45
+ "TranscriptionError",
46
+ "AudioError",
47
+ "SessionError",
48
+ "TimeoutError",
49
+ "EndOfTranscriptError",
50
+ "ForceEndSession",
51
+ ]
@@ -0,0 +1,542 @@
1
+ """
2
+ Asynchronous client for Speechmatics real-time transcription.
3
+
4
+ This module provides the main AsyncClient class that handles real-time
5
+ speech-to-text transcription using the Speechmatics RT API.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import os
12
+ import uuid
13
+ from typing import Any
14
+ from typing import BinaryIO
15
+ from typing import Optional
16
+
17
+ from ._events import EventEmitter
18
+ from ._exceptions import AudioError
19
+ from ._exceptions import ConfigurationError
20
+ from ._exceptions import EndOfTranscriptError
21
+ from ._exceptions import ForceEndSession
22
+ from ._exceptions import SessionError
23
+ from ._exceptions import TimeoutError
24
+ from ._exceptions import TranscriptionError
25
+ from ._helpers import read_audio_chunks
26
+ from ._logging import get_logger
27
+ from ._models import AudioEventsConfig
28
+ from ._models import AudioFormat
29
+ from ._models import ClientMessageType
30
+ from ._models import ConnectionConfig
31
+ from ._models import ServerMessageType
32
+ from ._models import SessionInfo
33
+ from ._models import TranscriptionConfig
34
+ from ._models import TranslationConfig
35
+ from ._transport import Transport
36
+
37
+
38
class AsyncClient(EventEmitter):
    """
    Asynchronous client for Speechmatics real-time audio transcription.

    The client opens a WebSocket session through ``Transport``, streams audio
    read from a binary stream, and re-emits every recognised server message as
    an event (see ``EventEmitter``) so callers can attach handlers.

    Args:
        api_key: Speechmatics API key for authentication. If not provided,
            uses the SPEECHMATICS_API_KEY environment variable.
        url: WebSocket endpoint URL. If not provided, uses the
            SPEECHMATICS_RT_URL environment variable or defaults to
            ``wss://eu2.rt.speechmatics.com/v2``.
        conn_config: Complete connection configuration object. If provided,
            api_key and url are ignored.

    Raises:
        ConfigurationError: If no conn_config is given and no API key can be
            found in the arguments or the environment.

    Examples:
        Basic usage with event handlers:
            >>> async with AsyncClient(api_key="your-key") as client:
            ...     @client.on(ServerMessageType.ADD_TRANSCRIPT)
            ...     def handle_transcript(message):
            ...         result = TranscriptResult.from_message(message)
            ...         print(f"Final: {result.transcript}")
            ...
            ...     with open("audio.wav", "rb") as audio:
            ...         await client.transcribe(audio)

        With custom configuration:
            >>> config = ConnectionConfig(
            ...     url="wss://eu2.rt.speechmatics.com/v2",
            ...     api_key="your-key",
            ... )
            >>> async with AsyncClient(conn_config=config) as client:
            ...     # Use client with custom settings
            ...     pass

        Manual resource management:
            >>> client = AsyncClient(api_key="your-key")
            >>> try:
            ...     await client.transcribe(audio_stream)
            ... finally:
            ...     await client.close()
    """

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
        conn_config: Optional[ConnectionConfig] = None,
    ) -> None:
        """
        Initialize the AsyncClient.

        Args:
            api_key: Speechmatics API key. If None, uses SPEECHMATICS_API_KEY env var.
            url: WebSocket endpoint URL. If None, uses SPEECHMATICS_RT_URL env var
                or defaults to the EU endpoint.
            conn_config: Complete connection configuration. Takes precedence over
                api_key and url.

        Raises:
            ConfigurationError: If API key is not provided and not found in environment.
        """
        super().__init__()

        if conn_config is not None:
            self._conn_config = conn_config
        else:
            api_key = api_key or os.environ.get("SPEECHMATICS_API_KEY")
            if not api_key:
                raise ConfigurationError("API key required: provide api_key parameter or set SPEECHMATICS_API_KEY")

            # Chain with `or` so an empty SPEECHMATICS_RT_URL also falls back to the default.
            url = url or os.environ.get("SPEECHMATICS_RT_URL") or "wss://eu2.rt.speechmatics.com/v2"
            self._conn_config = ConnectionConfig(url=url, api_key=api_key)

        self._session = SessionInfo(request_id=str(uuid.uuid4()))
        self._transport = Transport(self._conn_config, self._session.request_id)
        self._logger = get_logger(__name__)
        self._recognition_started = asyncio.Event()
        self._seq_no = 0
        # Must exist before transcribe() runs: close() reads it and may be
        # called on a client that never started a session.
        self._end_of_stream_sent = False

        self._logger.debug("AsyncClient initialized with request_id=%s", self._session.request_id)

    async def __aenter__(self) -> AsyncClient:
        """
        Async context manager entry.

        Returns:
            Self for use in async with statements.

        Examples:
            >>> async with AsyncClient(api_key="key") as client:
            ...     await client.transcribe(audio_stream)
        """
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """
        Async context manager exit; delegates all cleanup to close().

        Returns None, so an exception raised inside the ``async with`` body
        is never suppressed.

        Args:
            exc_type: Exception type if an exception occurred.
            exc_val: Exception value if an exception occurred.
            exc_tb: Exception traceback if an exception occurred.
        """
        await self.close()

    async def transcribe(
        self,
        audio_stream: BinaryIO,
        *,
        transcription_config: Optional[TranscriptionConfig] = None,
        audio_format: Optional[AudioFormat] = None,
        translation_config: Optional[TranslationConfig] = None,
        audio_events_config: Optional[AudioEventsConfig] = None,
        ws_headers: Optional[dict] = None,
        timeout: Optional[float] = None,
    ) -> None:
        """
        Transcribe an audio stream, delivering results through event handlers.

        Workflow:
        1. Validate input and reset per-session state
        2. Establish the WebSocket connection
        3. Send StartRecognition
        4. Stream audio while concurrently receiving server messages
        5. Return once the server sends EndOfTranscript (or a handler raises
           ForceEndSession)

        Args:
            audio_stream: Audio data source with a read() method (file object,
                BytesIO, ...).
            transcription_config: Transcription settings. ``TranscriptionConfig()``
                is used if not provided.
            audio_format: Audio format specification including chunk size.
                ``AudioFormat()`` is used if not provided.
            translation_config: Optional translation settings; sent in
                StartRecognition only when provided.
            audio_events_config: Optional audio-events settings; sent in
                StartRecognition only when provided.
            ws_headers: Additional HTTP headers passed to the transport's connect().
            timeout: Maximum time in seconds for the whole transcription. If None,
                no overall time limit is applied.

        Raises:
            AudioError: If audio_stream is None or reading/sending audio fails.
            SessionError: If receiving or processing server messages fails.
            TimeoutError: If the operation exceeds the specified timeout.
            TranscriptionError: If the server reports an error.

            Errors raised by the transport while connecting or sending
            StartRecognition propagate unchanged.

        Examples:
            Basic transcription:
                >>> async with AsyncClient(api_key="key") as client:
                ...     @client.on(ServerMessageType.ADD_TRANSCRIPT)
                ...     def handle_result(message):
                ...         result = TranscriptResult.from_message(message)
                ...         print(result.transcript)
                ...
                ...     with open("audio.wav", "rb") as audio:
                ...         await client.transcribe(audio)

            With custom configuration:
                >>> config = TranscriptionConfig(
                ...     language="es",
                ...     enable_partials=True,
                ...     enable_entities=True
                ... )
                >>> audio_format = AudioFormat(
                ...     encoding=AudioEncoding.PCM_S16LE,
                ...     sample_rate=16000
                ... )
                >>>
                >>> await client.transcribe(
                ...     audio_stream,
                ...     transcription_config=config,
                ...     audio_format=audio_format,
                ...     timeout=300.0
                ... )
        """
        # Identity check: a valid stream object must not be rejected merely
        # because it happens to be falsy.
        if audio_stream is None:
            raise AudioError("Audio stream cannot be None")

        transcription_config = transcription_config or TranscriptionConfig()
        audio_format = audio_format or AudioFormat()

        # Reset per-session state so the client can be reused for another run.
        self._session.is_running = False
        self._recognition_started.clear()
        self._end_of_stream_sent = False
        self._seq_no = 0

        self._logger.debug(
            "Starting transcription (transcription_config=%s, audio_format=%s)",
            transcription_config.to_dict(),
            audio_format.to_dict(),
        )

        try:
            await asyncio.wait_for(
                self._run_transcription(
                    audio_stream,
                    transcription_config,
                    audio_format,
                    translation_config,
                    audio_events_config,
                    ws_headers,
                ),
                timeout=timeout,
            )
        except asyncio.TimeoutError as exc:
            raise TimeoutError(f"Transcription timed out after {timeout} seconds") from exc
        except (EndOfTranscriptError, ForceEndSession):
            # Control-flow signals for normal completion, not failures.
            pass
        finally:
            self._session.is_running = False

    async def close(self) -> None:
        """
        Close the client and clean up all resources.

        Best-effort and never raises from the individual cleanup steps:
        - Marks the session as not running
        - Sends EndOfStream if it has not been sent yet
        - Closes the transport
        - Removes all registered event listeners

        Examples:
            >>> client = AsyncClient(api_key="key")
            >>> try:
            ...     await client.transcribe(audio_stream)
            ... finally:
            ...     await client.close()
        """
        self._session.is_running = False

        try:
            await self._send_end_of_stream()
        except Exception as exc:
            # Expected when the connection was never opened or is already gone.
            self._logger.debug("Ignoring error while sending EndOfStream during close: %s", exc)

        try:
            await self._transport.close()
        except Exception as exc:
            self._logger.debug("Ignoring error while closing transport: %s", exc)

        self.remove_all_listeners()

    @staticmethod
    def _task_error(task: asyncio.Future) -> Optional[BaseException]:
        """Return the exception a finished task raised, or None if it succeeded or was cancelled."""
        if task.cancelled():
            return None
        return task.exception()

    async def _run_transcription(
        self,
        audio_stream: BinaryIO,
        transcription_config: TranscriptionConfig,
        audio_format: AudioFormat,
        translation_config: Optional[TranslationConfig],
        audio_events_config: Optional[AudioEventsConfig],
        ws_headers: Optional[dict],
    ) -> None:
        """
        Connect, start recognition, then run the producer and consumer together.

        The audio producer and the message consumer run as two tasks. The
        session is over as soon as the consumer finishes (for any reason) or
        either task fails; whatever is still running is cancelled and the
        failure (or the EndOfTranscriptError / ForceEndSession signal) is
        re-raised to the caller.

        Args:
            audio_stream: Audio data source to transcribe.
            transcription_config: Transcription configuration settings.
            audio_format: Audio format specification.
            translation_config: Optional translation configuration.
            audio_events_config: Optional audio events configuration.
            ws_headers: Additional WebSocket headers.

        Raises:
            AudioError: If the audio producer fails.
            SessionError: If the message consumer fails.
            TranscriptionError: If the server reports an error.
            EndOfTranscriptError: When the server signals normal completion.
            ForceEndSession: If an event handler requests early termination.
        """
        self._logger.debug("Establishing WebSocket connection")
        await self._transport.connect(ws_headers)

        self._logger.debug("Starting recognition session")
        await self._start_recognition(
            transcription_config,
            audio_format,
            translation_config,
            audio_events_config,
        )

        producer = asyncio.ensure_future(self._audio_producer(audio_stream, audio_format))
        consumer = asyncio.ensure_future(self._message_consumer())
        pending = {producer, consumer}

        try:
            # A successful producer just means all audio was sent; keep
            # waiting for the consumer to see EndOfTranscript.
            while consumer in pending:
                done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
                if any(self._task_error(task) is not None for task in done):
                    break
        finally:
            # Also runs when this coroutine is cancelled (e.g. by the
            # wait_for timeout in transcribe), so no task is left behind.
            # In particular, a producer still blocked on _recognition_started
            # must not be left waiting forever once the consumer is gone.
            stragglers = [task for task in (producer, consumer) if not task.done()]
            for task in stragglers:
                task.cancel()
            if stragglers:
                await asyncio.gather(*stragglers, return_exceptions=True)

        # Consumer first: a server-side reason is more informative than the
        # send failure it typically causes on the producer side.
        for task in (consumer, producer):
            error = self._task_error(task)
            if error is not None:
                raise error

    async def _start_recognition(
        self,
        transcription_config: TranscriptionConfig,
        audio_format: AudioFormat,
        translation_config: Optional[TranslationConfig] = None,
        audio_events_config: Optional[AudioEventsConfig] = None,
    ) -> None:
        """
        Send the StartRecognition message and mark the session as running.

        Args:
            transcription_config: Configuration for transcription behavior.
            audio_format: Audio format specification for the session.
            translation_config: Optional configuration for translation; included
                in the message only when provided.
            audio_events_config: Optional configuration for audio events; included
                in the message only when provided.

        Raises:
            Any error raised by the transport's send_message() propagates unchanged.
        """
        start_message = {
            "message": ClientMessageType.START_RECOGNITION,
            "audio_format": audio_format.to_dict(),
            "transcription_config": transcription_config.to_dict(),
        }

        if translation_config:
            start_message["translation_config"] = translation_config.to_dict()

        if audio_events_config:
            start_message["audio_events_config"] = audio_events_config.to_dict()

        self._logger.debug("Sending StartRecognition message for language=%s", transcription_config.language)
        await self._transport.send_message(start_message)
        self._session.is_running = True

    async def _audio_producer(self, audio_stream: BinaryIO, audio_format: AudioFormat) -> None:
        """
        Stream audio chunks to the server, then send EndOfStream.

        Waits until RecognitionStarted has been received, then reads chunks of
        ``audio_format.chunk_size`` from the stream and sends each one. The loop
        stops early if the session is no longer running.

        Args:
            audio_stream: Audio data source with read() method.
            audio_format: Audio format specification including chunk size.

        Raises:
            AudioError: If audio reading or sending fails.
        """
        import time

        await self._recognition_started.wait()
        self._logger.debug("Recognition started, beginning audio streaming (chunk_size=%d)", audio_format.chunk_size)

        try:
            chunk_count = 0
            # Monotonic clock: interval measurement must not be affected by
            # wall-clock adjustments.
            last_log_time = time.monotonic()

            async for chunk in read_audio_chunks(audio_stream, audio_format.chunk_size):
                if not self._session.is_running:
                    break

                await self._transport.send_message(chunk)
                # Count only chunks that were actually sent, so last_seq_no in
                # EndOfStream never includes a failed send.
                self._seq_no += 1
                chunk_count += 1

                # Log progress every 5 seconds
                current_time = time.monotonic()
                if current_time - last_log_time >= 5.0:
                    self._logger.debug("Audio streaming progress (chunks=%d, seq_no=%d)", chunk_count, self._seq_no)
                    last_log_time = current_time

            self._logger.debug("Audio streaming complete (%d chunks total)", chunk_count)
            await self._send_end_of_stream()

        except Exception as e:
            self._logger.error("Audio sender error: %s", e)
            self._session.is_running = False
            raise AudioError(f"Failed to send audio: {e}") from e

    async def _message_consumer(self) -> None:
        """
        Receive and dispatch server messages while the session is running.

        Each receive is bounded to one second so the loop re-checks
        ``is_running`` regularly instead of blocking indefinitely.

        Raises:
            EndOfTranscriptError: When the server signals normal completion.
            ForceEndSession: If an event handler requests early termination.
            TranscriptionError: If the server reports an error.
            SessionError: For any other failure while receiving or handling
                a message.
        """
        try:
            while self._session.is_running:
                try:
                    message = await asyncio.wait_for(
                        self._transport.receive_message(),
                        timeout=1.0,
                    )
                except asyncio.TimeoutError:
                    # Nothing arrived within the poll interval; re-check is_running.
                    continue
                await self._handle_message(message)

        except (EndOfTranscriptError, ForceEndSession):
            # These are expected control flow exceptions, not errors
            # EndOfTranscriptError signals normal completion
            # ForceEndSession signals user-requested early termination
            self._session.is_running = False
            raise

        except TranscriptionError:
            # Already logged by _handle_message; keep the specific type so
            # callers can tell server-reported errors from transport problems.
            self._session.is_running = False
            raise

        except Exception as e:
            self._logger.error("Message receiver error: %s", e)
            self._session.is_running = False
            raise SessionError(f"Message receiver error: {e}") from e

    async def _handle_message(self, message: dict[str, Any]) -> None:
        """
        Process one incoming server message.

        Messages without a "message" field, or with an unrecognised type, are
        ignored (the latter with a warning). Otherwise:
        - RecognitionStarted: stores the session id and unblocks the audio producer
        - EndOfTranscript: stops the session and raises EndOfTranscriptError
        - Warning: logged
        - Error: stops the session, logged, raises TranscriptionError
        RecognitionStarted, Warning and every other recognised type are then
        emitted as events to registered handlers.

        Args:
            message: The message dictionary received from the server.

        Raises:
            TranscriptionError: When the server sends an Error message.
            EndOfTranscriptError: When transcription completes normally.
            ForceEndSession: If an event handler requests early termination.
        """
        message_type = message.get("message")

        if not message_type:
            return

        try:
            server_msg_type = ServerMessageType(message_type)
        except ValueError:
            self._logger.warning("Unknown message type: %s", message_type)
            return

        if server_msg_type == ServerMessageType.RECOGNITION_STARTED:
            self._session.session_id = message.get("id")
            self._recognition_started.set()

        elif server_msg_type == ServerMessageType.END_OF_TRANSCRIPT:
            self._session.is_running = False
            raise EndOfTranscriptError("Transcription completed")

        elif server_msg_type == ServerMessageType.WARNING:
            self._logger.warning(
                "Session warning (session_id=%s): %s", self._session.session_id, message.get("reason", "Unknown")
            )

        elif server_msg_type == ServerMessageType.ERROR:
            reason = message.get("reason", "Unknown error")
            self._session.is_running = False
            self._logger.error(
                "Transcription error (session_id=%s): %s",
                self._session.session_id,
                reason,
            )
            raise TranscriptionError(reason)

        try:
            self.emit(server_msg_type, message)
        except ForceEndSession:
            self._logger.warning(
                "Session was ended forcefully by an event handler (session_id=%s)", self._session.session_id
            )
            self._session.is_running = False
            raise
        except Exception as e:
            # A faulty user handler must not tear down the session.
            self._logger.warning("Event handler error: %s", e)

    async def _send_end_of_stream(self) -> None:
        """
        Send the EndOfStream message once per session.

        Tells the server that no more audio will follow and reports the number
        of audio chunks sent as ``last_seq_no``. Does nothing if EndOfStream
        was already sent for the current session.

        Raises:
            Any error raised by the transport's send_message() propagates unchanged.
        """
        if self._end_of_stream_sent:
            return

        end_message = {
            "message": ClientMessageType.END_OF_STREAM,
            "last_seq_no": self._seq_no,
        }
        self._logger.debug("Sending EndOfStream message (last_seq_no=%d)", self._seq_no)
        await self._transport.send_message(end_message)
        self._end_of_stream_sent = True