simulstream-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. docs/source/conf.py +47 -0
  2. simulstream/__init__.py +15 -0
  3. simulstream/client/__init__.py +0 -0
  4. simulstream/client/wav_reader_client.py +228 -0
  5. simulstream/config.py +31 -0
  6. simulstream/inference.py +170 -0
  7. simulstream/metrics/__init__.py +0 -0
  8. simulstream/metrics/detokenizers.py +71 -0
  9. simulstream/metrics/logger.py +32 -0
  10. simulstream/metrics/readers.py +348 -0
  11. simulstream/metrics/score_latency.py +130 -0
  12. simulstream/metrics/score_quality.py +169 -0
  13. simulstream/metrics/scorers/__init__.py +0 -0
  14. simulstream/metrics/scorers/latency/__init__.py +115 -0
  15. simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
  16. simulstream/metrics/scorers/latency/stream_laal.py +119 -0
  17. simulstream/metrics/scorers/quality/__init__.py +132 -0
  18. simulstream/metrics/scorers/quality/comet.py +57 -0
  19. simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
  20. simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
  21. simulstream/metrics/stats.py +184 -0
  22. simulstream/server/__init__.py +0 -0
  23. simulstream/server/http_server.py +95 -0
  24. simulstream/server/message_processor.py +156 -0
  25. simulstream/server/speech_processors/__init__.py +173 -0
  26. simulstream/server/speech_processors/base.py +135 -0
  27. simulstream/server/speech_processors/base_streamatt.py +320 -0
  28. simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
  29. simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
  30. simulstream/server/speech_processors/incremental_output.py +85 -0
  31. simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
  32. simulstream/server/speech_processors/seamless_streamatt.py +268 -0
  33. simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
  34. simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
  35. simulstream/server/speech_processors/vad_wrapper.py +180 -0
  36. simulstream/server/websocket_server.py +236 -0
  37. simulstream-0.1.0.dist-info/METADATA +465 -0
  38. simulstream-0.1.0.dist-info/RECORD +48 -0
  39. simulstream-0.1.0.dist-info/WHEEL +5 -0
  40. simulstream-0.1.0.dist-info/entry_points.txt +8 -0
  41. simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
  42. simulstream-0.1.0.dist-info/top_level.txt +3 -0
  43. uts/__init__.py +0 -0
  44. uts/metrics/__init__.py +0 -0
  45. uts/metrics/log_reader.py +50 -0
  46. uts/speech_processors/__init__.py +0 -0
  47. uts/speech_processors/test_simuleval_wrapper.py +88 -0
  48. uts/utils.py +5 -0
simulstream/server/speech_processors/sliding_window_retranslation.py
@@ -0,0 +1,135 @@
+ # Copyright 2025 FBK
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License
+
+ from difflib import SequenceMatcher
+ from types import SimpleNamespace
+ from typing import List
+
+ import torch
+
+ from simulstream.server.speech_processors import SAMPLE_RATE
+ from simulstream.server.speech_processors.base import BaseSpeechProcessor
+ from simulstream.server.speech_processors.incremental_output import IncrementalOutput
+
+
+ class SlidingWindowRetranslator(BaseSpeechProcessor):
+ """
+ A speech processor that applies a fixed-length sliding window retranslation with
+ deduplication to mitigate overlapping outputs when processing unsegmented audio streams.
+
+ This class implements the algorithm introduced in:
+
+ S. Sen, et al. 2022. *"Simultaneous Translation for Unsegmented Input:
+ A Sliding Window Approach"* (https://arxiv.org/pdf/2210.09754)
+
+ The approach relies on detecting the **longest matching substring** between the current window
+ and the previous one, in order to prevent repeating tokens caused by overlapping audio windows.
+
+ Args:
+ config (SimpleNamespace): Configuration object. The following attributes are expected:
+
+ - **window_len (int)**: Length of the sliding window (in seconds).
+ - **matching_threshold (float, optional)**: Minimum fraction of the current tokens that
+ must match the previous history to be considered aligned. Default = ``0.1``.
+ - **override_on_failed_match (bool, optional)**: If ``True``, the previous history is
+ deleted from the output when no sufficient match is found. Otherwise, previous history
+ is kept and the new output is appended to the end of the previous history.
+ Default = ``False``.
+ - **max_tokens_per_second (int, optional)**: Maximum output tokens allowed per second of
+ audio. Default = ``10``.
+ """
+
+ def __init__(self, config: SimpleNamespace):
+ super().__init__(config)
+ self.window_len = self.config.window_len * SAMPLE_RATE
+ self.matching_threshold = getattr(self.config, "matching_threshold", 0.1)
+ self.override_on_failed_match = getattr(self.config, "override_on_failed_match", False)
+ self.max_tokens_per_second = getattr(self.config, "max_tokens_per_second", 10)
+ self.within_first_window = True
+
+ def _build_incremental_outputs(self, generated_tokens: List[str]) -> IncrementalOutput:
+ """
+ Deduplicates the output stream of overlapping windows using the algorithm introduced in
+ `S. Sen, et al. 2022. "Simultaneous Translation for Unsegmented Input:
+ A Sliding Window Approach" <https://arxiv.org/pdf/2210.09754>`_
+
+ This algorithm is based on the longest matching substring between the current and previous
+ window. We match tokens instead of strings, though, as we have empirically observed
+ that tokenization is mostly consistent across generations of the same word.
+ """
71
+ if self.text_history is None or len(self.text_history) == 0:
72
+ self.text_history = generated_tokens
73
+ generated_string = self.tokens_to_string(generated_tokens)
74
+ return IncrementalOutput(
75
+ new_tokens=generated_tokens,
76
+ new_string=generated_string,
77
+ deleted_tokens=[],
78
+ deleted_string=""
79
+ )
80
+ seq_matcher = SequenceMatcher(None, self.text_history, generated_tokens, autojunk=False)
81
+ longest_match = seq_matcher.find_longest_match()
82
+ if longest_match.size >= self.matching_threshold * len(generated_tokens):
83
+ new_tokens = generated_tokens[longest_match.b + longest_match.size:]
84
+ deleted_tokens = self.text_history[longest_match.a + longest_match.size:]
85
+ new_string = self.tokens_to_string(new_tokens)
86
+ deleted_string = self.tokens_to_string(deleted_tokens)
87
+ # We keep the matching part and the tail of the generated tokens as part of the
+ # history. We then prepend from the old history as many tokens as were discarded
+ # from the start of the current generation, to ensure a sufficiently wide window.
+ matching_and_last_tokens = generated_tokens[longest_match.b:]
+ initial_discarded_tokens = len(generated_tokens) - len(matching_and_last_tokens)
+ history_tokens_discarded = self.text_history[longest_match.a:]
+ history_initial_tokens = len(self.text_history) - len(history_tokens_discarded)
+ new_history_initial_tokens = self.text_history[
+ max(history_initial_tokens - initial_discarded_tokens, 0):history_initial_tokens]
+ self.text_history = new_history_initial_tokens + matching_and_last_tokens
+ else:
+ if self.within_first_window or self.override_on_failed_match:
+ deleted_tokens = self.text_history
+ deleted_string = self.tokens_to_string(self.text_history)
+ self.text_history = None
+ else:
+ deleted_tokens = []
+ deleted_string = ""
+ new_tokens = generated_tokens
+ new_string = self.tokens_to_string(generated_tokens)
+ self.text_history = generated_tokens
+ return IncrementalOutput(
+ new_tokens=new_tokens,
+ new_string=new_string,
+ deleted_tokens=deleted_tokens,
+ deleted_string=deleted_string,
+ )
+
+ def _update_speech_history(
+ self,
+ new_speech: torch.Tensor,
+ generated_tokens: List[str],
+ new_output: IncrementalOutput) -> None:
+ if self.within_first_window and len(self.audio_history) >= self.window_len:
+ self.within_first_window = False
+
+ def _update_text_history(
+ self,
+ new_speech: torch.Tensor,
+ generated_tokens: List[str],
+ new_output: IncrementalOutput) -> None:
+ pass
+
+ def end_of_stream(self) -> IncrementalOutput:
+ return IncrementalOutput([], "", [], "")
+
+ def clear(self) -> None:
+ super().clear()
+ self.within_first_window = True
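
The deduplication step above can be exercised in isolation. Below is a minimal sketch of the longest-match logic from `_build_incremental_outputs`, with the class state stripped away; the helper name `dedup_step` and the sample token lists are illustrative, not part of the package, and the 0.1 threshold mirrors the default documented above.

from difflib import SequenceMatcher
from typing import List, Tuple


def dedup_step(
        history: List[str],
        generated: List[str],
        matching_threshold: float = 0.1) -> Tuple[List[str], List[str]]:
    # Returns (new_tokens, deleted_tokens) for one overlapping window,
    # mirroring the matched branch of _build_incremental_outputs above.
    if not history:
        return generated, []
    match = SequenceMatcher(None, history, generated, autojunk=False).find_longest_match()
    if match.size >= matching_threshold * len(generated):
        # emit only what follows the match in the new window; retract what
        # the previous window emitted after the matching block
        return generated[match.b + match.size:], history[match.a + match.size:]
    # no sufficient overlap: emit everything (override_on_failed_match
    # decides whether the previous output is also retracted)
    return generated, []


# two overlapping windows: the second one re-generates "sat on the mat"
print(dedup_step(
    ["the", "cat", "sat", "on", "the", "mat"],
    ["sat", "on", "the", "mat", "today"]))
# -> (['today'], [])

Like the package code, the no-argument form of `find_longest_match()` requires Python 3.9 or later.
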
simulstream/server/speech_processors/vad_wrapper.py
@@ -0,0 +1,180 @@
+ # Copyright 2025 FBK
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License
+
+ from types import SimpleNamespace
+ from typing import List
+
+ import numpy as np
+ from silero_vad import load_silero_vad, VADIterator
+
+ from simulstream.server.speech_processors import SAMPLE_RATE, SpeechProcessor, \
+ speech_processor_class_load
+ from simulstream.server.speech_processors.incremental_output import merge_incremental_outputs, \
+ IncrementalOutput
+
+
+ class VADWrapperSpeechProcessor(SpeechProcessor):
+ """
+ A speech processor that integrates **Voice Activity Detection (VAD)** to filter and split
+ continuous audio streams into meaningful speech chunks before processing them with an
+ underlying speech processor.
+
+ This class wraps a :class:`SpeechProcessor` implementation (defined in the configuration via
+ the attribute `base_speech_processor_class`) with a Silero VAD-based iterator that detects the
+ start and end of speech segments. Audio outside of speech is ignored, and each detected segment
+ is passed to the underlying speech processor.
+
+ Args:
+ config (SimpleNamespace): Configuration object. The following attributes are used:
+
+ - **base_speech_processor_class (str)**: full name of the underlying speech processor
+ class to use.
+ - **vad_threshold (float, optional)**: VAD probability threshold. Default = ``0.5``.
+ - **vad_min_silence_duration_ms (int, optional)**: Minimum silence duration
+ (milliseconds) to consider the end of a speech segment. Default = ``100``.
+ - **vad_speech_pad_ms (int, optional)**: Padding (milliseconds) to include before and
+ after detected speech. Default = ``30``.
+ - **min_speech_size (int, optional)**: Minimum amount of buffered speech (in seconds)
+ before a partial segment is forwarded to the underlying processor. Default = ``1``.
+ - Any additional attributes required by the subclass :py:attr:`speech_processor_class`.
+ """
+
+ @classmethod
+ def speech_processor_class(cls, config: SimpleNamespace) -> type[SpeechProcessor]:
+ return speech_processor_class_load(config.base_speech_processor_class)
+
+ @classmethod
+ def load_model(cls, config: SimpleNamespace):
+ super().load_model(config)
+ if not hasattr(cls, "vad_model") or cls.vad_model is None:
+ cls.vad_model = load_silero_vad()
+ cls.speech_processor_class(config).load_model(config)
+
+ def __init__(self, config: SimpleNamespace):
+ super().__init__(config)
+ self.speech_processor = self.speech_processor_class(self.config)(self.config)
+ self.min_speech_size = getattr(self.config, "min_speech_size", 1) * SAMPLE_RATE
+ self.vad_iterator = VADIterator(
+ self.vad_model,
+ threshold=getattr(self.config, "vad_threshold", 0.5),
+ sampling_rate=SAMPLE_RATE,
+ min_silence_duration_ms=getattr(self.config, "vad_min_silence_duration_ms", 100),
+ speech_pad_ms=getattr(self.config, "vad_speech_pad_ms", 30),
+ )
+ self.residual_prev_audio = None
+ self.speech_buffer = None
+ self.audio_cursor_position = 0
+ self.in_speech = False
+ assert SAMPLE_RATE == 16000, \
+ "SileroHFSlidingWindowRetranslator supports only 16kHz sampling rate"
81
+ self.window_size_samples = 512 # assuming 16kHz
+ self.previous_audio_chunk = None # needed as VAD uses padding before start
+
+ def clear(self) -> None:
+ super().clear()
+ self.residual_prev_audio = None
+ self.speech_buffer = None
+ self.audio_cursor_position = 0
+ self.in_speech = False
+ self.previous_audio_chunk = None
+ self.vad_iterator.reset_states()
+ self.speech_processor.clear()
+
+ def process_chunk(self, waveform: np.float32) -> IncrementalOutput:
+ if self.residual_prev_audio is not None:
+ waveform = np.concatenate((self.residual_prev_audio, waveform))
+ self.residual_prev_audio = None
+ # we can have more than one generation if there are multiple speech segments in the
+ # current chunk
+ outputs = []
+
+ for i in range(0, len(waveform), self.window_size_samples):
+ chunk = waveform[i: i + self.window_size_samples]
+ if len(chunk) < self.window_size_samples:
+ # process trailing audio with the next waveform chunk
+ self.residual_prev_audio = chunk
+ break
+ speech_dict = self.vad_iterator(chunk, return_seconds=False)
+ if speech_dict:
+ # if a VAD event happens, update the status accordingly
+ assert not ('start' in speech_dict and 'end' in speech_dict)
+ if 'start' in speech_dict:
+ assert not self.in_speech, \
+ "Cannot start a new segment when current one is being processed. " \
+ "This means there is a bug in the implementation."
+ start_offset = self.audio_cursor_position - len(self.previous_audio_chunk)
+ chunk_start_position = speech_dict['start'] - start_offset
+ assert chunk_start_position >= 0
+ self.speech_buffer = np.concatenate(
+ (self.previous_audio_chunk, chunk))[chunk_start_position:]
+ self.in_speech = True
+ if 'end' in speech_dict:
+ assert self.in_speech, \
+ "Cannot end a segment when no current segment is being processed. " \
+ "This means there is a bug in the implementation."
+ speech_buffer_len = len(self.speech_buffer) \
+ if self.speech_buffer is not None else 0
+ speech_buffer_offset = self.audio_cursor_position - speech_buffer_len
+ chunk_end_position = speech_dict['end'] - speech_buffer_offset
+ # In case we already processed more audio than needed (i.e., we already
+ # processed beyond the end detected by the VAD), we skip further processing;
+ # otherwise, we process the remaining unhandled audio.
+ # We can process audio beyond the end because the VAD takes ~100ms (see its
+ # min_silence_duration_ms parameter) to emit the end signal, so if we already
+ # processed the partial speech buffer (see min_speech_size in this processor),
+ # we may have processed up to 100ms of extra audio.
+ if chunk_end_position >= 0:
+ self.append_to_speech_buffer(chunk)
+ self.speech_buffer = self.speech_buffer[:chunk_end_position]
+ outputs.append(self.speech_processor.process_chunk(self.speech_buffer))
+ self.in_speech = False
+ self.speech_buffer = None
+ outputs.append(self.speech_processor.end_of_stream())
+ # reset history at the end of a segment
+ if hasattr(self.speech_processor, 'text_history'):
+ self.speech_processor.text_history = None
+ if hasattr(self.speech_processor, 'audio_history'):
+ self.speech_processor.audio_history = None
+ else:
+ # if no VAD event happens, we just ignore the audio if we are outside speech and
+ # update the buffer in case we are in speech
+ if self.in_speech:
+ self.append_to_speech_buffer(chunk)
+ # update cursor position
+ self.audio_cursor_position += self.window_size_samples
+ self.previous_audio_chunk = chunk
+
+ if self.in_speech and len(self.speech_buffer) > self.min_speech_size:
+ outputs.append(self.speech_processor.process_chunk(self.speech_buffer))
+ self.speech_buffer = None
+
+ return merge_incremental_outputs(outputs, self.tokens_to_string)
+
+ def append_to_speech_buffer(self, audio_chunk: np.float32) -> None:
+ if self.speech_buffer is None:
+ self.speech_buffer = audio_chunk
+ else:
+ self.speech_buffer = np.concatenate((self.speech_buffer, audio_chunk))
+
+ def set_source_language(self, language: str) -> None:
+ self.speech_processor.set_source_language(language)
+
+ def set_target_language(self, language: str) -> None:
+ self.speech_processor.set_target_language(language)
+
+ def tokens_to_string(self, tokens: List[str]) -> str:
+ return self.speech_processor.tokens_to_string(tokens)
+
+ def end_of_stream(self) -> IncrementalOutput:
+ return self.speech_processor.end_of_stream()
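
The 512-sample windowing that `process_chunk` relies on can be reproduced standalone with the same silero-vad API used above. A minimal sketch, assuming the `silero-vad` package is installed; the silent waveform is a stand-in for real client audio (so no start/end events fire), and the parameter values mirror the documented defaults.

import numpy as np
from silero_vad import load_silero_vad, VADIterator

SAMPLE_RATE = 16000  # the wrapper asserts a 16kHz stream
WINDOW_SIZE = 512    # fixed Silero window size at 16kHz

vad = VADIterator(
    load_silero_vad(),
    threshold=0.5,
    sampling_rate=SAMPLE_RATE,
    min_silence_duration_ms=100,
    speech_pad_ms=30,
)

waveform = np.zeros(SAMPLE_RATE, dtype=np.float32)  # one second of silence
for i in range(0, len(waveform) - WINDOW_SIZE + 1, WINDOW_SIZE):
    # each call returns None, {'start': sample}, or {'end': sample}
    event = vad(waveform[i:i + WINDOW_SIZE], return_seconds=False)
    if event:
        print(event)
vad.reset_states()  # the wrapper does the same in clear()
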
simulstream/server/websocket_server.py
@@ -0,0 +1,236 @@
+ # Copyright 2025 FBK
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License
+
+ import argparse
+ import asyncio
+ import logging
+ import time
+ from contextlib import asynccontextmanager
+ from types import SimpleNamespace
+ from typing import Callable, Awaitable
+
+ import websockets
+ from websockets.asyncio.server import serve, ServerConnection
+ import json
+
+ import simulstream
+ from simulstream.config import yaml_config
+ from simulstream.metrics.logger import setup_metrics_logger, METRICS_LOGGER
+ from simulstream.server.message_processor import MessageProcessor
+ from simulstream.server.speech_processors import build_speech_processor
+
+
+ logging.basicConfig(
+ format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S',
+ level=logging.INFO,
+ )
+ LOGGER = logging.getLogger('simulstream.websocket_server')
+
+
+ class SpeechProcessorPool:
+ """
+ A pool of speech processors initialized at startup and made available to clients.
+
+ Args:
+ speech_processor_config (SimpleNamespace): configuration for the speech processors to
+ create.
+ size (int): number of speech processors to have in the pool.
+ acquire_timeout (int): timeout (in seconds) to wait for an available speech
+ processor.
+ """
53
+ def __init__(
54
+ self,
55
+ speech_processor_config: SimpleNamespace,
56
+ size: int,
57
+ acquire_timeout: int):
58
+ self.size = size
59
+ self.acquire_timeout = acquire_timeout
60
+ self.available = asyncio.Queue(maxsize=size)
61
+ for _ in range(size):
62
+ self.available.put_nowait(build_speech_processor(speech_processor_config))
63
+
64
+ @asynccontextmanager
65
+ async def acquire(self):
66
+ """
67
+ Acquire one speech processor from the pool and release it automatically on exit.
+
+ Returns:
+ SpeechProcessor: a speech processor available for usage.
+
+ Raises:
+ TimeoutError: if no speech processor is available within the configured timeout.
+ """
+ speech_processor = await asyncio.wait_for(
+ self.available.get(), timeout=self.acquire_timeout)
+ try:
+ yield speech_processor
+ finally:
+ # Return the speech processor to the pool
+ self.available.put_nowait(speech_processor)
+
+
+ def connection_handler_factory(
+ speech_processor_pool: SpeechProcessorPool
+ ) -> Callable[[ServerConnection], Awaitable[None]]:
+ """
+ Factory function that creates a connection handler for the WebSocket server.
+
+ The returned connection handler routine will process audio and metadata messages sent by a
+ single client over WebSocket.
+
+ The handler receives client data (raw audio chunks and textual metadata) and forwards it to a
+ message processor using a speech processor retrieved from the pool of available ones.
+ If no speech processor is available within the configured timeout, the client connection is
+ closed.
+ The handler also sends incremental processing results back to the client in JSON format.
+
+ Args:
+ speech_processor_pool (SpeechProcessorPool): Pool of speech processors to use to handle
+ client connections.
+
+ Returns:
+ Callable[[websockets.asyncio.server.ServerConnection], Awaitable[None]]: An asynchronous
+ WebSocket connection handler coroutine.
+ """
+
+ async def handle_connection(websocket: ServerConnection) -> None:
+ """
+ Handles a single client WebSocket connection.
+
+ This is the coroutine that processes incoming messages from a client:
+
+ - If the message is binary (``bytes``), it is interpreted as raw audio data and
+ buffered until a full chunk is ready for processing.
+ - If the message is text (``str``), it is parsed as JSON metadata and can:
+
+ - Set the input sample rate.
+ - Set source and target languages for translation.
+ - Log custom metadata to the metrics logger.
+ - Indicate the end of the audio stream.
+
+ At the end of the stream, any remaining audio is processed, the processor state is cleared,
+ and an ``end_of_processing`` message is sent to the client.
+
+ Args:
+ websocket (websockets.asyncio.server.ServerConnection): The WebSocket connection for
+ the client.
+ """
+ loop = asyncio.get_running_loop()
+ client_id = id(websocket)
+ LOGGER.info(f"Client {client_id} connected")
+ try:
+ async with speech_processor_pool.acquire() as speech_processor:
+ message_processor = MessageProcessor(client_id, speech_processor)
+
+ try:
+ async for message in websocket:
+ if isinstance(message, bytes):
+ # in this case we are processing an audio chunk
+ incremental_output = await loop.run_in_executor(
+ None, message_processor.process_speech, message)
+ if incremental_output is not None:
+ await websocket.send(incremental_output.strings_to_json())
+ elif isinstance(message, str):
+ # textual messages are used to handle metadata
+ try:
+ data = json.loads(message)
+ if 'end_of_stream' in data:
+ incremental_output = await loop.run_in_executor(
+ None, message_processor.end_of_stream)
+ await websocket.send(incremental_output.strings_to_json())
+ await websocket.send(json.dumps({'end_of_processing': True}))
+ else:
+ message_processor.process_metadata(data)
+ except Exception as e:
+ LOGGER.error(
+ f"Invalid string message: {message}. Error: {e}. Ignoring it.")
+ except websockets.exceptions.ConnectionClosed:
+ LOGGER.info(f"Client {client_id} disconnected.")
+ except Exception as e:
+ LOGGER.exception(f"Error: {e}")
+ finally:
+ message_processor.clear()
+ except TimeoutError:
+ LOGGER.error(f"Timeout waiting for a new processor for client {client_id}")
+ await websocket.close()
+
+ return handle_connection
+
+
+ async def main(args: argparse.Namespace):
+ """
+ Main entry point for running the WebSocket speech server.
+
+ This function loads the server and speech processor configurations from YAML,
+ initializes logging (including metrics logging), and starts the WebSocket server
+ on the configured host and port.
+
+ Args:
+ args (argparse.Namespace): parsed command-line arguments with configuration file paths.
+ """
+ LOGGER.info(f"Loading server configuration from {args.server_config}")
+ server_config = yaml_config(args.server_config)
+ LOGGER.info(
+ f"Metric logging is{'' if server_config.metrics.enabled else ' NOT'} enabled at "
+ f"{server_config.metrics.filename}")
+ setup_metrics_logger(server_config.metrics)
+ LOGGER.info(f"Loading speech processor from {args.speech_processor_config}")
+ speech_processor_config = yaml_config(args.speech_processor_config)
+ LOGGER.info(f"Using as speech processor: {speech_processor_config.type}")
+ speech_processor_loading_time = time.time()
+ speech_processors_pool = SpeechProcessorPool(
+ speech_processor_config, server_config.pool_size, server_config.acquire_timeout)
+ speech_processor_loading_time = time.time() - speech_processor_loading_time
+ LOGGER.info(f"Loaded speech processor in {speech_processor_loading_time:.3f} seconds")
+ METRICS_LOGGER.info(json.dumps({
+ "model_loading_time": speech_processor_loading_time,
+ }))
+ LOGGER.info(f"Serving websocket server at {server_config.hostname}:{server_config.port}")
+ async with serve(
+ connection_handler_factory(speech_processors_pool),
+ server_config.hostname,
+ server_config.port,
+ ping_timeout=None) as server:
+ await server.serve_forever()
+
+
+ def cli_main():
+ """
+ Simulstream WebSocket server command-line interface (CLI) entry point.
+
+ This function parses command-line arguments and starts the asynchronous :func:`main` routine.
+
+ Example usage::
+
+ $ python websocket_server.py --server-config config/server.yaml \\
+ --speech-processor-config config/speech.yaml
+
+ Command-line arguments:
+
+ - ``--server-config`` (str, optional): Path to the server configuration file
+ (default: ``config/server.yaml``).
+ - ``--speech-processor-config`` (str, required): Path to the speech processor configuration
+ file.
+ """
+ LOGGER.info(f"Websocket server version: {simulstream.__version__}")
+ parser = argparse.ArgumentParser("websocket_simul_server")
+ parser.add_argument("--server-config", type=str, default="config/server.yaml")
+ parser.add_argument("--speech-processor-config", type=str, required=True)
+ args = parser.parse_args()
+ asyncio.run(main(args))
+
+
+ if __name__ == "__main__":
+ cli_main()
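
The message flow handled above implies a simple client protocol: binary frames carry raw audio, a JSON text frame with `end_of_stream` flushes the stream, and the server answers with incremental outputs followed by an `end_of_processing` message. A minimal client sketch, assuming a server listening at `ws://localhost:8765` and audio already available as raw bytes; the URI, chunk size, and file name are illustrative, and other metadata fields (sample rate, languages) go through `MessageProcessor.process_metadata`, whose schema is not part of this diff.

import asyncio
import json

import websockets  # same library used by the server


async def stream_audio(uri: str, pcm_bytes: bytes, chunk_size: int = 32000) -> None:
    async with websockets.connect(uri) as ws:
        # binary frames are interpreted by the server as raw audio data
        for i in range(0, len(pcm_bytes), chunk_size):
            await ws.send(pcm_bytes[i:i + chunk_size])
        # a text frame with 'end_of_stream' triggers final processing
        await ws.send(json.dumps({"end_of_stream": True}))
        async for message in ws:
            data = json.loads(message)
            if data.get("end_of_processing"):
                break
            print("incremental output:", data)


# asyncio.run(stream_audio("ws://localhost:8765", open("audio.raw", "rb").read()))
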