simulstream-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +47 -0
- simulstream/__init__.py +15 -0
- simulstream/client/__init__.py +0 -0
- simulstream/client/wav_reader_client.py +228 -0
- simulstream/config.py +31 -0
- simulstream/inference.py +170 -0
- simulstream/metrics/__init__.py +0 -0
- simulstream/metrics/detokenizers.py +71 -0
- simulstream/metrics/logger.py +32 -0
- simulstream/metrics/readers.py +348 -0
- simulstream/metrics/score_latency.py +130 -0
- simulstream/metrics/score_quality.py +169 -0
- simulstream/metrics/scorers/__init__.py +0 -0
- simulstream/metrics/scorers/latency/__init__.py +115 -0
- simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
- simulstream/metrics/scorers/latency/stream_laal.py +119 -0
- simulstream/metrics/scorers/quality/__init__.py +132 -0
- simulstream/metrics/scorers/quality/comet.py +57 -0
- simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
- simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
- simulstream/metrics/stats.py +184 -0
- simulstream/server/__init__.py +0 -0
- simulstream/server/http_server.py +95 -0
- simulstream/server/message_processor.py +156 -0
- simulstream/server/speech_processors/__init__.py +173 -0
- simulstream/server/speech_processors/base.py +135 -0
- simulstream/server/speech_processors/base_streamatt.py +320 -0
- simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
- simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
- simulstream/server/speech_processors/incremental_output.py +85 -0
- simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
- simulstream/server/speech_processors/seamless_streamatt.py +268 -0
- simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
- simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
- simulstream/server/speech_processors/vad_wrapper.py +180 -0
- simulstream/server/websocket_server.py +236 -0
- simulstream-0.1.0.dist-info/METADATA +465 -0
- simulstream-0.1.0.dist-info/RECORD +48 -0
- simulstream-0.1.0.dist-info/WHEEL +5 -0
- simulstream-0.1.0.dist-info/entry_points.txt +8 -0
- simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
- simulstream-0.1.0.dist-info/top_level.txt +3 -0
- uts/__init__.py +0 -0
- uts/metrics/__init__.py +0 -0
- uts/metrics/log_reader.py +50 -0
- uts/speech_processors/__init__.py +0 -0
- uts/speech_processors/test_simuleval_wrapper.py +88 -0
- uts/utils.py +5 -0

simulstream/server/speech_processors/sliding_window_retranslation.py
@@ -0,0 +1,135 @@
# Copyright 2025 FBK

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

from difflib import SequenceMatcher
from types import SimpleNamespace
from typing import List

import torch

from simulstream.server.speech_processors import SAMPLE_RATE
from simulstream.server.speech_processors.base import BaseSpeechProcessor
from simulstream.server.speech_processors.incremental_output import IncrementalOutput

class SlidingWindowRetranslator(BaseSpeechProcessor):
    """
    A speech processor that applies fixed-length sliding-window retranslation with
    deduplication to mitigate overlapping outputs when processing unsegmented audio streams.

    This class implements the algorithm introduced in:

    S. Sen, et al. 2025. *"Simultaneous Translation for Unsegmented Input:
    A Sliding Window Approach"* (https://arxiv.org/pdf/2210.09754)

    The approach relies on detecting the **longest common subsequence** between the current
    window and the previous one, in order to prevent repeating tokens caused by overlapping
    audio windows.

    Args:
        config (SimpleNamespace): Configuration object. The following attributes are expected:

            - **window_len (int)**: Length of the sliding window (in seconds).
            - **matching_threshold (float, optional)**: Minimum fraction of the current tokens
              that must match the previous history to be considered aligned. Default = ``0.1``.
            - **override_on_failed_match (bool, optional)**: If ``True``, the previous history
              is deleted from the output when no sufficient match is found; otherwise, it is
              kept and the new output is appended to its end. Default = ``False``.
            - **max_tokens_per_second (int, optional)**: Maximum output tokens allowed per
              second of audio. Default = ``10``.
    """

    def __init__(self, config: SimpleNamespace):
        super().__init__(config)
        self.window_len = self.config.window_len * SAMPLE_RATE
        self.matching_threshold = getattr(self.config, "matching_threshold", 0.1)
        self.override_on_failed_match = getattr(self.config, "override_on_failed_match", False)
        self.max_tokens_per_second = getattr(self.config, "max_tokens_per_second", 10)
        self.within_first_window = True

    def _build_incremental_outputs(self, generated_tokens: List[str]) -> IncrementalOutput:
        """
        Deduplicates the output stream of overlapping windows using the algorithm introduced in
        `S. Sen, et al. 2025. "Simultaneous Translation for Unsegmented Input:
        A Sliding Window Approach" <https://arxiv.org/pdf/2210.09754>`_

        This algorithm is based on the longest matching substring between the current and
        previous window. We match on tokens instead of strings, though, as we have empirically
        observed that tokenization is mostly consistent across generations of the same word.
        """
        if self.text_history is None or len(self.text_history) == 0:
            self.text_history = generated_tokens
            generated_string = self.tokens_to_string(generated_tokens)
            return IncrementalOutput(
                new_tokens=generated_tokens,
                new_string=generated_string,
                deleted_tokens=[],
                deleted_string=""
            )
        seq_matcher = SequenceMatcher(None, self.text_history, generated_tokens, autojunk=False)
        longest_match = seq_matcher.find_longest_match()
        if longest_match.size >= self.matching_threshold * len(generated_tokens):
            new_tokens = generated_tokens[longest_match.b + longest_match.size:]
            deleted_tokens = self.text_history[longest_match.a + longest_match.size:]
            new_string = self.tokens_to_string(new_tokens)
            deleted_string = self.tokens_to_string(deleted_tokens)
            # we take the matching part and the last part of the generated string as part of
            # the history. Then we take from the history the tokens corresponding to the amount
            # generated in this step, to ensure we have a sufficiently wide window
            matching_and_last_tokens = generated_tokens[longest_match.b:]
            initial_discarded_tokens = len(generated_tokens) - len(matching_and_last_tokens)
            history_tokens_discarded = self.text_history[longest_match.a:]
            history_initial_tokens = len(self.text_history) - len(history_tokens_discarded)
            new_history_initial_tokens = self.text_history[
                max(history_initial_tokens - initial_discarded_tokens, 0):history_initial_tokens]
            self.text_history = new_history_initial_tokens + matching_and_last_tokens
        else:
            if self.within_first_window or self.override_on_failed_match:
                deleted_tokens = self.text_history
                deleted_string = self.tokens_to_string(self.text_history)
                self.text_history = None
            else:
                deleted_tokens = []
                deleted_string = ""
            new_tokens = generated_tokens
            new_string = self.tokens_to_string(generated_tokens)
            self.text_history = generated_tokens
        return IncrementalOutput(
            new_tokens=new_tokens,
            new_string=new_string,
            deleted_tokens=deleted_tokens,
            deleted_string=deleted_string,
        )

    def _update_speech_history(
            self,
            new_speech: torch.Tensor,
            generated_tokens: List[str],
            new_output: IncrementalOutput) -> None:
        if self.within_first_window and len(self.audio_history) >= self.window_len:
            self.within_first_window = False

    def _update_text_history(
            self,
            new_speech: torch.Tensor,
            generated_tokens: List[str],
            new_output: IncrementalOutput) -> None:
        pass

    def end_of_stream(self) -> IncrementalOutput:
        return IncrementalOutput([], "", [], "")

    def clear(self) -> None:
        super().clear()
        self.within_first_window = True
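
The deduplication step above can be reproduced in isolation. Below is a minimal sketch with hypothetical token values; the real class additionally rebuilds its token history to keep a sufficiently wide matching window:

from difflib import SequenceMatcher

# Tokens generated for the previous window and for the current, overlapping one.
history = ["▁We", "▁will", "▁discuss", "▁the", "▁results"]
current = ["▁discuss", "▁the", "▁results", "▁of", "▁the", "▁experiment"]

matcher = SequenceMatcher(None, history, current, autojunk=False)
match = matcher.find_longest_match()  # the no-argument form requires Python >= 3.9

matching_threshold = 0.1
if match.size >= matching_threshold * len(current):
    new_tokens = current[match.b + match.size:]      # emit only what follows the overlap
    deleted_tokens = history[match.a + match.size:]  # stale history after the overlap
else:
    new_tokens, deleted_tokens = current, []

print(new_tokens)      # ['▁of', '▁the', '▁experiment']
print(deleted_tokens)  # []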

simulstream/server/speech_processors/vad_wrapper.py
@@ -0,0 +1,180 @@
# Copyright 2025 FBK

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

from types import SimpleNamespace
from typing import List

import numpy as np
from silero_vad import load_silero_vad, VADIterator

from simulstream.server.speech_processors import SAMPLE_RATE, SpeechProcessor, \
    speech_processor_class_load
from simulstream.server.speech_processors.incremental_output import merge_incremental_outputs, \
    IncrementalOutput

class VADWrapperSpeechProcessor(SpeechProcessor):
    """
    A speech processor that integrates **Voice Activity Detection (VAD)** to filter and split
    continuous audio streams into meaningful speech chunks before processing them with an
    underlying speech processor.

    This class wraps a :class:`SpeechProcessor` implementation (defined in the configuration
    via the attribute `base_speech_processor_class`) with a Silero VAD-based iterator that
    detects the start and end of speech segments. Audio outside of speech is ignored, and each
    detected segment is passed to the underlying speech processor.

    Args:
        config (SimpleNamespace): Configuration object. The following attributes are used:

            - **base_speech_processor_class (str)**: full name of the underlying speech
              processor class to use.
            - **vad_threshold (float, optional)**: VAD probability threshold. Default = ``0.5``.
            - **vad_min_silence_duration_ms (int, optional)**: Minimum silence duration
              (milliseconds) to consider the end of a speech segment. Default = ``100``.
            - **vad_speech_pad_ms (int, optional)**: Padding (milliseconds) to include before
              and after detected speech. Default = ``30``.
            - **min_speech_size (int, optional)**: Minimum segment size in seconds; shorter
              segments are ignored. Default = ``1``.
            - Any additional attributes required by the subclass :py:attr:`speech_processor_class`.
    """

    @classmethod
    def speech_processor_class(cls, config: SimpleNamespace) -> type[SpeechProcessor]:
        return speech_processor_class_load(config.base_speech_processor_class)

    @classmethod
    def load_model(cls, config: SimpleNamespace):
        super().load_model(config)
        if not hasattr(cls, "vad_model") or cls.vad_model is None:
            cls.vad_model = load_silero_vad()
        cls.speech_processor_class(config).load_model(config)

    def __init__(self, config: SimpleNamespace):
        super().__init__(config)
        self.speech_processor = self.speech_processor_class(self.config)(self.config)
        self.min_speech_size = getattr(self.config, "min_speech_size", 1) * SAMPLE_RATE
        self.vad_iterator = VADIterator(
            self.vad_model,
            threshold=getattr(self.config, "vad_threshold", 0.5),
            sampling_rate=SAMPLE_RATE,
            min_silence_duration_ms=getattr(self.config, "vad_min_silence_duration_ms", 100),
            speech_pad_ms=getattr(self.config, "vad_speech_pad_ms", 30),
        )
        self.residual_prev_audio = None
        self.speech_buffer = None
        self.audio_cursor_position = 0
        self.in_speech = False
        assert SAMPLE_RATE == 16000, \
            "VADWrapperSpeechProcessor supports only 16kHz sampling rate"
        self.window_size_samples = 512  # assuming 16kHz
        self.previous_audio_chunk = None  # needed as VAD uses padding before start

    def clear(self) -> None:
        super().clear()
        self.residual_prev_audio = None
        self.speech_buffer = None
        self.audio_cursor_position = 0
        self.in_speech = False
        self.previous_audio_chunk = None
        self.vad_iterator.reset_states()
        self.speech_processor.clear()

    def process_chunk(self, waveform: np.ndarray) -> IncrementalOutput:
        if self.residual_prev_audio is not None:
            waveform = np.concatenate((self.residual_prev_audio, waveform))
            self.residual_prev_audio = None
        # we can have more than one generation if there are multiple speech segments in the
        # current chunk
        outputs = []

        for i in range(0, len(waveform), self.window_size_samples):
            chunk = waveform[i: i + self.window_size_samples]
            if len(chunk) < self.window_size_samples:
                # process trailing audio with the next waveform chunk
                self.residual_prev_audio = chunk
                break
            speech_dict = self.vad_iterator(chunk, return_seconds=False)
            if speech_dict:
                # if a VAD event happens, update the status accordingly
                assert not ('start' in speech_dict and 'end' in speech_dict)
                if 'start' in speech_dict:
                    assert not self.in_speech, \
                        "Cannot start a new segment when current one is being processed. " \
                        "This means there is a bug in the implementation."
                    start_offset = self.audio_cursor_position - len(self.previous_audio_chunk)
                    chunk_start_position = speech_dict['start'] - start_offset
                    assert chunk_start_position >= 0
                    self.speech_buffer = np.concatenate(
                        (self.previous_audio_chunk, chunk))[chunk_start_position:]
                    self.in_speech = True
                if 'end' in speech_dict:
                    assert self.in_speech, \
                        "Cannot end a segment when no current segment is being processed. " \
                        "This means there is a bug in the implementation."
                    speech_buffer_len = len(self.speech_buffer) \
                        if self.speech_buffer is not None else 0
                    speech_buffer_offset = self.audio_cursor_position - speech_buffer_len
                    chunk_end_position = speech_dict['end'] - speech_buffer_offset
                    # In case we already processed more audio than needed (i.e., we already
                    # processed beyond the end detected by the VAD), we skip further
                    # processing; otherwise, we process the remaining unhandled audio.
                    # We can process more audio after the end as the VAD takes ~100ms (see the
                    # min_silence_duration_ms parameter of the VAD) to emit the end signal, so
                    # if we already processed the partial speech buffer (see min_speech_size in
                    # this processor), we may have processed up to 100ms extra.
                    if chunk_end_position >= 0:
                        self.append_to_speech_buffer(chunk)
                        self.speech_buffer = self.speech_buffer[:chunk_end_position]
                        outputs.append(self.speech_processor.process_chunk(self.speech_buffer))
                    self.in_speech = False
                    self.speech_buffer = None
                    outputs.append(self.speech_processor.end_of_stream())
                    # reset history at the end of a segment
                    if hasattr(self.speech_processor, 'text_history'):
                        self.speech_processor.text_history = None
                    if hasattr(self.speech_processor, 'audio_history'):
                        self.speech_processor.audio_history = None
            else:
                # if no VAD event happens, we just ignore the audio if we are outside speech
                # and update the buffer in case we are in speech
                if self.in_speech:
                    self.append_to_speech_buffer(chunk)
            # update cursor position
            self.audio_cursor_position += self.window_size_samples
            self.previous_audio_chunk = chunk

        if self.in_speech and len(self.speech_buffer) > self.min_speech_size:
            outputs.append(self.speech_processor.process_chunk(self.speech_buffer))
            self.speech_buffer = None

        return merge_incremental_outputs(outputs, self.tokens_to_string)

    def append_to_speech_buffer(self, audio_chunk: np.ndarray) -> None:
        if self.speech_buffer is None:
            self.speech_buffer = audio_chunk
        else:
            self.speech_buffer = np.concatenate((self.speech_buffer, audio_chunk))

    def set_source_language(self, language: str) -> None:
        self.speech_processor.set_source_language(language)

    def set_target_language(self, language: str) -> None:
        self.speech_processor.set_target_language(language)

    def tokens_to_string(self, tokens: List[str]) -> str:
        return self.speech_processor.tokens_to_string(tokens)

    def end_of_stream(self) -> IncrementalOutput:
        return self.speech_processor.end_of_stream()
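
The windowed VAD loop implemented by process_chunk can be sketched in isolation as follows. This uses a synthetic waveform for illustration; the real processor additionally buffers the detected speech and forwards it to the wrapped speech processor:

import numpy as np
from silero_vad import load_silero_vad, VADIterator

SAMPLE_RATE = 16000
WINDOW_SIZE = 512  # silero-vad operates on 512-sample windows at 16 kHz

model = load_silero_vad()
vad = VADIterator(
    model,
    threshold=0.5,
    sampling_rate=SAMPLE_RATE,
    min_silence_duration_ms=100,
    speech_pad_ms=30,
)

waveform = np.zeros(SAMPLE_RATE, dtype=np.float32)  # 1 second of silence as a stand-in
for i in range(0, len(waveform) - WINDOW_SIZE + 1, WINDOW_SIZE):
    event = vad(waveform[i:i + WINDOW_SIZE], return_seconds=False)
    if event and 'start' in event:
        print("speech starts at sample", event['start'])
    if event and 'end' in event:
        print("speech ends at sample", event['end'])
vad.reset_states()  # reset between independent streams, as clear() does above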

simulstream/server/websocket_server.py
@@ -0,0 +1,236 @@
# Copyright 2025 FBK

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import argparse
import asyncio
import logging
import time
from contextlib import asynccontextmanager
from types import SimpleNamespace
from typing import Callable, Awaitable

import websockets
from websockets.asyncio.server import serve, ServerConnection
import json

import simulstream
from simulstream.config import yaml_config
from simulstream.metrics.logger import setup_metrics_logger, METRICS_LOGGER
from simulstream.server.message_processor import MessageProcessor
from simulstream.server.speech_processors import build_speech_processor


logging.basicConfig(
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
LOGGER = logging.getLogger('simulstream.websocket_server')

class SpeechProcessorPool:
    """
    A pool of speech processors initialized at startup and made available to clients.

    Args:
        speech_processor_config (SimpleNamespace): configuration for the speech processors to
            create.
        size (int): number of speech processors to keep in the pool.
        acquire_timeout (int): timeout (in seconds) to wait for the availability of a speech
            processor.
    """
    def __init__(
            self,
            speech_processor_config: SimpleNamespace,
            size: int,
            acquire_timeout: int):
        self.size = size
        self.acquire_timeout = acquire_timeout
        self.available = asyncio.Queue(maxsize=size)
        for _ in range(size):
            self.available.put_nowait(build_speech_processor(speech_processor_config))

    @asynccontextmanager
    async def acquire(self):
        """
        Acquire one processor from the pool and release it automatically.

        Returns:
            SpeechProcessor: a speech processor available for usage.

        Raises:
            TimeoutError: if no speech processor is available within the configured timeout.
        """
        speech_processor = await asyncio.wait_for(
            self.available.get(), timeout=self.acquire_timeout)
        try:
            yield speech_processor
        finally:
            # Return worker to pool
            self.available.put_nowait(speech_processor)

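
# Usage sketch (illustrative, not part of the module): acquiring a processor through
# the pool's async context manager guarantees it is returned to the pool on exit, e.g.:
#
#     pool = SpeechProcessorPool(speech_processor_config, size=2, acquire_timeout=30)
#     async with pool.acquire() as processor:
#         output = processor.process_chunk(audio_chunk)
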
def connection_handler_factory(
        speech_processor_pool: SpeechProcessorPool
) -> Callable[[ServerConnection], Awaitable[None]]:
    """
    Factory function that creates a connection handler for the WebSocket server.

    The returned connection handler coroutine will process audio and metadata messages sent by
    a single client over WebSocket.

    The handler receives client data (raw audio chunks and textual metadata) and forwards it
    to a message processor using a speech processor retrieved from the pool of available ones.
    If no speech processor becomes available within the configured waiting time, the client
    connection is closed.
    The handler also sends incremental processing results back to the client in JSON format.

    Args:
        speech_processor_pool (SpeechProcessorPool): Pool of speech processors to use to handle
            client connections.

    Returns:
        Callable[[websockets.asyncio.server.ServerConnection], Awaitable[None]]: An
            asynchronous WebSocket connection handler coroutine.
    """

    async def handle_connection(websocket: ServerConnection) -> None:
        """
        Handles a single client WebSocket connection.

        This is the coroutine that processes incoming messages from a client:

        - If the message is binary (``bytes``), it is interpreted as raw audio data and
          buffered until a full chunk is ready for processing.
        - If the message is text (``str``), it is parsed as JSON metadata and can:

          - Set the input sample rate.
          - Set source and target languages for translation.
          - Log custom metadata to the metrics logger.
          - Indicate the end of the audio stream.

        At the end of the stream, any remaining audio is processed, the processor state is
        cleared, and an ``end_of_processing`` message is sent to the client.

        Args:
            websocket (websockets.asyncio.server.ServerConnection): The WebSocket connection
                for the client.
        """
        loop = asyncio.get_running_loop()
        client_id = id(websocket)
        LOGGER.info(f"Client {client_id} connected")
        try:
            async with speech_processor_pool.acquire() as speech_processor:
                message_processor = MessageProcessor(client_id, speech_processor)

                try:
                    async for message in websocket:
                        if isinstance(message, bytes):
                            # in this case we are processing an audio chunk
                            incremental_output = await loop.run_in_executor(
                                None, message_processor.process_speech, message)
                            if incremental_output is not None:
                                await websocket.send(incremental_output.strings_to_json())
                        elif isinstance(message, str):
                            # textual messages are used to handle metadata
                            try:
                                data = json.loads(message)
                                if 'end_of_stream' in data:
                                    incremental_output = await loop.run_in_executor(
                                        None, message_processor.end_of_stream)
                                    await websocket.send(incremental_output.strings_to_json())
                                    await websocket.send(json.dumps({'end_of_processing': True}))
                                else:
                                    message_processor.process_metadata(data)
                            except Exception as e:
                                LOGGER.error(
                                    f"Invalid string message: {message}. Error: {e}. Ignoring it.")
                except websockets.exceptions.ConnectionClosed:
                    LOGGER.info(f"Client {client_id} disconnected.")
                except Exception as e:
                    LOGGER.exception(f"Error: {e}")
                finally:
                    message_processor.clear()
        except TimeoutError:
            LOGGER.error(f"Timeout waiting for a new processor for client {client_id}")
            await websocket.close()

    return handle_connection

async def main(args: argparse.Namespace):
    """
    Main entry point for running the WebSocket speech server.

    This function loads the server and speech processor configurations from YAML,
    initializes logging (including metrics logging), and starts the WebSocket server
    on the configured host and port.

    Args:
        args (argparse.Namespace): parsed command-line arguments with configuration file paths.
    """
    LOGGER.info(f"Loading server configuration from {args.server_config}")
    server_config = yaml_config(args.server_config)
    LOGGER.info(
        f"Metric logging is{'' if server_config.metrics.enabled else ' NOT'} enabled at "
        f"{server_config.metrics.filename}")
    setup_metrics_logger(server_config.metrics)
    LOGGER.info(f"Loading speech processor from {args.speech_processor_config}")
    speech_processor_config = yaml_config(args.speech_processor_config)
    LOGGER.info(f"Using as speech processor: {speech_processor_config.type}")
    speech_processor_loading_time = time.time()
    speech_processors_pool = SpeechProcessorPool(
        speech_processor_config, server_config.pool_size, server_config.acquire_timeout)
    speech_processor_loading_time = time.time() - speech_processor_loading_time
    LOGGER.info(f"Loaded speech processor in {speech_processor_loading_time:.3f} seconds")
    METRICS_LOGGER.info(json.dumps({
        "model_loading_time": speech_processor_loading_time,
    }))
    LOGGER.info(f"Serving websocket server at {server_config.hostname}:{server_config.port}")
    async with serve(
            connection_handler_factory(speech_processors_pool),
            server_config.hostname,
            server_config.port,
            ping_timeout=None) as server:
        await server.serve_forever()


def cli_main():
    """
    Simulstream WebSocket server command-line interface (CLI) entry point.

    This function parses command-line arguments and starts the asynchronous :func:`main`
    routine.

    Example usage::

        $ python websocket_server.py --server-config config/server.yaml \\
            --speech-processor-config config/speech.yaml

    Command-line arguments:

    - ``--server-config`` (str, optional): Path to the server configuration file
      (default: ``config/server.yaml``).
    - ``--speech-processor-config`` (str, required): Path to the speech processor
      configuration file.
    """
    LOGGER.info(f"Websocket server version: {simulstream.__version__}")
    parser = argparse.ArgumentParser("websocket_simul_server")
    parser.add_argument("--server-config", type=str, default="config/server.yaml")
    parser.add_argument("--speech-processor-config", type=str, required=True)
    args = parser.parse_args()
    asyncio.run(main(args))


if __name__ == "__main__":
    cli_main()
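
A client only needs to follow the conventions visible in handle_connection: binary frames carry raw audio, text frames carry JSON metadata, and the 'end_of_stream' / 'end_of_processing' keys delimit shutdown. A minimal sketch, assuming a hypothetical server URI and pre-loaded raw PCM bytes (metadata fields other than end_of_stream are interpreted by MessageProcessor and are not shown here):

import asyncio
import json

import websockets


async def stream_audio(uri: str, pcm_bytes: bytes, chunk_size: int = 32000) -> None:
    async with websockets.connect(uri) as ws:
        # Raw audio goes out as fixed-size binary frames.
        for i in range(0, len(pcm_bytes), chunk_size):
            await ws.send(pcm_bytes[i:i + chunk_size])
        # A JSON text frame with 'end_of_stream' flushes the remaining audio.
        await ws.send(json.dumps({"end_of_stream": True}))
        async for message in ws:
            data = json.loads(message)
            if isinstance(data, dict) and data.get("end_of_processing"):
                break
            print(data)  # incremental output, as serialized by strings_to_json()


# Example invocation (hypothetical host/port, taken from the server configuration):
# asyncio.run(stream_audio("ws://localhost:8765", open("audio.raw", "rb").read()))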