simulstream-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +47 -0
- simulstream/__init__.py +15 -0
- simulstream/client/__init__.py +0 -0
- simulstream/client/wav_reader_client.py +228 -0
- simulstream/config.py +31 -0
- simulstream/inference.py +170 -0
- simulstream/metrics/__init__.py +0 -0
- simulstream/metrics/detokenizers.py +71 -0
- simulstream/metrics/logger.py +32 -0
- simulstream/metrics/readers.py +348 -0
- simulstream/metrics/score_latency.py +130 -0
- simulstream/metrics/score_quality.py +169 -0
- simulstream/metrics/scorers/__init__.py +0 -0
- simulstream/metrics/scorers/latency/__init__.py +115 -0
- simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
- simulstream/metrics/scorers/latency/stream_laal.py +119 -0
- simulstream/metrics/scorers/quality/__init__.py +132 -0
- simulstream/metrics/scorers/quality/comet.py +57 -0
- simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
- simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
- simulstream/metrics/stats.py +184 -0
- simulstream/server/__init__.py +0 -0
- simulstream/server/http_server.py +95 -0
- simulstream/server/message_processor.py +156 -0
- simulstream/server/speech_processors/__init__.py +173 -0
- simulstream/server/speech_processors/base.py +135 -0
- simulstream/server/speech_processors/base_streamatt.py +320 -0
- simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
- simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
- simulstream/server/speech_processors/incremental_output.py +85 -0
- simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
- simulstream/server/speech_processors/seamless_streamatt.py +268 -0
- simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
- simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
- simulstream/server/speech_processors/vad_wrapper.py +180 -0
- simulstream/server/websocket_server.py +236 -0
- simulstream-0.1.0.dist-info/METADATA +465 -0
- simulstream-0.1.0.dist-info/RECORD +48 -0
- simulstream-0.1.0.dist-info/WHEEL +5 -0
- simulstream-0.1.0.dist-info/entry_points.txt +8 -0
- simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
- simulstream-0.1.0.dist-info/top_level.txt +3 -0
- uts/__init__.py +0 -0
- uts/metrics/__init__.py +0 -0
- uts/metrics/log_reader.py +50 -0
- uts/speech_processors/__init__.py +0 -0
- uts/speech_processors/test_simuleval_wrapper.py +88 -0
- uts/utils.py +5 -0
docs/source/conf.py
ADDED
@@ -0,0 +1,47 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath('../../'))
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = 'simulstream'
+copyright = '2025, FBK'
+author = 'Marco Gaido, FBK MT Unit'
+release = '0.1.0'
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    'sphinx.ext.duration',
+    'sphinx.ext.doctest',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.intersphinx',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+]
+
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
+}
+intersphinx_disabled_domains = ['std']
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_logo = "_static/logo.png"
+html_favicon = "_static/logo.png"
+html_css_files = ['custom.css']
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
simulstream/__init__.py
ADDED
@@ -0,0 +1,15 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+__version__ = '0.1.0'
simulstream/client/__init__.py
File without changes
simulstream/client/wav_reader_client.py
ADDED
@@ -0,0 +1,228 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import argparse
+import asyncio
+import json
+import logging
+import wave
+from typing import Tuple, Optional, List
+
+import numpy as np
+import websockets
+import os
+import contextlib
+
+
+logging.basicConfig(
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+)
+LOGGER = logging.getLogger('simulstream.wav_reader_client')
+
+
+def float32_to_int16(audio_data: np.ndarray) -> np.ndarray:
+    """Convert a NumPy array of float32 audio samples to int16 PCM format."""
+    audio_data = np.clip(audio_data * 2 ** 15, -32768, 32767)
+    return audio_data.astype(np.int16)
+
+
+def read_wav_file(filename: str) -> Tuple[int, np.ndarray]:
+    """
+    Read a WAV file and return its sample rate and audio data.
+
+    Args:
+        filename (str): Path to the WAV file.
+
+    Returns:
+        tuple[int, np.ndarray]: Sample rate and mono audio data as int16 array.
+
+    Raises:
+        ValueError: If the sample width is unsupported.
+        AssertionError: If the file contains more than one channel.
+    """
+    with contextlib.closing(wave.open(filename, 'rb')) as wf:
+        num_channels = wf.getnchannels()
+        sample_width = wf.getsampwidth()
+        sample_rate = wf.getframerate()
+        num_frames = wf.getnframes()
+        raw_data = wf.readframes(num_frames)
+
+    if sample_width == 2:
+        dtype = np.int16
+    elif sample_width == 4:
+        dtype = np.float32
+    else:
+        raise ValueError(f"Unsupported sample width: {sample_width}")
+
+    data = np.frombuffer(raw_data, dtype=dtype)
+
+    if sample_width == 4:
+        data = float32_to_int16(data)
+
+    assert num_channels == 1, "Currently ony 1 channel is supported"
+
+    return sample_rate, data
+
+
+async def send_audio(
+        websocket: websockets.ClientConnection,
+        sample_rate: int,
+        data: np.ndarray,
+        chunk_duration_ms: int = 100):
+    """
+    Stream audio data in fixed-size chunks over a WebSocket connection.
+
+    Args:
+        websocket (websockets.ClientConnection): Active WebSocket connection.
+        sample_rate (int): Audio sample rate (Hz).
+        data (np.ndarray): Audio samples as int16 array.
+        chunk_duration_ms (int): Duration of each chunk in milliseconds.
+    """
+    samples_per_chunk = int(sample_rate * chunk_duration_ms / 1000.0)
+    i = 0
+    for i in range(0, len(data), samples_per_chunk):
+        await websocket.send(data[i:i + samples_per_chunk].tobytes())
+    # send last part of the audio
+    if i < len(data):
+        await websocket.send(data[i:].tobytes())
+
+
+async def stream_wav_files(
+        uri: str,
+        wav_file_list: List[str],
+        chunk_duration_ms: int = 100,
+        tgt_lang: Optional[str] = None,
+        src_lang: Optional[str] = None):
+    """
+    Stream multiple WAV files sequentially to a WebSocket server.
+
+    For each file:
+    - Sends metadata (sample rate, filename, optional languages).
+    - Streams audio in chunks.
+    - Sends an end-of-stream marker.
+    - Waits for server confirmation before proceeding.
+
+    Args:
+        uri (str): WebSocket server URI.
+        wav_file_list (list[str]): Paths to WAV files.
+        chunk_duration_ms (int): Chunk size in milliseconds.
+        tgt_lang (str | None): Target language code (e.g., "en").
+        src_lang (str | None): Source language code (e.g., "en").
+    """
+    for wav_file in wav_file_list:
+        LOGGER.info(f"Streaming: {wav_file}")
+        sample_rate, data = read_wav_file(wav_file)
+        metadata = {
+            "sample_rate": sample_rate,
+            "metrics_metadata": {
+                "wav_name": wav_file,
+            }
+        }
+        if tgt_lang is not None:
+            metadata["target_lang"] = tgt_lang
+        if src_lang is not None:
+            metadata["source_lang"] = src_lang
+        async with websockets.connect(uri, ping_timeout=None) as websocket:
+            await websocket.send(json.dumps(metadata))
+            await send_audio(websocket, sample_rate, data, chunk_duration_ms)
+            await websocket.send(json.dumps({"end_of_stream": True}))
+            while True:
+                response = await websocket.recv()
+                LOGGER.debug(response)
+                if 'end_of_processing' in response:
+                    break
+    LOGGER.info(f"All {len(wav_file_list)} files sent.")
+
+
+def load_wav_file_list(list_file_path: str) -> List[str]:
+    """
+    Load a list of WAV file paths from a text file.
+
+    Args:
+        list_file_path (str): Path to a text file, one WAV file path per line.
+
+    Returns:
+        list[str]: Absolute file paths of WAV files.
+    """
+    basedir = os.path.dirname(list_file_path)
+    with open(list_file_path, 'r') as f:
+        wav_files = [basedir + '/' + line.strip() for line in f if line.strip()]
+    if not wav_files:
+        LOGGER.error("No valid WAV files found in the list.")
+        exit(1)
+    else:
+        assert all(os.path.isfile(f) for f in wav_files), "Invalid wav file in the list."
+    return wav_files
+
+
+async def main(args: argparse.Namespace):
+    """Main entrypoint: validates WAV files and starts streaming."""
+    wav_files = load_wav_file_list(args.wav_list_file)
+    await stream_wav_files(
+        args.uri, wav_files, args.chunk_duration_ms, args.tgt_lang, args.src_lang)
+
+
+def cli_main():
+    """
+    Simulstream WebSocket client command-line interface (CLI) entry point.
+
+    This script implements a simple WebSocket client that streams audio data from a list of WAV
+    files to a server for processing (e.g., speech recognition or translation). It reads WAV files,
+    converts them into fixed-size chunks, and sends them asynchronously over a WebSocket
+    connection.
+
+    Example usage::
+
+        $ python wav_reader_client.py --uri ws://localhost:8000/ --wav-list-file wav_files.txt \\
+            --tgt-lang it --src-lang en
+
+    Command-line arguments:
+
+    - ``--uri``: WebSocket server URI (e.g., ``ws://localhost:8000/``).
+    - ``--wav-list-file``: Path to a text file containing one WAV file path per line.
+    - ``--chunk-duration-ms``: Duration of each audio chunk sent to the server (ms). Default: 100.
+    - ``--tgt-lang``: Optional target language.
+    - ``--src-lang``: Optional source language.
+    """
+    parser = argparse.ArgumentParser(description="Websocket client for WAV files.")
+    parser.add_argument(
+        "--uri",
+        required=True,
+        help="WebSocket server URI (e.g., ws://localhost:8000/)")
+    parser.add_argument(
+        "--wav-list-file",
+        required=True,
+        help="Path to text file containing list of WAV file paths")
+    parser.add_argument(
+        "--chunk-duration-ms",
+        default=100,
+        type=int,
+        help="Size of the each chunk sent to the server in milliseconds (default: 100)")
+    parser.add_argument(
+        "--tgt-lang",
+        default=None,
+        help="Target language (if needed, its effect depends on the speech processor used by the "
+             "server).")
+    parser.add_argument(
+        "--src-lang",
+        default=None,
+        help="Source language (if needed, its effect depends on the speech processor used by the "
+             "server).")
+    asyncio.run(main(parser.parse_args()))
+
+
+if __name__ == "__main__":
+    cli_main()
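For reference, the client above can also be driven programmatically. A minimal sketch, assuming a simulstream server is already listening on the placeholder URI and that the placeholder list file names one WAV path per line (paths in the list are resolved relative to the list file's directory)::

    import asyncio

    from simulstream.client.wav_reader_client import load_wav_file_list, stream_wav_files

    # Placeholder URI, list file, and language codes.
    wav_files = load_wav_file_list("data/wav_files.txt")
    asyncio.run(stream_wav_files(
        "ws://localhost:8000/", wav_files, chunk_duration_ms=100, tgt_lang="it", src_lang="en"))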
simulstream/config.py
ADDED
@@ -0,0 +1,31 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+from types import SimpleNamespace
+
+import yaml
+
+
+def _dict_to_object(d):
+    if isinstance(d, dict):
+        return SimpleNamespace(**{k: _dict_to_object(v) for k, v in d.items()})
+    elif isinstance(d, list):
+        return [_dict_to_object(i) for i in d]
+    else:
+        return d
+
+
+def yaml_config(path: str) -> SimpleNamespace:
+    with open(path, "r") as file:
+        return _dict_to_object(yaml.safe_load(file))
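For illustration, `yaml_config` turns nested YAML mappings into attribute-accessible `SimpleNamespace` objects. The file name and keys in the sketch below are hypothetical, not a schema required by simulstream::

    from simulstream.config import yaml_config

    # Hypothetical example.yaml:
    #   type: my_processor
    #   model:
    #     device: cuda
    config = yaml_config("example.yaml")
    print(config.type)          # "my_processor"
    print(config.model.device)  # "cuda"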
simulstream/inference.py
ADDED
@@ -0,0 +1,170 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import argparse
+import json
+import logging
+import time
+from types import SimpleNamespace
+from typing import List, Optional
+
+import numpy as np
+
+import simulstream
+from simulstream.client.wav_reader_client import load_wav_file_list, read_wav_file
+from simulstream.config import yaml_config
+from simulstream.metrics.logger import setup_metrics_logger, METRICS_LOGGER
+from simulstream.server.message_processor import MessageProcessor
+from simulstream.server.speech_processors import build_speech_processor, SpeechProcessor
+
+
+logging.basicConfig(
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+)
+LOGGER = logging.getLogger('simulstream.inference')
+
+
+def process_audio(
+        message_processor: MessageProcessor,
+        sample_rate: int,
+        data: np.ndarray):
+    """
+    Stream audio data in fixed-size chunks over a WebSocket connection.
+
+    Args:
+        message_processor (MessageProcessor): class that processes the audio chunks.
+        sample_rate (int): Audio sample rate (Hz).
+        data (np.ndarray): Audio samples as int16 array.
+    """
+    samples_per_chunk = int(
+        sample_rate * message_processor.speech_processor.speech_chunk_size / 1000.0)
+    i = 0
+    for i in range(0, len(data), samples_per_chunk):
+        output = message_processor.process_speech(data[i:i + samples_per_chunk].tobytes())
+        LOGGER.debug(f"response: {output}")
+    # send last part of the audio
+    if i < len(data):
+        output = message_processor.process_speech(data[i:].tobytes())
+        LOGGER.debug(f"response: {output}")
+
+
+def run_inference(
+        speech_processor: SpeechProcessor,
+        wav_file_list: List[str],
+        tgt_lang: Optional[str] = None,
+        src_lang: Optional[str] = None):
+    """
+    Runs the inference on the WAV files sequentially with the specified speech processor.
+
+    For each file:
+    - Sets metadata (sample rate, filename, optional languages).
+    - Processes the audio in chunks.
+
+    Args:
+        speech_processor (SpeechProcessor): the speech processor to use to run the inference.
+        wav_file_list (list[str]): Paths to WAV files.
+        tgt_lang (str | None): Target language code (e.g., "en").
+        src_lang (str | None): Source language code (e.g., "en").
+    """
+    for i, wav_file in enumerate(wav_file_list):
+        LOGGER.info(f"Streaming: {wav_file}")
+        sample_rate, data = read_wav_file(wav_file)
+        metadata = {
+            "sample_rate": sample_rate,
+            "metrics_metadata": {
+                "wav_name": wav_file,
+            }
+        }
+        if tgt_lang is not None:
+            metadata["target_lang"] = tgt_lang
+        if src_lang is not None:
+            metadata["source_lang"] = src_lang
+        message_processor = MessageProcessor(i, speech_processor)
+        message_processor.process_metadata(metadata)
+        process_audio(message_processor, sample_rate, data)
+        message_processor.end_of_stream()
+    LOGGER.info(f"All {len(wav_file_list)} files sent.")
+
+
+def main(args: argparse.Namespace):
+    """
+    Main entrypoint: validates WAV files and starts the generation with the specified speech
+    processor.
+    """
+    setup_metrics_logger(SimpleNamespace(**{
+        "enabled": True,
+        "filename": args.metrics_log_file
+    }))
+    LOGGER.info(f"Loading speech processor from {args.speech_processor_config}")
+    speech_processor_config = yaml_config(args.speech_processor_config)
+    LOGGER.info(f"Using as speech processor: {speech_processor_config.type}")
+    speech_processor_loading_time = time.time()
+    speech_processor = build_speech_processor(speech_processor_config)
+    speech_processor_loading_time = time.time() - speech_processor_loading_time
+    LOGGER.info(f"Loaded speech processor in {speech_processor_loading_time:.3f} seconds")
+    METRICS_LOGGER.info(json.dumps({
+        "model_loading_time": speech_processor_loading_time,
+    }))
+    wav_files = load_wav_file_list(args.wav_list_file)
+    run_inference(speech_processor, wav_files, args.tgt_lang, args.src_lang)
+
+
+def cli_main():
+    """
+    Simulstream evaluation command-line interface (CLI) entry point. This script processes the
+    specified wav files with the configured speech processor and can be used to get the metrics
+    log file to evaluate the quality and latency of the speech processor.
+
+    This function parses command-line arguments and starts the asynchronous :func:`main` routine.
+
+    Example usage::

+        $ python inference.py --speech-processor-config config/speech.yaml \\
+            --wav-list-file wav_files.txt --tgt-lang it --src-lang en
+
+    Command-line arguments:
+
+    - ``--server-config`` (str, optional): Path to the server configuration file
+      (default: ``config/server.yaml``).
+    - ``--speech-processor-config`` (str, required): Path to the speech processor configuration
+      file.
+    """
+    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
+    parser = argparse.ArgumentParser("simulstream_inference")
+    parser.add_argument("--speech-processor-config", type=str, required=True)
+    parser.add_argument(
+        "--wav-list-file",
+        required=True,
+        help="Path to text file containing list of WAV file paths")
+    parser.add_argument(
+        "--tgt-lang",
+        default=None,
+        help="Target language (if needed, its effect depends on the speech processor used by the "
+             "server).")
+    parser.add_argument(
+        "--src-lang",
+        default=None,
+        help="Source language (if needed, its effect depends on the speech processor used by the "
+             "server).")
+    parser.add_argument(
+        "--metrics-log-file",
+        default="metrics.json",
+        help="Path where to write the metrics log file.")
+    main(parser.parse_args())
+
+
+if __name__ == "__main__":
+    cli_main()
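Besides the CLI, the same pipeline can be assembled directly from the functions above. A minimal sketch, assuming placeholder paths for the processor config, the WAV list, and the metrics log::

    from types import SimpleNamespace

    from simulstream.client.wav_reader_client import load_wav_file_list
    from simulstream.config import yaml_config
    from simulstream.inference import run_inference
    from simulstream.metrics.logger import setup_metrics_logger
    from simulstream.server.speech_processors import build_speech_processor

    # Mirrors main() above: enable metrics logging, build the processor, run offline inference.
    setup_metrics_logger(SimpleNamespace(enabled=True, filename="metrics.json"))
    speech_processor = build_speech_processor(yaml_config("speech_processor.yaml"))
    wav_files = load_wav_file_list("data/wav_files.txt")
    run_inference(speech_processor, wav_files, tgt_lang="it", src_lang="en")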
simulstream/metrics/__init__.py
File without changes
simulstream/metrics/detokenizers.py
ADDED
@@ -0,0 +1,71 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+from types import SimpleNamespace
+from typing import Callable, Dict, List
+
+
+def build_hf_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
+    from transformers import AutoProcessor
+
+    assert hasattr(config, "hf_model_name"), \
+        "`hf_model_name` required in the eval config for `hf` detokenizer"
+    processor = AutoProcessor.from_pretrained(config.hf_model_name)
+
+    def detokenize(input_tokens: List[str]) -> str:
+        return processor.tokenizer.convert_tokens_to_string(input_tokens)
+
+    return detokenize
+
+
+def build_canary_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
+    from nemo.collections.asr.models import ASRModel
+
+    assert hasattr(config, "model_name"), \
+        "`model_name` required in the eval config for `canary` detokenizer"
+    tokenizer = ASRModel.from_pretrained(model_name=config.model_name).tokenizer
+
+    def detokenize(input_tokens: List[str]) -> str:
+        return tokenizer.tokens_to_text(input_tokens)
+
+    return detokenize
+
+
+def build_simuleval_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
+    """ SimulEval detokenizer from https://github.com/facebookresearch/SimulEval/blob/
+    536de8253b82d805c9845440169a5010ff507357/simuleval/evaluator/instance.py#L233"""
+    if config.latency_unit == "word":
+        def detokenize(input_tokens: List[str]) -> str:
+            return " ".join(input_tokens)
+    elif config.latency_unit == "char":
+        def detokenize(input_tokens: List[str]) -> str:
+            return "".join(input_tokens)
+    elif config.latency_unit == "spm":
+        def detokenize(input_tokens: List[str]) -> str:
+            return "".join(input_tokens).replace("▁", " ").strip()
+    else:
+        raise NotImplementedError
+
+    return detokenize
+
+
+_DETOKENIZER_BUILDER_MAP: Dict[str, Callable[[SimpleNamespace], Callable[[List[str]], str]]] = {
+    "hf": build_hf_detokenizer,
+    "canary": build_canary_detokenizer,
+    "simuleval": build_simuleval_detokenizer
+}
+
+
+def get_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
+    return _DETOKENIZER_BUILDER_MAP[config.detokenizer_type](config)
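As a usage sketch, the SimulEval-style detokenizer can be obtained through `get_detokenizer`; the config below assumes SentencePiece-like tokens carrying the ▁ word marker::

    from types import SimpleNamespace

    from simulstream.metrics.detokenizers import get_detokenizer

    detokenize = get_detokenizer(
        SimpleNamespace(detokenizer_type="simuleval", latency_unit="spm"))
    print(detokenize(["▁Hello", "▁world", "!"]))  # prints "Hello world!"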
simulstream/metrics/logger.py
ADDED
@@ -0,0 +1,32 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import logging
+
+
+METRICS_LOGGER = logging.getLogger('fbk_fairseq.simultaneous.metrics')
+METRICS_LOGGER.propagate = False
+
+
+def setup_metrics_logger(metrics_config):
+    if metrics_config.enabled:
+        fh = logging.FileHandler(metrics_config.filename)
+        formatter = logging.Formatter('%(message)s')
+        fh.setFormatter(formatter)
+
+        # Clear existing handlers (if any) and set new one
+        METRICS_LOGGER.handlers.clear()
+        METRICS_LOGGER.addHandler(fh)
+    else:
+        METRICS_LOGGER.disabled = True
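The metrics logger is used as in inference.py above: enable it with a target file and emit one JSON object per record. A minimal sketch, with a placeholder filename and metric value::

    import json
    import logging
    from types import SimpleNamespace

    from simulstream.metrics.logger import METRICS_LOGGER, setup_metrics_logger

    logging.basicConfig(level=logging.INFO)  # METRICS_LOGGER defines no level of its own
    setup_metrics_logger(SimpleNamespace(enabled=True, filename="metrics.json"))
    METRICS_LOGGER.info(json.dumps({"model_loading_time": 1.23}))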