simulstream 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. docs/source/conf.py +47 -0
  2. simulstream/__init__.py +15 -0
  3. simulstream/client/__init__.py +0 -0
  4. simulstream/client/wav_reader_client.py +228 -0
  5. simulstream/config.py +31 -0
  6. simulstream/inference.py +170 -0
  7. simulstream/metrics/__init__.py +0 -0
  8. simulstream/metrics/detokenizers.py +71 -0
  9. simulstream/metrics/logger.py +32 -0
  10. simulstream/metrics/readers.py +348 -0
  11. simulstream/metrics/score_latency.py +130 -0
  12. simulstream/metrics/score_quality.py +169 -0
  13. simulstream/metrics/scorers/__init__.py +0 -0
  14. simulstream/metrics/scorers/latency/__init__.py +115 -0
  15. simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
  16. simulstream/metrics/scorers/latency/stream_laal.py +119 -0
  17. simulstream/metrics/scorers/quality/__init__.py +132 -0
  18. simulstream/metrics/scorers/quality/comet.py +57 -0
  19. simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
  20. simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
  21. simulstream/metrics/stats.py +184 -0
  22. simulstream/server/__init__.py +0 -0
  23. simulstream/server/http_server.py +95 -0
  24. simulstream/server/message_processor.py +156 -0
  25. simulstream/server/speech_processors/__init__.py +173 -0
  26. simulstream/server/speech_processors/base.py +135 -0
  27. simulstream/server/speech_processors/base_streamatt.py +320 -0
  28. simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
  29. simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
  30. simulstream/server/speech_processors/incremental_output.py +85 -0
  31. simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
  32. simulstream/server/speech_processors/seamless_streamatt.py +268 -0
  33. simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
  34. simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
  35. simulstream/server/speech_processors/vad_wrapper.py +180 -0
  36. simulstream/server/websocket_server.py +236 -0
  37. simulstream-0.1.0.dist-info/METADATA +465 -0
  38. simulstream-0.1.0.dist-info/RECORD +48 -0
  39. simulstream-0.1.0.dist-info/WHEEL +5 -0
  40. simulstream-0.1.0.dist-info/entry_points.txt +8 -0
  41. simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
  42. simulstream-0.1.0.dist-info/top_level.txt +3 -0
  43. uts/__init__.py +0 -0
  44. uts/metrics/__init__.py +0 -0
  45. uts/metrics/log_reader.py +50 -0
  46. uts/speech_processors/__init__.py +0 -0
  47. uts/speech_processors/test_simuleval_wrapper.py +88 -0
  48. uts/utils.py +5 -0
docs/source/conf.py ADDED
@@ -0,0 +1,47 @@
1
# Sphinx documentation builder configuration for simulstream.
#
# The complete list of built-in configuration values is documented at:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import os
import sys

# Put the directory two levels above the current working directory first on
# the import path.
sys.path.insert(0, os.path.abspath("../../"))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "simulstream"
copyright = "2025, FBK"
author = "Marco Gaido, FBK MT Unit"
release = "0.1.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "sphinx.ext.duration",
    "sphinx.ext.doctest",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
]

intersphinx_mapping = dict(
    python=("https://docs.python.org/3/", None),
    sphinx=("https://www.sphinx-doc.org/en/master/", None),
)
intersphinx_disabled_domains = ["std"]

templates_path = ["_templates"]
exclude_patterns = []

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

# The same image serves as both logo and favicon.
html_logo = "_static/logo.png"
html_favicon = "_static/logo.png"
html_css_files = ["custom.css"]

html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
@@ -0,0 +1,15 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
# Version string of the simulstream package.
__version__ = '0.1.0'
File without changes
@@ -0,0 +1,228 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import argparse
16
+ import asyncio
17
+ import json
18
+ import logging
19
+ import wave
20
+ from typing import Tuple, Optional, List
21
+
22
+ import numpy as np
23
+ import websockets
24
+ import os
25
+ import contextlib
26
+
27
+
28
# Configure the root logger at import time: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
)
# Logger shared by every function of the WAV reader client.
LOGGER = logging.getLogger('simulstream.wav_reader_client')
34
+
35
+
36
def float32_to_int16(audio_data: np.ndarray) -> np.ndarray:
    """
    Convert float32 audio samples to int16 PCM samples.

    Samples are multiplied by 2**15, clipped to the int16 range
    [-32768, 32767] and then cast to ``np.int16``.

    Args:
        audio_data (np.ndarray): Audio samples as a float array.

    Returns:
        np.ndarray: The converted samples as an int16 array.
    """
    scaled = audio_data * 2 ** 15
    bounded = np.clip(scaled, -32768, 32767)
    return bounded.astype(np.int16)
40
+
41
+
42
def read_wav_file(filename: str) -> Tuple[int, np.ndarray]:
    """
    Read a mono PCM WAV file and return its sample rate and audio data.

    The file is opened with the standard-library ``wave`` module, which only
    supports uncompressed integer PCM. Consequently 2-byte samples are int16
    and 4-byte samples are int32 (never IEEE float); the latter are reduced to
    int16 by keeping their 16 most significant bits.

    Args:
        filename (str): Path to the WAV file.

    Returns:
        tuple[int, np.ndarray]: Sample rate and mono audio data as int16 array.

    Raises:
        ValueError: If the sample width is neither 2 nor 4 bytes.
        AssertionError: If the file contains more than one channel.
    """
    with contextlib.closing(wave.open(filename, 'rb')) as wf:
        num_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        sample_rate = wf.getframerate()
        raw_data = wf.readframes(wf.getnframes())

    # WAV sample data is little-endian, hence the explicit '<' byte order.
    if sample_width == 2:
        data = np.frombuffer(raw_data, dtype='<i2')
    elif sample_width == 4:
        # Arithmetic right shift keeps the sign and the top 16 bits.
        data = (np.frombuffer(raw_data, dtype='<i4') >> 16).astype('<i2')
    else:
        raise ValueError(f"Unsupported sample width: {sample_width}")

    # Raised explicitly (instead of `assert`) so the check survives `python -O`;
    # the exception type is kept for backward compatibility.
    if num_channels != 1:
        raise AssertionError("Currently ony 1 channel is supported")

    return sample_rate, data
78
+
79
+
80
async def send_audio(
        websocket: "websockets.ClientConnection",
        sample_rate: int,
        data: np.ndarray,
        chunk_duration_ms: int = 100):
    """
    Stream audio data in fixed-size chunks over a WebSocket connection.

    Every sample is sent exactly once: the slicing in the loop already covers
    the final, possibly shorter, chunk, so no extra send is needed afterwards.

    Args:
        websocket (websockets.ClientConnection): Active WebSocket connection.
            The annotation is a string so that it is not evaluated when the
            function is defined.
        sample_rate (int): Audio sample rate (Hz).
        data (np.ndarray): Audio samples as int16 array.
        chunk_duration_ms (int): Duration of each chunk in milliseconds.

    Raises:
        ValueError: If the chunk duration corresponds to less than one sample.
    """
    samples_per_chunk = int(sample_rate * chunk_duration_ms / 1000.0)
    if samples_per_chunk <= 0:
        raise ValueError(
            f"chunk_duration_ms={chunk_duration_ms} at {sample_rate} Hz yields "
            f"{samples_per_chunk} samples per chunk; at least 1 is required")
    for start in range(0, len(data), samples_per_chunk):
        await websocket.send(data[start:start + samples_per_chunk].tobytes())
101
+
102
+
103
async def stream_wav_files(
        uri: str,
        wav_file_list: List[str],
        chunk_duration_ms: int = 100,
        tgt_lang: Optional[str] = None,
        src_lang: Optional[str] = None):
    """
    Send a sequence of WAV files, one after the other, to a WebSocket server.

    A new connection is opened for every file. On that connection the client:

    - sends a JSON metadata message (sample rate, file name and, when given,
      the target/source languages);
    - streams the audio in chunks;
    - sends a JSON end-of-stream message;
    - reads responses until one contains ``end_of_processing``.

    Args:
        uri (str): WebSocket server URI.
        wav_file_list (list[str]): Paths to WAV files.
        chunk_duration_ms (int): Chunk size in milliseconds.
        tgt_lang (str | None): Target language code (e.g., "en").
        src_lang (str | None): Source language code (e.g., "en").
    """
    # Optional metadata entries, in the order they are added to the message.
    optional_entries = (("target_lang", tgt_lang), ("source_lang", src_lang))
    for wav_file in wav_file_list:
        LOGGER.info(f"Streaming: {wav_file}")
        sample_rate, data = read_wav_file(wav_file)
        metadata = {
            "sample_rate": sample_rate,
            "metrics_metadata": {"wav_name": wav_file},
        }
        for key, value in optional_entries:
            if value is not None:
                metadata[key] = value
        async with websockets.connect(uri, ping_timeout=None) as websocket:
            await websocket.send(json.dumps(metadata))
            await send_audio(websocket, sample_rate, data, chunk_duration_ms)
            await websocket.send(json.dumps({"end_of_stream": True}))
            # Drain responses until the server reports the end of processing.
            finished = False
            while not finished:
                response = await websocket.recv()
                LOGGER.debug(response)
                finished = 'end_of_processing' in response
    LOGGER.info(f"All {len(wav_file_list)} files sent.")
148
+
149
+
150
def load_wav_file_list(list_file_path: str) -> List[str]:
    """
    Load a list of WAV file paths from a text file.

    Blank lines are ignored. Relative entries are resolved against the
    directory containing the list file; absolute entries are kept as they are.
    When the list file is given without a directory part, the entries are
    returned unchanged (i.e. relative to the current working directory).

    Args:
        list_file_path (str): Path to a text file, one WAV file path per line.

    Returns:
        list[str]: Paths of the WAV files, in the order they appear in the list.

    Raises:
        SystemExit: With exit code 1 if the list contains no entries.
        AssertionError: If any listed path is not an existing file.
    """
    basedir = os.path.dirname(list_file_path)
    with open(list_file_path, 'r') as list_file:
        entries = [line.strip() for line in list_file]
    # os.path.join copes with an empty basedir and with absolute entries,
    # unlike plain string concatenation with '/'.
    wav_files = [os.path.join(basedir, entry) for entry in entries if entry]
    if not wav_files:
        LOGGER.error("No valid WAV files found in the list.")
        raise SystemExit(1)
    # Raised explicitly (instead of `assert`) so the check survives `python -O`;
    # the exception type is kept for backward compatibility.
    missing = [path for path in wav_files if not os.path.isfile(path)]
    if missing:
        raise AssertionError(f"Invalid wav file in the list: {missing[0]}")
    return wav_files
169
+
170
+
171
async def main(args: argparse.Namespace):
    """
    Load the WAV file list named by the arguments and stream the files.

    Args:
        args (argparse.Namespace): Parsed command-line arguments; the
            attributes ``wav_list_file``, ``uri``, ``chunk_duration_ms``,
            ``tgt_lang`` and ``src_lang`` are read.
    """
    wav_files = load_wav_file_list(args.wav_list_file)
    await stream_wav_files(
        args.uri,
        wav_files,
        args.chunk_duration_ms,
        args.tgt_lang,
        args.src_lang,
    )
176
+
177
+
178
def cli_main():
    """
    Simulstream WebSocket client command-line interface (CLI) entry point.

    Parses the command-line arguments and runs the asynchronous :func:`main`
    routine, which streams the audio of a list of WAV files to a server in
    fixed-size chunks over a WebSocket connection.

    Example usage::

        $ python wav_reader_client.py --uri ws://localhost:8000/ --wav-list-file wav_files.txt \\
            --tgt-lang it --src-lang en

    Command-line arguments:

    - ``--uri``: WebSocket server URI (e.g., ``ws://localhost:8000/``).
    - ``--wav-list-file``: Path to a text file containing one WAV file path per line.
    - ``--chunk-duration-ms``: Duration of each audio chunk sent to the server (ms). Default: 100.
    - ``--tgt-lang``: Optional target language.
    - ``--src-lang``: Optional source language.
    """
    # Help text shared by the two language options; only the first word differs.
    language_help = (
        "{} language (if needed, its effect depends on the speech processor used by the "
        "server).")

    parser = argparse.ArgumentParser(description="Websocket client for WAV files.")
    parser.add_argument(
        "--uri",
        required=True,
        help="WebSocket server URI (e.g., ws://localhost:8000/)")
    parser.add_argument(
        "--wav-list-file",
        required=True,
        help="Path to text file containing list of WAV file paths")
    parser.add_argument(
        "--chunk-duration-ms",
        default=100,
        type=int,
        help="Size of the each chunk sent to the server in milliseconds (default: 100)")
    parser.add_argument("--tgt-lang", default=None, help=language_help.format("Target"))
    parser.add_argument("--src-lang", default=None, help=language_help.format("Source"))

    parsed_args = parser.parse_args()
    asyncio.run(main(parsed_args))
225
+
226
+
227
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    cli_main()
simulstream/config.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ from types import SimpleNamespace
16
+
17
+ import yaml
18
+
19
+
20
+ def _dict_to_object(d):
21
+ if isinstance(d, dict):
22
+ return SimpleNamespace(**{k: _dict_to_object(v) for k, v in d.items()})
23
+ elif isinstance(d, list):
24
+ return [_dict_to_object(i) for i in d]
25
+ else:
26
+ return d
27
+
28
+
29
def yaml_config(path: str) -> SimpleNamespace:
    """
    Load a YAML file and expose its content through attribute access.

    Args:
        path (str): Path to the YAML file.

    Returns:
        SimpleNamespace: The parsed content, with nested mappings converted
        by :func:`_dict_to_object`.
    """
    with open(path, "r") as file:
        parsed = yaml.safe_load(file)
        return _dict_to_object(parsed)
@@ -0,0 +1,170 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import argparse
16
+ import json
17
+ import logging
18
+ import time
19
+ from types import SimpleNamespace
20
+ from typing import List, Optional
21
+
22
+ import numpy as np
23
+
24
+ import simulstream
25
+ from simulstream.client.wav_reader_client import load_wav_file_list, read_wav_file
26
+ from simulstream.config import yaml_config
27
+ from simulstream.metrics.logger import setup_metrics_logger, METRICS_LOGGER
28
+ from simulstream.server.message_processor import MessageProcessor
29
+ from simulstream.server.speech_processors import build_speech_processor, SpeechProcessor
30
+
31
+
32
# Configure the root logger at import time: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
)
# Logger shared by every function of the inference script.
LOGGER = logging.getLogger('simulstream.inference')
38
+
39
+
40
def process_audio(
        message_processor: MessageProcessor,
        sample_rate: int,
        data: np.ndarray):
    """
    Feed audio data to a message processor in fixed-size chunks.

    The chunk length is taken from
    ``message_processor.speech_processor.speech_chunk_size``, which the
    conversion below treats as a duration in milliseconds. Every sample is
    processed exactly once: the slicing in the loop already covers the final,
    possibly shorter, chunk, so it must not be submitted again afterwards.

    Args:
        message_processor (MessageProcessor): class that processes the audio chunks.
        sample_rate (int): Audio sample rate (Hz).
        data (np.ndarray): Audio samples as int16 array.

    Raises:
        ValueError: If the chunk size corresponds to less than one sample.
    """
    samples_per_chunk = int(
        sample_rate * message_processor.speech_processor.speech_chunk_size / 1000.0)
    if samples_per_chunk <= 0:
        raise ValueError(
            f"speech_chunk_size at {sample_rate} Hz yields {samples_per_chunk} "
            "samples per chunk; at least 1 is required")
    for start in range(0, len(data), samples_per_chunk):
        output = message_processor.process_speech(
            data[start:start + samples_per_chunk].tobytes())
        LOGGER.debug(f"response: {output}")
62
+
63
+
64
def run_inference(
        speech_processor: SpeechProcessor,
        wav_file_list: List[str],
        tgt_lang: Optional[str] = None,
        src_lang: Optional[str] = None):
    """
    Process the WAV files one after the other with the given speech processor.

    For every file a new ``MessageProcessor`` is created (identified by the
    position of the file in the list); it receives the metadata (sample rate,
    file name and, when given, the target/source languages), then the audio in
    chunks, and is finally notified of the end of the stream.

    Args:
        speech_processor (SpeechProcessor): the speech processor to use to run the inference.
        wav_file_list (list[str]): Paths to WAV files.
        tgt_lang (str | None): Target language code (e.g., "en").
        src_lang (str | None): Source language code (e.g., "en").
    """
    # Optional metadata entries, in the order they are added to the metadata.
    optional_entries = (("target_lang", tgt_lang), ("source_lang", src_lang))
    for index, wav_file in enumerate(wav_file_list):
        LOGGER.info(f"Streaming: {wav_file}")
        sample_rate, data = read_wav_file(wav_file)
        metadata = {
            "sample_rate": sample_rate,
            "metrics_metadata": {"wav_name": wav_file},
        }
        for key, value in optional_entries:
            if value is not None:
                metadata[key] = value
        message_processor = MessageProcessor(index, speech_processor)
        message_processor.process_metadata(metadata)
        process_audio(message_processor, sample_rate, data)
        message_processor.end_of_stream()
    LOGGER.info(f"All {len(wav_file_list)} files sent.")
100
+
101
+
102
def main(args: argparse.Namespace):
    """
    Set up metrics logging, build the speech processor and run the inference.

    The time needed to build the speech processor is measured and written to
    the metrics log as ``model_loading_time``.

    Args:
        args (argparse.Namespace): Parsed command-line arguments; the
            attributes ``metrics_log_file``, ``speech_processor_config``,
            ``wav_list_file``, ``tgt_lang`` and ``src_lang`` are read.
    """
    metrics_config = SimpleNamespace(enabled=True, filename=args.metrics_log_file)
    setup_metrics_logger(metrics_config)

    LOGGER.info(f"Loading speech processor from {args.speech_processor_config}")
    speech_processor_config = yaml_config(args.speech_processor_config)
    LOGGER.info(f"Using as speech processor: {speech_processor_config.type}")

    # Time only the construction of the speech processor.
    start_time = time.time()
    speech_processor = build_speech_processor(speech_processor_config)
    loading_seconds = time.time() - start_time
    LOGGER.info(f"Loaded speech processor in {loading_seconds:.3f} seconds")
    METRICS_LOGGER.info(json.dumps({"model_loading_time": loading_seconds}))

    wav_files = load_wav_file_list(args.wav_list_file)
    run_inference(speech_processor, wav_files, args.tgt_lang, args.src_lang)
123
+
124
+
125
def cli_main():
    """
    Simulstream inference command-line interface (CLI) entry point.

    This script processes the specified WAV files with the configured speech
    processor and writes a metrics log file that can be used to evaluate the
    quality and latency of the speech processor.

    The function logs the package version, parses the command-line arguments
    and calls the (synchronous) :func:`main` routine.

    Example usage::

        $ python inference.py --speech-processor-config config/speech.yaml \\
            --wav-list-file wav_files.txt --tgt-lang it --src-lang en

    Command-line arguments:

    - ``--speech-processor-config`` (str, required): Path to the YAML configuration file of
      the speech processor.
    - ``--wav-list-file`` (str, required): Path to a text file listing the WAV files.
    - ``--tgt-lang`` (str, optional): Target language.
    - ``--src-lang`` (str, optional): Source language.
    - ``--metrics-log-file`` (str, optional): Path of the metrics log file to write
      (default: ``metrics.json``).
    """
    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
    parser = argparse.ArgumentParser("simulstream_inference")
    parser.add_argument(
        "--speech-processor-config",
        type=str,
        required=True,
        help="Path to the YAML configuration file of the speech processor.")
    parser.add_argument(
        "--wav-list-file",
        required=True,
        help="Path to text file containing list of WAV file paths")
    parser.add_argument(
        "--tgt-lang",
        default=None,
        help="Target language (if needed, its effect depends on the speech processor used by the "
             "server).")
    parser.add_argument(
        "--src-lang",
        default=None,
        help="Source language (if needed, its effect depends on the speech processor used by the "
             "server).")
    parser.add_argument(
        "--metrics-log-file",
        default="metrics.json",
        help="Path where to write the metrics log file.")
    main(parser.parse_args())
167
+
168
+
169
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    cli_main()
File without changes
@@ -0,0 +1,71 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ from types import SimpleNamespace
16
+ from typing import Callable, Dict, List
17
+
18
+
19
def build_hf_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
    """
    Build a detokenizer backed by the tokenizer of a ``transformers`` processor.

    Args:
        config (SimpleNamespace): Evaluation configuration; it must define
            ``hf_model_name``, which is passed to ``AutoProcessor.from_pretrained``.

    Returns:
        Callable[[List[str]], str]: Function that converts a list of tokens
        into a string.

    Raises:
        AssertionError: If ``hf_model_name`` is missing from the configuration.
    """
    # Imported here so that transformers is needed only when this builder is used.
    from transformers import AutoProcessor

    has_model_name = hasattr(config, "hf_model_name")
    assert has_model_name, \
        "`hf_model_name` required in the eval config for `hf` detokenizer"
    processor = AutoProcessor.from_pretrained(config.hf_model_name)

    def detokenize(input_tokens: List[str]) -> str:
        text = processor.tokenizer.convert_tokens_to_string(input_tokens)
        return text

    return detokenize
30
+
31
+
32
def build_canary_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
    """
    Build a detokenizer backed by the tokenizer of a NeMo ``ASRModel``.

    Args:
        config (SimpleNamespace): Evaluation configuration; it must define
            ``model_name``, which is passed to ``ASRModel.from_pretrained``.

    Returns:
        Callable[[List[str]], str]: Function that converts a list of tokens
        into text.

    Raises:
        AssertionError: If ``model_name`` is missing from the configuration.
    """
    # Imported here so that NeMo is needed only when this builder is used.
    from nemo.collections.asr.models import ASRModel

    has_model_name = hasattr(config, "model_name")
    assert has_model_name, \
        "`model_name` required in the eval config for `canary` detokenizer"
    asr_model = ASRModel.from_pretrained(model_name=config.model_name)
    tokenizer = asr_model.tokenizer

    def detokenize(input_tokens: List[str]) -> str:
        text = tokenizer.tokens_to_text(input_tokens)
        return text

    return detokenize
43
+
44
+
45
def build_simuleval_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
    """
    Build the SimulEval detokenizer selected by ``config.latency_unit``.

    Adapted from https://github.com/facebookresearch/SimulEval/blob/
    536de8253b82d805c9845440169a5010ff507357/simuleval/evaluator/instance.py#L233

    Supported latency units:

    - ``"word"``: tokens are joined with a single space;
    - ``"char"``: tokens are concatenated;
    - ``"spm"``: tokens are concatenated, every "▁" is replaced by a space and
      the result is stripped.

    Raises:
        NotImplementedError: For any other latency unit.
    """
    unit = config.latency_unit
    if unit not in ("word", "char", "spm"):
        raise NotImplementedError

    if unit == "spm":
        def detokenize(input_tokens: List[str]) -> str:
            merged = "".join(input_tokens)
            return merged.replace("▁", " ").strip()
    else:
        separator = " " if unit == "word" else ""

        def detokenize(input_tokens: List[str]) -> str:
            return separator.join(input_tokens)

    return detokenize
61
+
62
+
63
# Registry of the available detokenizer builders, keyed by the value of
# `detokenizer_type` in the configuration (see `get_detokenizer`).
_DETOKENIZER_BUILDER_MAP: Dict[str, Callable[[SimpleNamespace], Callable[[List[str]], str]]] = {
    "hf": build_hf_detokenizer,
    "canary": build_canary_detokenizer,
    "simuleval": build_simuleval_detokenizer
}
68
+
69
+
70
def get_detokenizer(config: SimpleNamespace) -> Callable[[List[str]], str]:
    """
    Build the detokenizer selected by ``config.detokenizer_type``.

    The builder registered under that name in ``_DETOKENIZER_BUILDER_MAP`` is
    called with the whole configuration.

    Raises:
        KeyError: If no builder is registered for ``config.detokenizer_type``.
    """
    builder = _DETOKENIZER_BUILDER_MAP[config.detokenizer_type]
    return builder(config)
@@ -0,0 +1,32 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import logging
16
+
17
+
18
# Logger that receives the metrics records. Propagation is turned off so the
# records reach only the handlers attached to this logger.
METRICS_LOGGER = logging.getLogger('fbk_fairseq.simultaneous.metrics')
METRICS_LOGGER.propagate = False


def setup_metrics_logger(metrics_config):
    """
    Configure ``METRICS_LOGGER`` according to the metrics configuration.

    When metrics are enabled, the logger writes the bare message of each
    record to ``metrics_config.filename`` through a single file handler.
    Handlers installed by a previous call are detached and closed, and the
    logger is re-enabled in case an earlier call had disabled it. When metrics
    are not enabled, the logger is disabled.

    Args:
        metrics_config: Object exposing the attributes ``enabled`` (bool) and,
            when enabled, ``filename`` (path of the log file).
    """
    if not metrics_config.enabled:
        METRICS_LOGGER.disabled = True
        return

    # Create the new handler first: if the file cannot be opened, the existing
    # configuration is left untouched.
    fh = logging.FileHandler(metrics_config.filename)
    fh.setFormatter(logging.Formatter('%(message)s'))

    # Detach AND close the previous handlers so their files are not leaked.
    for old_handler in list(METRICS_LOGGER.handlers):
        METRICS_LOGGER.removeHandler(old_handler)
        old_handler.close()

    METRICS_LOGGER.addHandler(fh)
    # Undo a previous `enabled: False` setup.
    METRICS_LOGGER.disabled = False
+ METRICS_LOGGER.disabled = True