simulstream-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +47 -0
- simulstream/__init__.py +15 -0
- simulstream/client/__init__.py +0 -0
- simulstream/client/wav_reader_client.py +228 -0
- simulstream/config.py +31 -0
- simulstream/inference.py +170 -0
- simulstream/metrics/__init__.py +0 -0
- simulstream/metrics/detokenizers.py +71 -0
- simulstream/metrics/logger.py +32 -0
- simulstream/metrics/readers.py +348 -0
- simulstream/metrics/score_latency.py +130 -0
- simulstream/metrics/score_quality.py +169 -0
- simulstream/metrics/scorers/__init__.py +0 -0
- simulstream/metrics/scorers/latency/__init__.py +115 -0
- simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
- simulstream/metrics/scorers/latency/stream_laal.py +119 -0
- simulstream/metrics/scorers/quality/__init__.py +132 -0
- simulstream/metrics/scorers/quality/comet.py +57 -0
- simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
- simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
- simulstream/metrics/stats.py +184 -0
- simulstream/server/__init__.py +0 -0
- simulstream/server/http_server.py +95 -0
- simulstream/server/message_processor.py +156 -0
- simulstream/server/speech_processors/__init__.py +173 -0
- simulstream/server/speech_processors/base.py +135 -0
- simulstream/server/speech_processors/base_streamatt.py +320 -0
- simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
- simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
- simulstream/server/speech_processors/incremental_output.py +85 -0
- simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
- simulstream/server/speech_processors/seamless_streamatt.py +268 -0
- simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
- simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
- simulstream/server/speech_processors/vad_wrapper.py +180 -0
- simulstream/server/websocket_server.py +236 -0
- simulstream-0.1.0.dist-info/METADATA +465 -0
- simulstream-0.1.0.dist-info/RECORD +48 -0
- simulstream-0.1.0.dist-info/WHEEL +5 -0
- simulstream-0.1.0.dist-info/entry_points.txt +8 -0
- simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
- simulstream-0.1.0.dist-info/top_level.txt +3 -0
- uts/__init__.py +0 -0
- uts/metrics/__init__.py +0 -0
- uts/metrics/log_reader.py +50 -0
- uts/speech_processors/__init__.py +0 -0
- uts/speech_processors/test_simuleval_wrapper.py +88 -0
- uts/utils.py +5 -0
simulstream/metrics/readers.py
@@ -0,0 +1,348 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import json
+from collections import OrderedDict
+from dataclasses import dataclass
+from pathlib import Path
+from types import SimpleNamespace
+from typing import List, Any, Dict
+
+import yaml
+
+from simulstream.metrics.detokenizers import get_detokenizer
+
+
+def text_items(text: str, latency_unit: str) -> List[str]:
+    """
+    Split a text string into items depending on the latency unit.
+
+    Args:
+        text (str): The input text string.
+        latency_unit (str): The unit for latency measurement. Must be either:
+            - ``"word"`` → split on whitespace.
+            - ``"char"`` → split into individual characters.
+
+    Returns:
+        List[str]: The list of word or character tokens.
+
+    Raises:
+        ValueError: If `latency_unit` is not ``"word"`` or ``"char"``.
+    """
+    if latency_unit == "word":
+        words = text.split(" ")
+        return [w for w in words if w != '']
+    elif latency_unit == "char":
+        return list(text)
+    else:
+        raise ValueError(
+            f"Latency unit `{latency_unit}` not supported. Allowed values are `word` and `char`.")
+
+
+@dataclass
+class OutputWithDelays:
+    """
+    Representation of a final output sequence and its delays.
+
+    Attributes:
+        final_text (str): The detokenized output text.
+        ideal_delays (List[float]): Latency values relative to processed audio.
+        computational_aware_delays (List[float]): Latency values including computation time.
+    """
+    final_text: str
+    ideal_delays: List[float]
+    computational_aware_delays: List[float]
+
+    def text_len(self, latency_unit: str) -> int:
+        """
+        Return the length of the text in the given latency unit.
+
+        Args:
+            latency_unit (str): Either ``"word"`` or ``"char"``.
+
+        Returns:
+            int: Number of items in the text.
+        """
+        return len(self.text_items(latency_unit))
+
+    def text_items(self, latency_unit: str) -> List[str]:
+        """
+        Return the text split into items (words or characters).
+
+        Args:
+            latency_unit (str): Either ``"word"`` or ``"char"``.
+
+        Returns:
+            List[str]: Tokens in the specified unit.
+        """
+        return text_items(self.final_text, latency_unit)
+
+    def last_word(self) -> str:
+        """
+        Return the last word of the text.
+
+        Returns:
+            str: The last word token.
+        """
+        return self.text_items("word")[-1]
+
+
+@dataclass
+class ReferenceSentenceDefinition:
+    """
+    Stores the information about a reference sentence.
+
+    Attributes:
+        content (str): The sentence text.
+        start_time (float): Start time (in seconds) of the segment.
+        duration (float): Duration (in seconds) of the segment.
+    """
+    content: str
+    start_time: float
+    duration: float
+
+
+class LogReader:
+    """
+    Reads and processes JSONL metric logs written by the websocket server.
+
+    This class rebuilds the final outputs (ignoring retranslated tokens) and provides access to
+    fine-grained information.
+
+    Args:
+        config (SimpleNamespace): Configuration namespace, used for detokenizer setup.
+        filepath (str): Path to the log file (JSONL format).
+        latency_unit (str, optional): Latency measurement unit, ``"word"`` or ``"char"``.
+    """
+    def __init__(self, config: SimpleNamespace, filepath: str, latency_unit: str = "word"):
+        self.filepath = filepath
+        self.detokenizer = get_detokenizer(config)
+        self.outputs_by_audio = self._get_outputs()
+        self.latency_unit = latency_unit
+
+    def _get_outputs(self) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Group outputs by audio file.
+
+        Returns:
+            Dict[str, List[Dict[str, Any]]]: Mapping of audio name → list of log entries.
+        """
+        outputs_by_audio = OrderedDict()
+        audio_id_map = {}
+        for line in self._read_all():
+            if 'metadata' in line:
+                audio_id_map[line['id']] = Path(line['metadata']['wav_name']).stem
+            elif 'id' in line:
+                assert line['id'] in audio_id_map, \
+                    f'{line["id"]} not associated with audio file'
+                audio_name = audio_id_map[line['id']]
+                if audio_name not in outputs_by_audio:
+                    outputs_by_audio[audio_name] = []
+                outputs_by_audio[audio_name].append(line)
+        return outputs_by_audio
+
+    def _read_all(self) -> List[Any]:
+        data = []
+        with open(self.filepath, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():  # skip empty lines
+                    data.append(json.loads(line))
+        return data
+
+    def num_deleted_tokens(self) -> int:
+        """
+        Count the number of deleted tokens across all outputs.
+
+        Returns:
+            int: Total count of deleted tokens.
+        """
+        num_deleted_tokens = 0
+        for audio, lines in self.outputs_by_audio.items():
+            for line in lines:
+                if len(line['deleted_tokens']) > 0:
+                    num_deleted_tokens += len(
+                        text_items(
+                            self.detokenizer(line['deleted_tokens']),
+                            latency_unit=self.latency_unit))
+        return num_deleted_tokens
+
+    def final_outputs_and_latencies(self) -> Dict[str, OutputWithDelays]:
+        """
+        Compute the final outputs and their associated delays.
+
+        Retranslated (overridden) tokens are excluded from the output and from the delays. When a
+        word is partially updated (e.g., only the last subword is updated), the last update latency
+        is considered.
+
+        Returns:
+            Dict[str, OutputWithDelays]: Mapping of audio file → output with delays.
+        """
+        outputs: OrderedDict[str, OutputWithDelays] = OrderedDict()
+        for audio, lines in self.outputs_by_audio.items():
+            tokens = []
+            current_output = None
+            for line in lines:
+                line_delay = line['total_audio_processed']
+                line_comp_aware_delay = line['total_audio_processed'] + line['computation_time']
+                # remove tokens from previous generation
+                if len(line['deleted_tokens']) > 0:
+                    assert line['deleted_tokens'] == tokens[-len(line['deleted_tokens']):]
+                    tokens = tokens[:-len(line['deleted_tokens'])]
+                    # update the current output by removing text and corresponding delays
+                    new_output = OutputWithDelays(
+                        self.detokenizer(tokens),
+                        current_output.ideal_delays,
+                        current_output.computational_aware_delays)
+                    removed_tokens = current_output.text_len(self.latency_unit) - \
+                        new_output.text_len(self.latency_unit)
+                    if removed_tokens > 0:
+                        new_output.ideal_delays = new_output.ideal_delays[:-removed_tokens]
+                        new_output.computational_aware_delays = \
+                            new_output.computational_aware_delays[:-removed_tokens]
+                    # if the latency unit is `word` and part of the last word has been deleted
+                    # we update the latency of the last word
+                    if self.latency_unit == "word":
+                        previous_ending_word_idx = new_output.text_len("word") - 1
+                        if previous_ending_word_idx >= 0:
+                            ending_word_before_update = current_output.text_items("word")[
+                                previous_ending_word_idx]
+                            if ending_word_before_update != new_output.last_word():
+                                new_output.ideal_delays[-1] = line_delay
+                                new_output.computational_aware_delays[-1] = line_comp_aware_delay
+                    current_output = new_output
+
+                # add newly generated tokens
+                tokens.extend(line['generated_tokens'])
+                # for the first line, we initialize the OutputWithDelays with the partial text,
+                # assigning the ideal delay and the computational-aware one to all its units
+                if current_output is None:
+                    current_output = OutputWithDelays(self.detokenizer(tokens), [], [])
+                    num_units = current_output.text_len(self.latency_unit)
+                    current_output.ideal_delays = [line_delay] * num_units
+                    current_output.computational_aware_delays = [line_comp_aware_delay] * num_units
+                else:
+                    # update the current output by adding corresponding delays
+                    new_output = OutputWithDelays(
+                        self.detokenizer(tokens),
+                        current_output.ideal_delays,
+                        current_output.computational_aware_delays)
+                    added_units = new_output.text_len(self.latency_unit) - \
+                        current_output.text_len(self.latency_unit)
+                    if added_units > 0:
+                        new_output.ideal_delays.extend([line_delay] * added_units)
+                        new_output.computational_aware_delays.extend(
+                            [line_comp_aware_delay] * added_units)
+                    # if the latency unit is `word` and part of the last word has been updated
+                    # we update the latency of the last word
+                    if self.latency_unit == "word":
+                        previous_ending_word_idx = current_output.text_len("word") - 1
+                        if previous_ending_word_idx >= 0:
+                            previous_ending_word_after_update = new_output.text_items("word")[
+                                previous_ending_word_idx]
+                            if previous_ending_word_after_update != current_output.last_word():
+                                new_output.ideal_delays[previous_ending_word_idx] = line_delay
+                                new_output.computational_aware_delays[previous_ending_word_idx] = \
+                                    line_comp_aware_delay
+                    current_output = new_output
+            outputs[audio] = current_output
+        return outputs
+
+    def final_outputs(self) -> Dict[str, str]:
+        """
+        Returns the final outputs for each audio.
+
+        Overridden tokens in retranslation are not included in the output, which is the final
+        string obtained at the end of the audio file.
+
+        Returns:
+            Dict[str, str]: Mapping of audio file → final text.
+        """
+        outputs: OrderedDict[str, str] = OrderedDict()
+        for audio, outputs_with_latency in self.final_outputs_and_latencies().items():
+            outputs[audio] = outputs_with_latency.final_text
+        return outputs
+
+
+class ReferencesReader:
+    """
+    Reads plain-text reference files. Each file corresponds to a single audio.
+
+    Args:
+        reference_files (List[str]): Paths to reference files.
+    """
+    def __init__(self, reference_files: List[str]):
+        self.references = self._read_all(reference_files)
+
+    @staticmethod
+    def _read_all(references: List[str]) -> Dict[str, List[str]]:
+        reference_by_file = OrderedDict()
+        for reference in references:
+            with open(reference, 'r', encoding='utf-8') as f:
+                reference_by_file[Path(reference).stem] = [line.strip() for line in f.readlines()]
+        return reference_by_file
+
+    def get_reference_texts(self) -> Dict[str, List[str]]:
+        """
+        Get the references grouped by file.
+
+        Returns:
+            Dict[str, List[str]]: Mapping of file stem → list of reference sentences.
+        """
+        return self.references
+
+
+class YamlReferenceReader:
+    """
+    Reads references aligned with audio definitions.
+
+    The audio definition is a YAML file where each entry describes a segment with its start and
+    duration. The reference file contains one sentence per line, where each line is associated
+    with the corresponding segment in the audio definition file.
+
+    Args:
+        audio_definition (str): Path to YAML file with segment definitions.
+        reference (str): Path to text file with reference sentences.
+    """
+    def __init__(self, audio_definition: str, reference: str):
+        self.references = self._read_all(audio_definition, reference)
+
+    @staticmethod
+    def _read_all(
+            audio_definition: str, reference: str) -> Dict[str, List[ReferenceSentenceDefinition]]:
+        reference_by_file = OrderedDict()
+        with open(audio_definition) as f:
+            sentence_definitions = yaml.load(f, Loader=yaml.FullLoader)
+        with open(reference) as f:
+            sentences = f.readlines()
+        assert len(sentence_definitions) == len(sentences), \
+            f"Number of reference sentences ({len(sentences)}) and sentence definitions " \
+            f"({len(sentence_definitions)}) should be the same."
+        for sentence, definition in zip(sentences, sentence_definitions):
+            wav_name = Path(definition["wav"]).stem
+            if wav_name not in reference_by_file:
+                reference_by_file[wav_name] = []
+            reference_by_file[wav_name].append(ReferenceSentenceDefinition(
+                sentence.strip(), definition["offset"], definition["duration"]))
+        return reference_by_file
+
+    def get_reference_texts(self) -> Dict[str, List[str]]:
+        """
+        Get the references grouped by file.
+
+        Returns:
+            Dict[str, List[str]]: Mapping of file stem → list of reference sentences.
+        """
+        return OrderedDict({
+            name: [sentence_def.content for sentence_def in list_sentences]
+            for name, list_sentences in self.references.items()})
simulstream/metrics/score_latency.py
@@ -0,0 +1,130 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import argparse
+import logging
+
+import simulstream
+from simulstream.config import yaml_config
+from simulstream.metrics.readers import LogReader, YamlReferenceReader
+from simulstream.metrics.scorers.latency import LATENCY_SCORER_REGISTRY, LatencyScorer, \
+    LatencyScoringSample
+
+
+logging.basicConfig(
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+    force=True
+)
+LOGGER = logging.getLogger('simulstream.score_latency')
+
+
+def main(scorer_cls: type[LatencyScorer], args: argparse.Namespace):
+    """
+    Main entry point for latency scoring.
+
+    Loads system outputs from a log file, builds scoring samples with segment-level references,
+    and computes latency scores using the specified scorer.
+
+    The score (in seconds) is printed on standard output.
+
+    Args:
+        scorer_cls (type[LatencyScorer]): The latency scorer class to use.
+        args (argparse.Namespace): Parsed command-line arguments.
+    """
+    LOGGER.info(f"Loading evaluation configuration from {args.eval_config}")
+    eval_config = yaml_config(args.eval_config)
+    LOGGER.info(f"Reading log file ({args.log_file})")
+    log_reader = LogReader(eval_config, args.log_file, latency_unit=args.latency_unit)
+
+    LOGGER.info(f"Building latency scorer {args.scorer}")
+    scorer = scorer_cls(args)
+
+    LOGGER.info(
+        f"Reading audio definition ({args.audio_definition}) and reference ({args.reference})")
+    references = None
+    if scorer.requires_reference():
+        reference_reader = YamlReferenceReader(args.audio_definition, args.reference)
+        references = reference_reader.references
+
+    output_with_latency = log_reader.final_outputs_and_latencies()
+
+    if references is not None:
+        audio_files = references.keys()
+    else:
+        audio_files = output_with_latency.keys()
+
+    samples = []
+    for audio_file in audio_files:
+        reference = references[audio_file] if references is not None else None
+        samples.append(LatencyScoringSample(
+            audio_file, output_with_latency[audio_file], reference))
+
+    score = scorer.score(samples)
+    print(f"Latency scores (in seconds): {score}")
+
+
+def cli_main():
+    """
+    Latency scoring script for Simulstream evaluation.
+
+    This module provides tools to compute latency metrics for streaming speech translation or
+    recognition. It supports multiple latency scorers through a pluggable registry
+    (:data:`LATENCY_SCORER_REGISTRY`).
+
+    The script works with JSONL log files generated during inference.
+
+    Typical usage from the command line::
+
+        $ python -m simulstream.metrics.score_latency \\
+            --eval-config config/speech-processor.yaml \\
+            --log-file metrics.jsonl \\
+            --audio-definition segments.yaml \\
+            --reference ref.txt \\
+            --scorer stream_laal
+    """
+    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
+    parser = argparse.ArgumentParser("score_latency")
+    parser.add_argument(
+        "--eval-config", type=str, required=True,
+        help="Path to the yaml config file containing information about the tokenizer to be used.")
+    parser.add_argument(
+        "--log-file", type=str, required=True,
+        help="Path to the log file with the metrics to be used for the evaluation.")
+    parser.add_argument(
+        "--reference", "-r", type=str,
+        help="Path to the textual file containing segment-level references stored line by line.")
+    parser.add_argument(
+        "--audio-definition", "-a", type=str, required=True,
+        help="Path to the yaml file containing the segment-level audio information.")
+    parser.add_argument(
+        "--latency-unit", choices=["word", "char"], default="word",
+        help="Whether to compute latency based on words or characters. Default: word.")
+    parser.add_argument("--scorer", choices=LATENCY_SCORER_REGISTRY.keys(), required=True)
+    args, _ = parser.parse_known_args()
+
+    # build full parser with scorer-specific args
+    parser = argparse.ArgumentParser(parents=[parser], add_help=False)
+    scorer_cls = LATENCY_SCORER_REGISTRY[args.scorer]
+    scorer_cls.add_arguments(parser)
+
+    # parse new arguments
+    args = parser.parse_args()
+
+    main(scorer_cls, args)
+
+
+if __name__ == "__main__":
+    cli_main()
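
The audio definition and reference files that score_latency expects can be sketched as follows. The segment keys (wav, offset, duration) are the ones YamlReferenceReader reads in readers.py above; the file names and values are illustrative only.

# Hypothetical inputs for score_latency; the keys ("wav", "offset",
# "duration") follow YamlReferenceReader, file names and values are made up.
import yaml

segments = [
    {"wav": "talk1.wav", "offset": 0.0, "duration": 4.2},
    {"wav": "talk1.wav", "offset": 4.2, "duration": 3.1},
]
with open("segments.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(segments, f)

# One reference sentence per segment, in the same order as segments.yaml;
# YamlReferenceReader asserts that the two line counts match.
with open("ref.txt", "w", encoding="utf-8") as f:
    f.write("Hello world!\n")
    f.write("How are you?\n")

# Scoring then follows the CLI shown in the docstring above, with both
# segments grouped under the stem "talk1":
#   python -m simulstream.metrics.score_latency --eval-config <cfg.yaml> \
#       --log-file metrics.jsonl --audio-definition segments.yaml \
#       --reference ref.txt --scorer stream_laal
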
simulstream/metrics/score_quality.py
@@ -0,0 +1,169 @@
+# Copyright 2025 FBK
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import argparse
+import logging
+
+import simulstream
+from simulstream.config import yaml_config
+from simulstream.metrics.readers import LogReader, ReferencesReader, YamlReferenceReader
+from simulstream.metrics.scorers.quality import QUALITY_SCORER_REGISTRY, QualityScorer, \
+    QualityScoringSample
+
+
+logging.basicConfig(
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+    force=True
+)
+LOGGER = logging.getLogger('simulstream.score_quality')
+
+
+def main(scorer_cls: type[QualityScorer], args: argparse.Namespace):
+    """
+    Main entry point for quality scoring.
+
+    This function loads the evaluation configuration, system hypotheses, and reference/transcript
+    data (if required), then constructs scoring samples and computes the final quality score using
+    the selected scorer.
+
+    The output is printed on standard output.
+
+    Args:
+        scorer_cls (type[QualityScorer]): Class implementing the quality metric.
+        args (argparse.Namespace): Parsed command-line arguments.
+    """
+    LOGGER.info(f"Loading evaluation configuration from {args.eval_config}")
+    eval_config = yaml_config(args.eval_config)
+    log_reader = LogReader(eval_config, args.log_file)
+
+    LOGGER.info(f"Building scorer class for {args.scorer}")
+    scorer = scorer_cls(args)
+
+    LOGGER.info("Reading source and reference definition")
+    reference_reader = None
+    transcripts_reader = None
+    if args.audio_definition is not None:
+        if scorer.requires_reference():
+            assert len(args.references) == 1, \
+                "When audio definition is provided, only one reference file should be provided."
+            reference_reader = YamlReferenceReader(args.audio_definition, args.references[0])
+        if scorer.requires_source():
+            assert len(args.transcripts) == 1, \
+                "When audio definition is provided, only one transcript file should be provided."
+            transcripts_reader = YamlReferenceReader(args.audio_definition, args.transcripts[0])
+    else:
+        if scorer.requires_reference():
+            reference_reader = ReferencesReader(args.references)
+        if scorer.requires_source():
+            transcripts_reader = ReferencesReader(args.transcripts)
+
+    hypothesis_dictionary = log_reader.final_outputs()
+    transcript_dictionary = None
+    reference_dictionary = None
+    audio_files_to_score = None
+    if transcripts_reader is not None:
+        transcript_dictionary = transcripts_reader.get_reference_texts()
+        audio_files_to_score = transcript_dictionary.keys()
+    if reference_reader is not None:
+        reference_dictionary = reference_reader.get_reference_texts()
+        audio_files_to_score = reference_dictionary.keys()
+
+    scoring_samples = []
+    for audio_name in audio_files_to_score:
+        transcript = None
+        if transcript_dictionary is not None:
+            transcript = transcript_dictionary[audio_name]
+        reference = None
+        if reference_dictionary is not None:
+            reference = reference_dictionary[audio_name]
+        if transcript is not None and reference is not None:
+            assert len(reference) == len(transcript), \
+                f"Reference ({audio_name}) has mismatched number of target ({len(reference)}) " \
+                f"and source lines ({len(transcript)})"
+
+        scoring_samples.append(QualityScoringSample(
+            audio_name, hypothesis_dictionary[audio_name], reference, transcript))
+
+    LOGGER.info("Scoring outputs")
+    score = scorer.score(scoring_samples)
+
+    print(f"{args.scorer} score: {score}")
+
+
+def cli_main():
+    """
+    Quality scoring script for Simulstream evaluation.
+
+    This module provides functionality to compute quality-based evaluation metrics on system
+    outputs stored in JSONL log files. It uses pluggable scorers from the
+    :mod:`simulstream.metrics.scorers.quality` registry and compares system outputs against
+    references and/or transcripts.
+
+    It supports:
+    - **Reference-based metrics** (e.g., BLEU, COMET).
+    - **Source-based metrics** (e.g., reference-free COMET).
+    - Hybrid setups when both references and transcripts are available.
+
+    The script can be invoked as a standalone CLI::
+
+        $ python -m simulstream.metrics.score_quality \\
+            --eval-config config/speech-processor.yaml \\
+            --log-file metrics.jsonl \\
+            --references ref.en \\
+            --transcripts src.it \\
+            --scorer sacrebleu
+    """
+    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
+    parser = argparse.ArgumentParser("score_quality")
+    parser.add_argument(
+        "--eval-config", type=str, required=True,
+        help="Path to the yaml config file containing information about the tokenizer to be used.")
+    parser.add_argument(
+        "--log-file", type=str, required=True,
+        help="Path to the log file with the metrics to be used for the evaluation.")
+    parser.add_argument(
+        "--references", nargs="+", type=str,
+        help="Paths to the textual files containing references. If `--audio-definition` is "
+             "specified, this should be a single file containing the reference lines for all "
+             "audios, with the same number of lines as the audio definition. Otherwise, this "
+             "should be a list of files, each containing the lines corresponding to one audio "
+             "file.")
+    parser.add_argument(
+        "--transcripts", nargs="+", type=str,
+        help="Paths to the textual files containing reference transcripts. If "
+             "`--audio-definition` is specified, this should be a single file containing the "
+             "transcript lines for all audios, with the same number of lines as the audio "
+             "definition. Otherwise, this should be a list of files, each containing the lines "
+             "corresponding to one audio file.")
+    parser.add_argument(
+        "--audio-definition", "-a", type=str, default=None,
+        help="Path to the yaml file containing the segment-level audio information.")
+    parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
+    args, _ = parser.parse_known_args()
+
+    # build full parser with scorer-specific args
+    parser = argparse.ArgumentParser(parents=[parser], add_help=False)
+    scorer_cls = QUALITY_SCORER_REGISTRY[args.scorer]
+    scorer_cls.add_arguments(parser)
+
+    # parse new arguments
+    args = parser.parse_args()
+
+    main(scorer_cls, args)
+
+
+if __name__ == "__main__":
+    cli_main()
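
When no --audio-definition is given, score_quality falls back to ReferencesReader, which takes one reference file per audio, keyed by file stem. A minimal sketch, assuming the file names are illustrative and that the stems match the wav names recorded in the metrics log:

# Hypothetical per-audio reference layout for score_quality without
# --audio-definition; names are made up, and each stem must match a wav
# stem from the metrics log ("talk1.wav" -> "talk1.txt").
from simulstream.metrics.readers import ReferencesReader

for name, lines in [("talk1.txt", ["Hello world!", "How are you?"]),
                    ("talk2.txt", ["Good morning."])]:
    with open(name, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")

reader = ReferencesReader(["talk1.txt", "talk2.txt"])
print(reader.get_reference_texts())
# {'talk1': ['Hello world!', 'How are you?'], 'talk2': ['Good morning.']}
# (an OrderedDict keyed by file stem)

# Equivalent CLI call with a reference-based scorer such as sacrebleu:
#   python -m simulstream.metrics.score_quality --eval-config <cfg.yaml> \
#       --log-file metrics.jsonl --references talk1.txt talk2.txt \
#       --scorer sacrebleu
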