simulstream 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. docs/source/conf.py +47 -0
  2. simulstream/__init__.py +15 -0
  3. simulstream/client/__init__.py +0 -0
  4. simulstream/client/wav_reader_client.py +228 -0
  5. simulstream/config.py +31 -0
  6. simulstream/inference.py +170 -0
  7. simulstream/metrics/__init__.py +0 -0
  8. simulstream/metrics/detokenizers.py +71 -0
  9. simulstream/metrics/logger.py +32 -0
  10. simulstream/metrics/readers.py +348 -0
  11. simulstream/metrics/score_latency.py +130 -0
  12. simulstream/metrics/score_quality.py +169 -0
  13. simulstream/metrics/scorers/__init__.py +0 -0
  14. simulstream/metrics/scorers/latency/__init__.py +115 -0
  15. simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
  16. simulstream/metrics/scorers/latency/stream_laal.py +119 -0
  17. simulstream/metrics/scorers/quality/__init__.py +132 -0
  18. simulstream/metrics/scorers/quality/comet.py +57 -0
  19. simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
  20. simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
  21. simulstream/metrics/stats.py +184 -0
  22. simulstream/server/__init__.py +0 -0
  23. simulstream/server/http_server.py +95 -0
  24. simulstream/server/message_processor.py +156 -0
  25. simulstream/server/speech_processors/__init__.py +173 -0
  26. simulstream/server/speech_processors/base.py +135 -0
  27. simulstream/server/speech_processors/base_streamatt.py +320 -0
  28. simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
  29. simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
  30. simulstream/server/speech_processors/incremental_output.py +85 -0
  31. simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
  32. simulstream/server/speech_processors/seamless_streamatt.py +268 -0
  33. simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
  34. simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
  35. simulstream/server/speech_processors/vad_wrapper.py +180 -0
  36. simulstream/server/websocket_server.py +236 -0
  37. simulstream-0.1.0.dist-info/METADATA +465 -0
  38. simulstream-0.1.0.dist-info/RECORD +48 -0
  39. simulstream-0.1.0.dist-info/WHEEL +5 -0
  40. simulstream-0.1.0.dist-info/entry_points.txt +8 -0
  41. simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
  42. simulstream-0.1.0.dist-info/top_level.txt +3 -0
  43. uts/__init__.py +0 -0
  44. uts/metrics/__init__.py +0 -0
  45. uts/metrics/log_reader.py +50 -0
  46. uts/speech_processors/__init__.py +0 -0
  47. uts/speech_processors/test_simuleval_wrapper.py +88 -0
  48. uts/utils.py +5 -0
@@ -0,0 +1,348 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import json
16
+ from collections import OrderedDict
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from types import SimpleNamespace
20
+ from typing import List, Any, Dict
21
+
22
+ import yaml
23
+
24
+ from simulstream.metrics.detokenizers import get_detokenizer
25
+
26
+
27
def text_items(text: str, latency_unit: str) -> List[str]:
    """
    Split a text string into latency-measurable items.

    Args:
        text (str): The input text string.
        latency_unit (str): The unit for latency measurement. Must be either:
            - ``"word"`` → split on single spaces, dropping empty fragments.
            - ``"char"`` → split into individual characters.

    Returns:
        List[str]: The list of word or character tokens.

    Raises:
        ValueError: If `latency_unit` is not ``"word"`` or ``"char"``.
    """
    if latency_unit == "char":
        return list(text)
    if latency_unit == "word":
        # Split on explicit spaces only (not arbitrary whitespace) and drop
        # empty fragments produced by consecutive/leading/trailing spaces.
        return [token for token in text.split(" ") if token != '']
    raise ValueError(
        f"Latency unit `{latency_unit}` not supported. Allowed values are `word` and `char`.")
51
+
52
+
53
@dataclass
class OutputWithDelays:
    """
    A final output sequence together with its per-item delays.

    Attributes:
        final_text (str): The detokenized output text.
        ideal_delays (List[float]): Delays measured against the processed audio only.
        computational_aware_delays (List[float]): Delays that also include computation time.
    """
    final_text: str
    ideal_delays: List[float]
    computational_aware_delays: List[float]

    def text_items(self, latency_unit: str) -> List[str]:
        """
        Return ``final_text`` split into items.

        Args:
            latency_unit (str): Either ``"word"`` or ``"char"``.

        Returns:
            List[str]: Tokens in the specified unit.
        """
        return text_items(self.final_text, latency_unit)

    def text_len(self, latency_unit: str) -> int:
        """
        Return the number of items ``final_text`` contains in the given unit.

        Args:
            latency_unit (str): Either ``"word"`` or ``"char"``.

        Returns:
            int: Number of items in the text.
        """
        return len(self.text_items(latency_unit))

    def last_word(self) -> str:
        """
        Return the last word of the text.

        Returns:
            str: The last word token.
        """
        words = self.text_items("word")
        return words[-1]
99
+
100
+
101
@dataclass
class ReferenceSentenceDefinition:
    """
    Information about a single reference sentence and its audio segment.

    Attributes:
        content (str): The sentence text.
        start_time (float): Segment start time in seconds.
        duration (float): Segment duration in seconds.
    """
    content: str
    start_time: float
    duration: float
114
+
115
+
116
class LogReader:
    """
    Reads and processes JSONL metric logs written by the websocket server.

    This class rebuilds the final outputs (ignoring retranslated tokens) and provides access to
    fine-grained information.

    Args:
        config (SimpleNamespace): Configuration namespace, used for detokenizer setup.
        filepath (str): Path to the log file (JSONL format).
        latency_unit (str, optional): Latency measurement unit, ``"word"`` or ``"char"``.
    """
    def __init__(self, config: SimpleNamespace, filepath: str, latency_unit: str = "word"):
        self.filepath = filepath
        self.detokenizer = get_detokenizer(config)
        # `_get_outputs` only reads `self.filepath`, so calling it before
        # `latency_unit` is assigned is safe.
        self.outputs_by_audio = self._get_outputs()
        self.latency_unit = latency_unit

    def _get_outputs(self) -> Dict[str, List[Dict[str, Any]]]:
        """
        Group outputs by audio file.

        Lines carrying a ``metadata`` field bind a request id to a wav file; every
        subsequent line with that id is grouped under the wav file's stem.

        Returns:
            Dict[str, List[Dict[str, Any]]]: Mapping of audio name → list of log entries.

        Raises:
            AssertionError: If an output line references an id never introduced by a
                metadata line.
        """
        outputs_by_audio = OrderedDict()
        audio_id_map = {}
        for line in self._read_all():
            if 'metadata' in line:
                audio_id_map[line['id']] = Path(line['metadata']['wav_name']).stem
            elif 'id' in line:
                assert line['id'] in audio_id_map, \
                    f'{line["id"]} not associated with audio file'
                audio_name = audio_id_map[line['id']]
                if audio_name not in outputs_by_audio:
                    outputs_by_audio[audio_name] = []
                outputs_by_audio[audio_name].append(line)
        return outputs_by_audio

    def _read_all(self) -> List[Any]:
        """Parse every non-empty line of the JSONL log file into a Python object."""
        data = []
        with open(self.filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # skip empty lines
                    data.append(json.loads(line))
        return data

    def num_deleted_tokens(self) -> int:
        """
        Count the number of deleted tokens across all outputs.

        Each deleted-token sequence is detokenized first and counted in
        ``self.latency_unit`` items (words or characters), not raw subword tokens.

        Returns:
            int: Total count of deleted tokens.
        """
        num_deleted_tokens = 0
        for audio, lines in self.outputs_by_audio.items():
            for line in lines:
                if len(line['deleted_tokens']) > 0:
                    num_deleted_tokens += len(
                        text_items(
                            self.detokenizer(line['deleted_tokens']),
                            latency_unit=self.latency_unit))
        return num_deleted_tokens

    def final_outputs_and_latencies(self) -> Dict[str, OutputWithDelays]:
        """
        Compute the final outputs and their associated delays.

        Retranslated (overridden) tokens are excluded from the output and from the delays. When a
        word is partially updated (e.g., only the last subword is updated), the last update latency
        is considered.

        Returns:
            Dict[str, OutputWithDelays]: Mapping of audio file → output with delays.
        """
        outputs: OrderedDict[str, OutputWithDelays] = OrderedDict()
        for audio, lines in self.outputs_by_audio.items():
            tokens = []
            current_output = None
            for line in lines:
                # Ideal delay: audio consumed so far; computation-aware delay also adds
                # the processing time reported for this line.
                line_delay = line['total_audio_processed']
                line_comp_aware_delay = line['total_audio_processed'] + line['computation_time']
                # remove tokens from previous generation
                # NOTE(review): a deletion in the very first line would hit
                # `current_output is None` below — assumed not to occur in valid logs.
                if len(line['deleted_tokens']) > 0:
                    # Deletions always affect the tail of the running token list.
                    assert line['deleted_tokens'] == tokens[-len(line['deleted_tokens']):]
                    tokens = tokens[:-len(line['deleted_tokens'])]
                    # update the current output by removing text and corresponding delays
                    # (delay lists are shared with `current_output` here, but `current_output`
                    # is replaced right after, so the aliasing is harmless)
                    new_output = OutputWithDelays(
                        self.detokenizer(tokens),
                        current_output.ideal_delays,
                        current_output.computational_aware_delays)
                    removed_tokens = current_output.text_len(self.latency_unit) - \
                        new_output.text_len(self.latency_unit)
                    if removed_tokens > 0:
                        new_output.ideal_delays = new_output.ideal_delays[:-removed_tokens]
                        new_output.computational_aware_delays = \
                            new_output.computational_aware_delays[:-removed_tokens]
                    # if the latency unit is `word` and part of the last word has been deleted
                    # we update the latency of the last word
                    if self.latency_unit == "word":
                        previous_ending_word_idx = new_output.text_len("word") - 1
                        if previous_ending_word_idx >= 0:
                            ending_word_before_update = current_output.text_items("word")[
                                previous_ending_word_idx]
                            if ending_word_before_update != new_output.last_word():
                                new_output.ideal_delays[-1] = line_delay
                                new_output.computational_aware_delays[-1] = line_comp_aware_delay
                    current_output = new_output

                # add newly generated tokens
                tokens.extend(line['generated_tokens'])
                # for the first line, we initialize the OutputWithDelays with the partial text and
                # assigning the ideal delay and computational-aware one to all its units
                if current_output is None:
                    current_output = OutputWithDelays(self.detokenizer(tokens), [], [])
                    num_units = current_output.text_len(self.latency_unit)
                    current_output.ideal_delays = [line_delay] * num_units
                    current_output.computational_aware_delays = [line_comp_aware_delay] * num_units
                else:
                    # update the current output by adding corresponding delays
                    new_output = OutputWithDelays(
                        self.detokenizer(tokens),
                        current_output.ideal_delays,
                        current_output.computational_aware_delays)
                    added_units = new_output.text_len(self.latency_unit) - \
                        current_output.text_len(self.latency_unit)
                    if added_units > 0:
                        new_output.ideal_delays.extend([line_delay] * added_units)
                        new_output.computational_aware_delays.extend(
                            [line_comp_aware_delay] * added_units)
                    # if the latency unit is `word` and part of the last word has been updated
                    # we update the latency of the last word
                    if self.latency_unit == "word":
                        previous_ending_word_idx = current_output.text_len("word") - 1
                        if previous_ending_word_idx >= 0:
                            previous_ending_word_after_update = new_output.text_items("word")[
                                previous_ending_word_idx]
                            if previous_ending_word_after_update != current_output.last_word():
                                new_output.ideal_delays[previous_ending_word_idx] = line_delay
                                new_output.computational_aware_delays[previous_ending_word_idx] = \
                                    line_comp_aware_delay
                    current_output = new_output
            outputs[audio] = current_output
        return outputs

    def final_outputs(self) -> Dict[str, str]:
        """
        Returns the final outputs for each audio.

        Overridden tokens in retranslation are not included in the output, which is the final
        string obtained at the end of the audio file.

        Returns:
            Dict[str, str]: Mapping of audio file → final text.
        """
        outputs: OrderedDict[str, str] = OrderedDict()
        for audio, outputs_with_latency in self.final_outputs_and_latencies().items():
            outputs[audio] = outputs_with_latency.final_text
        return outputs
275
+
276
+
277
class ReferencesReader:
    """
    Loads plain-text reference files, one file per audio.

    Args:
        reference_files (List[str]): Paths to reference files.
    """
    def __init__(self, reference_files: List[str]):
        self.references = self._read_all(reference_files)

    @staticmethod
    def _read_all(references: List[str]) -> Dict[str, List[str]]:
        # Map each file stem to its stripped lines, preserving the given file order.
        reference_by_file = OrderedDict()
        for ref_path in references:
            with open(ref_path, 'r', encoding='utf-8') as ref_file:
                sentences = [sentence.strip() for sentence in ref_file]
            reference_by_file[Path(ref_path).stem] = sentences
        return reference_by_file

    def get_reference_texts(self) -> Dict[str, List[str]]:
        """
        Get the references grouped by file.

        Returns:
            Dict[str, List[str]]: Mapping of file stem → list of reference sentences.
        """
        return self.references
303
+
304
+
305
class YamlReferenceReader:
    """
    Reads references aligned with audio definitions.

    The audio definition is a YAML file where each entry describes a segment with its start and
    duration. The reference file contains one sentence per line, where each line is associated
    with the corresponding segment in the audio definition file.

    Args:
        audio_definition (str): Path to YAML file with segment definitions.
        reference (str): Path to text file with reference sentences.
    """
    def __init__(self, audio_definition: str, reference: str):
        self.references = self._read_all(audio_definition, reference)

    @staticmethod
    def _read_all(
            audio_definition: str, reference: str) -> Dict[str, List[ReferenceSentenceDefinition]]:
        """
        Pair each reference sentence with its segment definition, grouped by wav file.

        Returns:
            Dict[str, List[ReferenceSentenceDefinition]]: Mapping of wav stem → sentence
                definitions, in segment order.

        Raises:
            AssertionError: If the number of reference lines does not match the number
                of segment definitions.
        """
        reference_by_file = OrderedDict()
        # Fix: open with an explicit encoding, consistently with the other readers
        # in this module, instead of relying on the platform default.
        with open(audio_definition, 'r', encoding='utf-8') as f:
            # NOTE(review): segment definitions are plain data, so `yaml.safe_load` would
            # also work and is safer on untrusted files; FullLoader kept for compatibility.
            sentence_definitions = yaml.load(f, Loader=yaml.FullLoader)
        with open(reference, 'r', encoding='utf-8') as f:
            sentences = f.readlines()
        assert len(sentence_definitions) == len(sentences), \
            f"Number of reference sentences ({len(sentences)}) and sentence definitions " \
            f"({len(sentence_definitions)}) should be the same."
        for sentence, definition in zip(sentences, sentence_definitions):
            wav_name = Path(definition["wav"]).stem
            if wav_name not in reference_by_file:
                reference_by_file[wav_name] = []
            reference_by_file[wav_name].append(ReferenceSentenceDefinition(
                sentence.strip(), definition["offset"], definition["duration"]))
        return reference_by_file

    def get_reference_texts(self) -> Dict[str, List[str]]:
        """
        Get the references grouped by file.

        Returns:
            Dict[str, List[str]]: Mapping of file stem → list of reference sentences.
        """
        return OrderedDict({
            name: [sentence_def.content for sentence_def in list_sentences]
            for name, list_sentences in self.references.items()})
@@ -0,0 +1,130 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import argparse
16
+ import logging
17
+
18
+ import simulstream
19
+ from simulstream.config import yaml_config
20
+ from simulstream.metrics.readers import LogReader, YamlReferenceReader
21
+ from simulstream.metrics.scorers.latency import LATENCY_SCORER_REGISTRY, LatencyScorer, \
22
+ LatencyScoringSample
23
+
24
+
25
# Configure logging once at import time; `force=True` replaces any handlers that
# imported libraries may have already installed, so this format always applies.
logging.basicConfig(
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    force=True
)
# Module-level logger for this script.
LOGGER = logging.getLogger('simulstream.score_latency')
32
+
33
+
34
def main(scorer_cls: type[LatencyScorer], args: argparse.Namespace):
    """
    Run latency scoring for a single log file.

    Loads the system outputs from the JSONL log, optionally pairs them with
    segment-level references, builds the scoring samples, and computes the latency
    score with the selected scorer. The score (in seconds) is printed on standard
    output.

    Args:
        scorer_cls (type[LatencyScorer]): The latency scorer class to use.
        args (argparse.Namespace): Parsed command-line arguments.
    """
    LOGGER.info(f"Loading evaluation configuration from {args.eval_config}")
    eval_config = yaml_config(args.eval_config)
    LOGGER.info(f"Reading log file ({args.log_file})")
    log_reader = LogReader(eval_config, args.log_file, latency_unit=args.latency_unit)

    LOGGER.info(f"Building latency scorer {args.scorer}")
    scorer = scorer_cls(args)

    LOGGER.info(
        f"Reading audio definition ({args.audio_definition}), and reference ({args.reference})")
    if scorer.requires_reference():
        references = YamlReferenceReader(args.audio_definition, args.reference).references
    else:
        references = None

    output_with_latency = log_reader.final_outputs_and_latencies()

    # Score the audios listed in the references when available, otherwise every
    # audio present in the log.
    audio_files = output_with_latency.keys() if references is None else references.keys()

    samples = [
        LatencyScoringSample(
            audio_file,
            output_with_latency[audio_file],
            None if references is None else references[audio_file])
        for audio_file in audio_files]

    score = scorer.score(samples)
    print(f"Latency scores (in seconds): {score}")
77
+
78
+
79
def cli_main():
    """
    Command-line entry point for latency scoring.

    Computes latency metrics for streaming speech translation or recognition from
    the JSONL log files generated during inference. The metric implementation is
    selected from the pluggable registry (:data:`LATENCY_SCORER_REGISTRY`).

    Typical usage from the command line::

        $ python -m simulstream.metrics.score_latency \\
            --eval-config config/speech-processor.yaml \\
            --log-file metrics.jsonl \\
            --audio-definition segments.yaml \\
            --reference ref.txt \\
            --scorer stream_laal
    """
    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
    base_parser = argparse.ArgumentParser("score_latency")
    base_parser.add_argument(
        "--eval-config", type=str, required=True,
        help="Path to the yaml config file containing information about the tokenizer to be used.")
    base_parser.add_argument(
        "--log-file", type=str, required=True,
        help="Path to the log file with the metrics to be used for the evaluation.")
    base_parser.add_argument(
        "--reference", "-r", type=str,
        help="Path to the textual file containing segment-level references stored line by line.")
    base_parser.add_argument(
        "--audio-definition", "-a", type=str, required=True,
        help="Path to the yaml file containing the segment-level audio information.")
    base_parser.add_argument(
        "--latency-unit", choices=["word", "char"], default="word",
        help="Whether to computed latency based on words or characters. Default: word.")
    base_parser.add_argument("--scorer", choices=LATENCY_SCORER_REGISTRY.keys(), required=True)
    # First pass: parse only the common options, ignoring scorer-specific ones.
    known_args, _ = base_parser.parse_known_args()

    scorer_cls = LATENCY_SCORER_REGISTRY[known_args.scorer]
    # Second pass: extend the parser with scorer-specific arguments and re-parse strictly.
    full_parser = argparse.ArgumentParser(parents=[base_parser], add_help=False)
    scorer_cls.add_arguments(full_parser)

    main(scorer_cls, full_parser.parse_args())


if __name__ == "__main__":
    cli_main()
@@ -0,0 +1,169 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import argparse
16
+ import logging
17
+
18
+ import simulstream
19
+ from simulstream.config import yaml_config
20
+ from simulstream.metrics.readers import LogReader, ReferencesReader, YamlReferenceReader
21
+ from simulstream.metrics.scorers.quality import QUALITY_SCORER_REGISTRY, QualityScorer, \
22
+ QualityScoringSample
23
+
24
+
25
# Configure logging once at import time; `force=True` replaces any handlers that
# imported libraries may have already installed, so this format always applies.
logging.basicConfig(
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    force=True
)
# Module-level logger for this script.
LOGGER = logging.getLogger('simulstream.score_quality')
32
+
33
+
34
def main(scorer_cls: type[QualityScorer], args: argparse.Namespace):
    """
    Main entry point for quality scoring.

    This function loads the evaluation configuration, system hypotheses, and reference/transcript
    data (if required), then constructs scoring samples and computes the final quality score using
    the selected scorer.

    The output is printed on standard output.

    Args:
        scorer_cls (type[QualityScorer]): Class implementing the quality metric.
        args (argparse.Namespace): Parsed command-line arguments.

    Raises:
        AssertionError: If the provided reference/transcript files are inconsistent
            with the audio definition or with each other.
    """
    LOGGER.info(f"Loading evaluation configuration from {args.eval_config}")
    eval_config = yaml_config(args.eval_config)
    log_reader = LogReader(eval_config, args.log_file)

    LOGGER.info(f"Building scorer class for {args.scorer}")
    scorer = scorer_cls(args)

    LOGGER.info("Reading source and reference definition")
    reference_reader = None
    transcripts_reader = None
    if args.audio_definition is not None:
        # With an audio definition, references/transcripts are single files aligned to it.
        if scorer.requires_reference():
            assert len(args.references) == 1, \
                "When audio definition is provided, only one reference file should be provided."
            reference_reader = YamlReferenceReader(args.audio_definition, args.references[0])
        if scorer.requires_source():
            assert len(args.transcripts) == 1, \
                "When audio definition is provided, only one transcript file should be provided."
            transcripts_reader = YamlReferenceReader(args.audio_definition, args.transcripts[0])
    else:
        # Without an audio definition, one reference/transcript file per audio is expected.
        if scorer.requires_reference():
            reference_reader = ReferencesReader(args.references)
        if scorer.requires_source():
            transcripts_reader = ReferencesReader(args.transcripts)

    hypothesis_dictionary = log_reader.final_outputs()
    transcript_dictionary = None
    reference_dictionary = None
    audio_files_to_score = None
    if transcripts_reader is not None:
        transcript_dictionary = transcripts_reader.get_reference_texts()
        audio_files_to_score = transcript_dictionary.keys()
    if reference_reader is not None:
        reference_dictionary = reference_reader.get_reference_texts()
        # References take precedence over transcripts in choosing what to score.
        audio_files_to_score = reference_dictionary.keys()
    if audio_files_to_score is None:
        # Fix: a scorer requiring neither references nor sources previously left this
        # as None and crashed in the loop below; score all hypotheses instead.
        audio_files_to_score = hypothesis_dictionary.keys()

    scoring_samples = []
    for audio_name in audio_files_to_score:
        transcript = None
        if transcript_dictionary is not None:
            transcript = transcript_dictionary[audio_name]
        reference = None
        if reference_dictionary is not None:
            reference = reference_dictionary[audio_name]
        if transcript is not None and reference is not None:
            assert len(reference) == len(transcript), \
                f"Reference ({audio_name}) has mismatched number of target ({len(reference)}) " \
                f"and source lines ({len(transcript)})"

        scoring_samples.append(QualityScoringSample(
            audio_name, hypothesis_dictionary[audio_name], reference, transcript))

    LOGGER.info("Scoring outputs")
    score = scorer.score(scoring_samples)

    print(f"{args.scorer} score: {score}")
104
+
105
+
106
def cli_main():
    """
    Command-line entry point for quality scoring.

    Computes quality-based evaluation metrics on system outputs stored in JSONL log
    files, using pluggable scorers from the :mod:`simulstream.metrics.scorers.quality`
    registry and comparing the outputs against references and/or transcripts.

    It supports:
    - **Reference-based metrics** (e.g., BLEU, COMET).
    - **Source-based metrics** (e.g., reference-free COMET).
    - Hybrid setups when both references and transcripts are available.

    The script can be invoked as a standalone CLI:

        $ python -m simulstream.metrics.score_quality \\
            --eval-config config/speech-processor.yaml \\
            --log-file metrics.jsonl \\
            --references ref.en \\
            --transcripts src.it \\
            --scorer sacrebleu
    """
    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
    base_parser = argparse.ArgumentParser("score_quality")
    base_parser.add_argument(
        "--eval-config", type=str, required=True,
        help="Path to the yaml config file containing information about the tokenizer to be used.")
    base_parser.add_argument(
        "--log-file", type=str, required=True,
        help="Path to the log file with the metrics to be used for the evaluation.")
    base_parser.add_argument(
        "--references", nargs="+", type=str,
        help="Path to the textual files containing references. If `--audio-definition` is "
             "specified, this should be a single file containing all the lines of the audios in "
             "the reference, which should be of the same length of the audio definition. "
             "Otherwise, this should be a list of files, where each contains the lines "
             "corresponding to an audio file.")
    base_parser.add_argument(
        "--transcripts", nargs="+", type=str,
        help="Path to the textual files containing reference transcripts. If `--audio-definition` "
             "is specified, this should be a single file containing all the lines of the audios "
             "in the reference, which should be of the same length of the audio definition. "
             "Otherwise, this should be a list of files, where each contains the lines "
             "corresponding to an audio file.")
    base_parser.add_argument(
        "--audio-definition", "-a", type=str, default=None,
        help="Path to the yaml file containing the segment-level audio information.")
    base_parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
    # First pass: parse only the common options, ignoring scorer-specific ones.
    known_args, _ = base_parser.parse_known_args()

    scorer_cls = QUALITY_SCORER_REGISTRY[known_args.scorer]
    # Second pass: extend the parser with scorer-specific arguments and re-parse strictly.
    full_parser = argparse.ArgumentParser(parents=[base_parser], add_help=False)
    scorer_cls.add_arguments(full_parser)

    main(scorer_cls, full_parser.parse_args())


if __name__ == "__main__":
    cli_main()
File without changes