simulstream 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +47 -0
- simulstream/__init__.py +15 -0
- simulstream/client/__init__.py +0 -0
- simulstream/client/wav_reader_client.py +228 -0
- simulstream/config.py +31 -0
- simulstream/inference.py +170 -0
- simulstream/metrics/__init__.py +0 -0
- simulstream/metrics/detokenizers.py +71 -0
- simulstream/metrics/logger.py +32 -0
- simulstream/metrics/readers.py +348 -0
- simulstream/metrics/score_latency.py +130 -0
- simulstream/metrics/score_quality.py +169 -0
- simulstream/metrics/scorers/__init__.py +0 -0
- simulstream/metrics/scorers/latency/__init__.py +115 -0
- simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
- simulstream/metrics/scorers/latency/stream_laal.py +119 -0
- simulstream/metrics/scorers/quality/__init__.py +132 -0
- simulstream/metrics/scorers/quality/comet.py +57 -0
- simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
- simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
- simulstream/metrics/stats.py +184 -0
- simulstream/server/__init__.py +0 -0
- simulstream/server/http_server.py +95 -0
- simulstream/server/message_processor.py +156 -0
- simulstream/server/speech_processors/__init__.py +173 -0
- simulstream/server/speech_processors/base.py +135 -0
- simulstream/server/speech_processors/base_streamatt.py +320 -0
- simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
- simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
- simulstream/server/speech_processors/incremental_output.py +85 -0
- simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
- simulstream/server/speech_processors/seamless_streamatt.py +268 -0
- simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
- simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
- simulstream/server/speech_processors/vad_wrapper.py +180 -0
- simulstream/server/websocket_server.py +236 -0
- simulstream-0.1.0.dist-info/METADATA +465 -0
- simulstream-0.1.0.dist-info/RECORD +48 -0
- simulstream-0.1.0.dist-info/WHEEL +5 -0
- simulstream-0.1.0.dist-info/entry_points.txt +8 -0
- simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
- simulstream-0.1.0.dist-info/top_level.txt +3 -0
- uts/__init__.py +0 -0
- uts/metrics/__init__.py +0 -0
- uts/metrics/log_reader.py +50 -0
- uts/speech_processors/__init__.py +0 -0
- uts/speech_processors/test_simuleval_wrapper.py +88 -0
- uts/utils.py +5 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import importlib
|
|
17
|
+
import pkgutil
|
|
18
|
+
from abc import abstractmethod
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import List, Optional
|
|
21
|
+
|
|
22
|
+
from simulstream.metrics.readers import OutputWithDelays, ReferenceSentenceDefinition
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
LATENCY_SCORER_REGISTRY = {}


def register_latency_scorer(name):
    """
    Decorator factory that registers a latency scorer class under ``name``.

    Args:
        name (str): The unique identifier for the scorer.

    Raises:
        TypeError: If the decorated class is not a subclass of
            :class:`LatencyScorer`.

    Example:
        >>> @register_latency_scorer("stream_laal")
        ... class StreamLAALScorer(LatencyScorer):
        ...     ...
    """
    def decorator(scorer_cls):
        # Reject anything that does not implement the LatencyScorer contract.
        if not issubclass(scorer_cls, LatencyScorer):
            raise TypeError(
                f"Cannot register {scorer_cls.__name__}: must be a subclass of LatencyScorer")
        LATENCY_SCORER_REGISTRY[name] = scorer_cls
        return scorer_cls

    return decorator
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class LatencyScoringSample:
    """
    Data structure representing a single evaluation sample.

    Attributes:
        audio_name (str): The identifier of the audio file.
        hypothesis (OutputWithDelays): The system-generated hypothesis text together with its
            ideal and computational-aware delays.
        reference (Optional[List[ReferenceSentenceDefinition]]): One or more reference sentences,
            including the text, start time and duration, or ``None`` if not required.
    """
    audio_name: str
    hypothesis: OutputWithDelays
    reference: Optional[List[ReferenceSentenceDefinition]] = None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class LatencyScores:
    """
    Data structure representing a latency score.

    Attributes:
        ideal_latency (float): The latency score in ideal conditions, which do not include
            computational costs.
        computational_aware_latency (Optional[float]): The latency score in computational aware
            conditions, which include computational costs, or ``None`` if not computed.
    """
    ideal_latency: float
    computational_aware_latency: Optional[float] = None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class LatencyScorer:
    """
    Abstract base class for all latency scorers.

    A latency scorer evaluates system hypotheses against references and returns a
    :class:`LatencyScores` object that represents the latency scores.

    Subclasses must implement the abstract methods defined here and should be registered via
    :func:`register_latency_scorer`.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
    """
    def __init__(self, args: argparse.Namespace):
        # Keep the whole namespace so subclasses can read their own options.
        self.args = args

    @abstractmethod
    def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
        """
        Compute latency scores over a list of samples.

        Args:
            samples (List[LatencyScoringSample]): Samples to be evaluated.

        Returns:
            LatencyScores: The computed latency metrics.
        """

    @classmethod
    @abstractmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> None:
        """
        Add scorer-specific arguments to the CLI parser.

        Args:
            parser (argparse.ArgumentParser): The parser to extend.
        """

    @abstractmethod
    def requires_reference(self) -> bool:
        """
        Indicate whether this scorer requires reference sentences.

        Returns:
            bool: True if references are required, False otherwise.
        """
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Eagerly import every module of this package so that each scorer implementation
# runs its @register_latency_scorer decorator and populates the registry.
for loader, name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
    importlib.import_module(name)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
from abc import abstractmethod
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import List
|
|
18
|
+
|
|
19
|
+
from mweralign import mweralign
|
|
20
|
+
|
|
21
|
+
from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
|
|
22
|
+
from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class ResegmentedLatencyScoringSample:
    """
    A sample containing realigned hypotheses and references.

    Attributes:
        audio_name (str): The identifier of the audio file.
        hypothesis (List[OutputWithDelays]): Hypothesis lines after realignment, each carrying
            the delays of its items.
        reference (List[ReferenceSentenceDefinition]): Reference sentences aligned one-to-one
            to the hypothesis lines.
    """
    audio_name: str
    hypothesis: List[OutputWithDelays]
    reference: List[ReferenceSentenceDefinition]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class MWERSegmenterBasedLatencyScorer(LatencyScorer):
    """
    Abstract base class for latency scorers that need system outputs aligned to the
    reference segmentation.

    Before scoring, the hypothesis of each audio is resegmented with the MWER Segmenter
    alignment described in `"Effects of automatic alignment on speech translation metrics"
    <https://aclanthology.org/2025.iwslt-1.7/>`_, and the per-item delays are partitioned
    accordingly.

    Subclasses must implement :meth:`_do_score`, which operates on
    :class:`ResegmentedLatencyScoringSample` instances where hypotheses and references
    are aligned.

    Example:
        >>> class CustomLatencyScorer(MWERSegmenterBasedLatencyScorer):
        ...     def _do_score(self, samples):
        ...         # Compute a custom latency score
        ...         return LatencyScores(...)
    """
    def __init__(self, args):
        super().__init__(args)
        # Unit (e.g. word or char) used to split text when distributing delays.
        self.latency_unit = args.latency_unit

    def requires_reference(self) -> bool:
        # Realignment is impossible without references.
        return True

    @abstractmethod
    def _do_score(self, samples: List[ResegmentedLatencyScoringSample]) -> LatencyScores:
        """
        Compute latency scores on resegmented samples.

        Subclasses must override this method.

        Args:
            samples (List[ResegmentedLatencyScoringSample]): Aligned
                hypothesis–reference pairs with delay information.

        Returns:
            LatencyScores: The computed latency metrics.
        """
        ...

    def _split_delays_by_segmented_text(
            self, delays: List[float], segmented_text: List[str]) -> List[List[float]]:
        """
        Assign delay values to the corresponding segmented hypotheses.

        Args:
            delays (List[float]): Delay values (per token or per char).
            segmented_text (List[str]): Segmented hypothesis strings.

        Returns:
            List[List[float]]: Delays split per segment.
        """
        splits = []
        cursor = 0
        for chunk in segmented_text:
            n_items = len(text_items(chunk, self.latency_unit))
            splits.append(delays[cursor:cursor + n_items])
            cursor += n_items
        # Every delay must have been assigned to exactly one segment.
        assert len(delays) == cursor, \
            f"Index {cursor} should have reached end of delays ({len(delays)})"
        return splits

    def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
        realigned = []
        for sample in samples:
            assert sample.reference is not None, "Cannot realign hypothesis to missing reference"

            reference_block = "\n".join(s.content for s in sample.reference)
            hypo_lines = mweralign.align_texts(
                reference_block, sample.hypothesis.final_text).split("\n")

            assert len(hypo_lines) == len(sample.reference), \
                f"Reference ({sample.audio_name}) has mismatched number of target " \
                f"({len(sample.reference)}) and resegmented lines ({len(hypo_lines)})"

            # Partition both delay sequences following the new segmentation.
            ideal_splits = self._split_delays_by_segmented_text(
                sample.hypothesis.ideal_delays,
                hypo_lines)
            ca_splits = self._split_delays_by_segmented_text(
                sample.hypothesis.computational_aware_delays,
                hypo_lines)
            assert len(ideal_splits) == len(ca_splits)

            hypos_with_delays = [
                OutputWithDelays(line, ideal, ca)
                for line, ideal, ca in zip(hypo_lines, ideal_splits, ca_splits)]

            realigned.append(ResegmentedLatencyScoringSample(
                sample.audio_name,
                hypos_with_delays,
                sample.reference,
            ))
        return self._do_score(realigned)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import logging
|
|
17
|
+
import statistics
|
|
18
|
+
from typing import List
|
|
19
|
+
|
|
20
|
+
from simulstream.metrics.readers import text_items
|
|
21
|
+
from simulstream.metrics.scorers.latency import register_latency_scorer, LatencyScores
|
|
22
|
+
from simulstream.metrics.scorers.latency.mwersegmenter import MWERSegmenterBasedLatencyScorer, \
|
|
23
|
+
ResegmentedLatencyScoringSample
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
LOGGER = logging.getLogger('simulstream.metrics.scorers.latency.stream_laal')


@register_latency_scorer("stream_laal")
class StreamLaal(MWERSegmenterBasedLatencyScorer):
    """
    Computes StreamLAAL version 2.0, as proposed in
    `StreamAtt: Direct Streaming Speech-to-Text Translation with Attention-based
    Audio History Selection <https://aclanthology.org/2024.acl-long.202.pdf>`_.

    The main difference with version 1 is the different segmentation of
    the text (it uses the mwerSegmenter Python package instead of Matusov's executable).
    """

    @staticmethod
    def _sentence_level_laal(
            delays: List[float], source_length: float, target_length: int) -> float:
        """
        Compute Length Adaptive Average Lagging (LAAL) on one sentence, as proposed in
        `CUNI-KIT System for Simultaneous Speech Translation Task at IWSLT 2022
        <https://arxiv.org/abs/2204.06028>`_ and
        `Length-Adaptive Average Lagging for Simultaneous Speech Translation
        <https://arxiv.org/abs/2206.05807>`_.

        It is the original Average Lagging as proposed in
        `Controllable Latency using Prefix-to-Prefix Framework
        <https://arxiv.org/abs/1810.08398>`_
        but is robust to the length difference between the hypothesis and reference.

        The implementation is derived from the one available in SimulEval (see
        ``latency_scorer.py`` in `SimulEval <https://github.com/facebookresearch/SimulEval/>`_).

        Returns:
            float: the latency score on one sentence.
        """
        # Degenerate case: the first item is emitted after the source has ended.
        if delays[0] > source_length:
            return delays[0]

        # gamma maps target positions onto the source time axis; taking the max of
        # hypothesis and reference lengths is the length-adaptive part of LAAL.
        gamma = max(len(delays), target_length) / source_length
        lagging = 0
        n_summed = 0
        for position, delay in enumerate(delays):
            lagging += delay - position / gamma
            n_summed = position + 1
            # Stop accumulating once the whole source has been consumed.
            if delay >= source_length:
                break
        return lagging / n_summed

    def _do_score(self, samples: List[ResegmentedLatencyScoringSample]) -> LatencyScores:
        ideal_scores = []
        ca_scores = []
        n_skipped = 0
        for sample in samples:
            for hypo, ref in zip(sample.hypothesis, sample.reference):
                # Offset delays with respect to the start of the reference utterance.
                offset = ref.start_time
                ideal_delays = [delay - offset for delay in hypo.ideal_delays]
                ca_delays = [delay - offset for delay in hypo.computational_aware_delays]
                assert len(ideal_delays) == len(ca_delays)

                ref_len = len(text_items(ref.content, self.latency_unit))

                if len(ideal_delays) > 0:
                    ideal_scores.append(
                        self._sentence_level_laal(ideal_delays, ref.duration, ref_len))
                    ca_scores.append(
                        self._sentence_level_laal(ca_delays, ref.duration, ref_len))
                else:
                    n_skipped += 1

        if n_skipped > 0:
            LOGGER.warning(
                f"{n_skipped} sentences have been skipped in LAAL computation as they "
                "were empty")
        return LatencyScores(
            statistics.mean(ideal_scores),
            statistics.mean(ca_scores))

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> None:
        pass
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import importlib
|
|
17
|
+
import pkgutil
|
|
18
|
+
from abc import abstractmethod
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import List, Optional
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
QUALITY_SCORER_REGISTRY = {}


def register_quality_scorer(name):
    """
    Decorator factory that registers a quality scorer class under ``name``.

    Args:
        name (str): The unique identifier for the scorer.

    Raises:
        TypeError: If the decorated class is not a subclass of
            :class:`QualityScorer`.

    Example:
        >>> @register_quality_scorer("bleu")
        ... class BLEUScorer(QualityScorer):
        ...     ...
    """
    def decorator(scorer_cls):
        # Reject anything that does not implement the QualityScorer contract.
        if not issubclass(scorer_cls, QualityScorer):
            raise TypeError(
                f"Cannot register {scorer_cls.__name__}: must be a subclass of QualityScorer")
        QUALITY_SCORER_REGISTRY[name] = scorer_cls
        return scorer_cls

    return decorator
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class QualityScoringSample:
    """
    Data structure representing a single evaluation sample.

    Attributes:
        audio_name (str): The identifier of the audio file.
        hypothesis (str): The system-generated hypothesis text.
        reference (Optional[List[str]]): One or more reference translations, or ``None`` if not
            required by the scorer.
        source (Optional[List[str]]): The source transcription or text, or ``None`` if not
            required by the scorer.
    """
    audio_name: str
    hypothesis: str
    reference: Optional[List[str]] = None
    source: Optional[List[str]] = None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class QualityScorer:
    """
    Abstract base class for all quality scorers.

    A quality scorer evaluates system hypotheses against references and/or source sentences
    and returns a numerical score.

    Subclasses must implement the abstract methods defined here and should be registered via
    :func:`register_quality_scorer`.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
    """
    def __init__(self, args: argparse.Namespace):
        # Keep the whole namespace so subclasses can read their own options.
        self.args = args

    @abstractmethod
    def score(self, samples: List[QualityScoringSample]) -> float:
        """
        Compute a quality score over a list of samples.

        Args:
            samples (List[QualityScoringSample]): Samples to be evaluated.

        Returns:
            float: The computed quality score.
        """

    @classmethod
    @abstractmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> None:
        """
        Add scorer-specific arguments to the CLI parser.

        Args:
            parser (argparse.ArgumentParser): The parser to extend.
        """

    @abstractmethod
    def requires_source(self) -> bool:
        """
        Indicate whether this scorer requires the source text.

        Returns:
            bool: True if source sentences are required, False otherwise.
        """

    @abstractmethod
    def requires_reference(self) -> bool:
        """
        Indicate whether this scorer requires reference translations.

        Returns:
            bool: True if references are required, False otherwise.
        """
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Eagerly import every module of this package so that each scorer implementation
# runs its @register_quality_scorer decorator and populates the registry.
for loader, name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
    importlib.import_module(name)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import sys
|
|
17
|
+
from typing import List
|
|
18
|
+
|
|
19
|
+
from simulstream.metrics.scorers.quality import register_quality_scorer
|
|
20
|
+
from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
|
|
21
|
+
ResegmentedQualityScoringSample
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from comet import download_model, load_from_checkpoint
|
|
25
|
+
except ImportError:
|
|
26
|
+
sys.exit("Please install comet first with `pip install unbabel-comet`.")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@register_quality_scorer("comet")
class CometScorer(MWERSegmenterBasedQualityScorer):
    """
    Quality scorer based on the COMET neural metric.

    Hypotheses are realigned to the references through the MWER Segmenter (see
    :class:`MWERSegmenterBasedQualityScorer`) and then scored with the COMET model
    selected via ``--model``.

    Args:
        args (argparse.Namespace): Parsed command-line arguments; must provide
            ``model`` and ``batch_size``.
    """
    def __init__(self, args: argparse.Namespace):
        super().__init__(args)
        self.batch_size = args.batch_size
        # Download (or fetch from cache) the checkpoint and load the model in eval mode.
        checkpoint_path = download_model(args.model)
        self.model = load_from_checkpoint(checkpoint_path)
        self.model.eval()

    def _do_score(self, samples: List[ResegmentedQualityScoringSample]) -> float:
        # COMET expects one {"src", "mt", "ref"} triplet per aligned segment.
        triplets = [
            {
                "src": src.strip(),
                "mt": hyp.strip(),
                "ref": ref.strip()
            }
            for sample in samples
            for hyp, ref, src in zip(sample.hypothesis, sample.reference, sample.source)
        ]
        prediction = self.model.predict(triplets, batch_size=self.batch_size)
        return prediction.system_score

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> None:
        parser.add_argument("--model", type=str, default="Unbabel/wmt22-comet-da")
        parser.add_argument("--batch-size", type=int, default=16)

    def requires_source(self) -> bool:
        # COMET is a source-aware metric.
        return True
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
from abc import abstractmethod
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import List, Optional
|
|
18
|
+
|
|
19
|
+
from mweralign import mweralign
|
|
20
|
+
|
|
21
|
+
from simulstream.metrics.scorers.quality import QualityScorer, QualityScoringSample
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ResegmentedQualityScoringSample:
    """
    A sample containing realigned hypotheses and references.

    Attributes:
        audio_name (str): The identifier of the audio file.
        hypothesis (List[str]): Hypothesis lines after realignment.
        reference (List[str]): Reference lines aligned one-to-one to the hypothesis.
        source (Optional[List[str]]): Source text, or ``None`` if not available.
    """
    audio_name: str
    hypothesis: List[str]
    reference: List[str]
    source: Optional[List[str]] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class MWERSegmenterBasedQualityScorer(QualityScorer):
    """
    Abstract base class for quality scorers that need system outputs aligned to the
    reference segmentation.

    Before scoring, the hypothesis of each audio is resegmented with the MWER Segmenter
    alignment described in `"Effects of automatic alignment on speech translation metrics"
    <https://aclanthology.org/2025.iwslt-1.7/>`_, so that hypothesis and reference lines
    correspond one-to-one.

    Subclasses must implement :meth:`_do_score`, which receives
    :class:`ResegmentedQualityScoringSample` instances, where output and references are aligned.

    Example:
        >>> class CustomQualityScorer(MWERSegmenterBasedQualityScorer):
        ...     def _do_score(self, samples):
        ...         # Compute a custom quality score
        ...         return ...
    """
    def requires_reference(self) -> bool:
        # Realignment is impossible without references.
        return True

    @abstractmethod
    def _do_score(self, samples: List[ResegmentedQualityScoringSample]) -> float:
        """
        Compute the final score on resegmented samples.

        This method must be implemented by subclasses.

        Args:
            samples (List[ResegmentedQualityScoringSample]): The aligned
                hypothesis–reference pairs, plus optional sources.

        Returns:
            float: The computed score.
        """
        ...

    def score(self, samples: List[QualityScoringSample]) -> float:
        realigned = []
        for sample in samples:
            assert sample.reference is not None, "Cannot realign hypothesis to missing reference"

            joined_refs = "\n".join(sample.reference)
            hypo_lines = mweralign.align_texts(joined_refs, sample.hypothesis).split("\n")

            assert len(sample.reference) == len(hypo_lines), \
                f"Reference ({sample.audio_name}) has mismatched number of target " \
                f"({len(sample.reference)}) and resegmented lines ({len(hypo_lines)})"

            realigned.append(ResegmentedQualityScoringSample(
                sample.audio_name,
                hypo_lines,
                sample.reference,
                sample.source
            ))
        return self._do_score(realigned)
|