simulstream 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +47 -0
- simulstream/__init__.py +15 -0
- simulstream/client/__init__.py +0 -0
- simulstream/client/wav_reader_client.py +228 -0
- simulstream/config.py +31 -0
- simulstream/inference.py +170 -0
- simulstream/metrics/__init__.py +0 -0
- simulstream/metrics/detokenizers.py +71 -0
- simulstream/metrics/logger.py +32 -0
- simulstream/metrics/readers.py +348 -0
- simulstream/metrics/score_latency.py +130 -0
- simulstream/metrics/score_quality.py +169 -0
- simulstream/metrics/scorers/__init__.py +0 -0
- simulstream/metrics/scorers/latency/__init__.py +115 -0
- simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
- simulstream/metrics/scorers/latency/stream_laal.py +119 -0
- simulstream/metrics/scorers/quality/__init__.py +132 -0
- simulstream/metrics/scorers/quality/comet.py +57 -0
- simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
- simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
- simulstream/metrics/stats.py +184 -0
- simulstream/server/__init__.py +0 -0
- simulstream/server/http_server.py +95 -0
- simulstream/server/message_processor.py +156 -0
- simulstream/server/speech_processors/__init__.py +173 -0
- simulstream/server/speech_processors/base.py +135 -0
- simulstream/server/speech_processors/base_streamatt.py +320 -0
- simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
- simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
- simulstream/server/speech_processors/incremental_output.py +85 -0
- simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
- simulstream/server/speech_processors/seamless_streamatt.py +268 -0
- simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
- simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
- simulstream/server/speech_processors/vad_wrapper.py +180 -0
- simulstream/server/websocket_server.py +236 -0
- simulstream-0.1.0.dist-info/METADATA +465 -0
- simulstream-0.1.0.dist-info/RECORD +48 -0
- simulstream-0.1.0.dist-info/WHEEL +5 -0
- simulstream-0.1.0.dist-info/entry_points.txt +8 -0
- simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
- simulstream-0.1.0.dist-info/top_level.txt +3 -0
- uts/__init__.py +0 -0
- uts/metrics/__init__.py +0 -0
- uts/metrics/log_reader.py +50 -0
- uts/speech_processors/__init__.py +0 -0
- uts/speech_processors/test_simuleval_wrapper.py +88 -0
- uts/utils.py +5 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
import logging
import sys
from typing import List

from simulstream.metrics.scorers.quality import register_quality_scorer
from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
    ResegmentedQualityScoringSample

# Guard the sacrebleu import so users get an actionable message instead of a bare
# ImportError. `from sacrebleu import BLEU` must live inside the guard too:
# previously it ran *before* the try/except, so the guard could never fire.
try:
    import sacrebleu
    from sacrebleu import BLEU
except ImportError:
    # Message previously said "comet" (copy-paste from the COMET scorer).
    sys.exit("Please install sacrebleu first with `pip install sacrebleu`.")


# Logger previously pointed at 'simulstream.metrics.scorers.latency.stream_laal'
# (copy-paste); use this module's own name so records are attributed correctly.
LOGGER = logging.getLogger('simulstream.metrics.scorers.quality.sacrebleu')
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@register_quality_scorer("sacrebleu")
class SacreBLEUScorer(MWERSegmenterBasedQualityScorer):
    """
    Quality scorer computing corpus-level BLEU with sacreBLEU on
    mWERSegmenter-resegmented samples.
    """
    def __init__(self, args: argparse.Namespace):
        super().__init__(args)
        self.bleu = BLEU(tokenize=args.tokenizer)

    def _do_score(self, samples: List[ResegmentedQualityScoringSample]) -> float:
        """Flatten all samples into parallel corpora and return the BLEU score."""
        hypotheses: List[str] = []
        references: List[str] = []
        for sample in samples:
            hypotheses += sample.hypothesis
            references += sample.reference
        score = self.bleu.corpus_score(hypotheses, [references])
        LOGGER.info(f"SacreBLEU signature: {self.bleu.get_signature()}")
        LOGGER.info(f"SacreBLEU detailed score: {score}")
        return score.score

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> None:
        """Register the sacreBLEU tokenizer choice on the CLI parser."""
        parser.add_argument(
            "--tokenizer", choices=sacrebleu.metrics.METRICS['BLEU'].TOKENIZERS,
            default=sacrebleu.metrics.METRICS['BLEU'].TOKENIZER_DEFAULT)

    def requires_source(self) -> bool:
        # BLEU only compares hypotheses against references.
        return False
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
from abc import abstractmethod
|
|
19
|
+
|
|
20
|
+
import simulstream
|
|
21
|
+
from simulstream.config import yaml_config
|
|
22
|
+
from simulstream.metrics.readers import LogReader, text_items
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
logging.basicConfig(
|
|
26
|
+
format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
|
|
27
|
+
datefmt='%Y-%m-%d %H:%M:%S',
|
|
28
|
+
level=logging.INFO,
|
|
29
|
+
)
|
|
30
|
+
LOGGER = logging.getLogger('simulstream.stats')
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
from abc import ABC, abstractmethod


class Stats(ABC):
    """
    Abstract base class for defining evaluation statistics.

    Inherits from :class:`abc.ABC` so that the ``@abstractmethod`` decorators are
    enforced: previously the class was a plain ``object`` subclass, which made the
    decorators inert and allowed incomplete subclasses to be instantiated silently.

    Subclasses must implement:
    - :meth:`name`: unique identifier of the statistic.
    - :meth:`description`: a human-readable explanation.
    - :meth:`compute`: logic to compute the metric from a :class:`LogReader`.
    """
    @abstractmethod
    def name(self) -> str:
        """The unique name of the statistic."""
        ...

    @abstractmethod
    def description(self) -> str:
        """The human-readable explanation of the statistic."""
        ...

    @abstractmethod
    def compute(self, log_reader: "LogReader") -> float:
        """
        Compute the value of the statistic.

        Args:
            log_reader (LogReader): Reader object encapsulating log data.

        Returns:
            float: The computed value of the statistic.
        """
        ...
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class NormalizedErasure(Stats):
    """
    Compute the **Normalized Erasure** metric.

    This measures the amount of flickering in retranslation, as defined in
    `Arivazhagan et al., "Re-translation versus Streaming for Simultaneous Translation"
    IWSLT 2020 <https://aclanthology.org/2020.iwslt-1.27/>`_.

    It is defined as the ratio:

    .. math::

        \\text{Normalized Erasure} =
        \\frac{\\text{# Deleted Tokens}}{\\text{# Final Generated Tokens}}
    """

    def name(self) -> str:
        return "normalized_erasure"

    def description(self) -> str:
        return "Normalized erasure, defined in https://aclanthology.org/2020.iwslt-1.27/, " \
               "measures flickering in retranslation. It is defined as the ratio between the " \
               "number of tokens that have been deleted and the number of final generated tokens."

    def compute(self, log_reader: LogReader) -> float:
        # Count tokens (words or characters, per latency_unit) over every final output,
        # then normalize the deletions by that total.
        final_token_count = sum(
            len(text_items(final_text, latency_unit=log_reader.latency_unit))
            for final_text in log_reader.final_outputs().values())
        return log_reader.num_deleted_tokens() / final_token_count
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class RealTimeFactor(Stats):
    """
    Compute the **Real Time Factor**.

    This measures how many seconds of computation are required on average
    for each second of input audio.

    Values greater than 1 indicate that the system is slower than real time
    and cannot process input before the next audio chunk arrives.
    """
    def name(self) -> str:
        return "real_time_factor"

    def description(self) -> str:
        return "The Real Time Factor measures the average computational cost, ie. time in " \
               "seconds spent in computation, for each input audio second. Values higher than 1 " \
               "mean that the system is not able to process the input in time before the next " \
               "input arrives."

    def compute(self, log_reader: LogReader) -> float:
        # Single pass over the per-audio logs: the last entry of each log carries the
        # cumulative audio duration, while computation times are summed across entries.
        audio_seconds = 0.0
        computation_seconds = 0.0
        for logs in log_reader.outputs_by_audio.values():
            audio_seconds += logs[-1]["total_audio_processed"]
            computation_seconds += sum(entry["computation_time"] for entry in logs)
        return computation_seconds / audio_seconds
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def main(args: argparse.Namespace):
    """
    Main entry point for computing statistics.

    Loads the evaluation configuration and log file, computes all defined
    statistics, and prints them in JSON format.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
    """
    LOGGER.info(f"Loading evaluation configuration from {args.eval_config}")
    eval_config = yaml_config(args.eval_config)
    LOGGER.info(f"Loading evaluation log file from {args.log_file}")
    log_reader = LogReader(eval_config, args.log_file, latency_unit=args.latency_unit)

    LOGGER.info("Computing stats")
    stats = {}
    for stat in (NormalizedErasure(), RealTimeFactor()):
        stats[stat.name()] = {
            "description": stat.description(),
            "value": stat.compute(log_reader),
        }
    print(f"Stats: {json.dumps(stats, indent=4)}")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def cli_main():
    """
    Module for computing evaluation statistics from Simulstream logs.

    This script provides a CLI interface to compute metrics that describe the behavior of
    streaming systems. Metrics are computed from JSONL log files generated during evaluation and
    include:

    - **Normalized Erasure**: measures flickering in retranslation processors.
    - **Real Time Factor**: measures average computation time per second of audio.

    The output is printed on standard output in JSON format.

    Typical usage from the command line:

        $ python -m simulstream.metrics.stats --eval-config config/speech_processor.yaml \\
            --log-file metrics.jsonl
    """
    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
    parser = argparse.ArgumentParser("stats")
    parser.add_argument(
        "--eval-config", type=str, required=True,
        help="Path to the yaml config file containing information about the tokenizer to be used.")
    parser.add_argument(
        "--log-file", type=str, required=True,
        help="Path to the log file with the metrics to be used for the evaluation.")
    parser.add_argument(
        "--latency-unit", choices=["word", "char"], default="word",
        # Fixed help typo: "Whether to computed stats" -> "Whether to compute stats".
        help="Whether to compute stats based on words or characters. Default: word.")

    args = parser.parse_args()
    main(args)


if __name__ == "__main__":
    cli_main()
|
|
File without changes
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import logging
|
|
17
|
+
from functools import partial
|
|
18
|
+
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
|
19
|
+
|
|
20
|
+
import simulstream
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
logging.basicConfig(
|
|
24
|
+
format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
|
|
25
|
+
datefmt='%Y-%m-%d %H:%M:%S',
|
|
26
|
+
level=logging.INFO,
|
|
27
|
+
)
|
|
28
|
+
LOGGER = logging.getLogger('simulstream.http_server')
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CustomHandler(SimpleHTTPRequestHandler):
    """
    Static-file handler that additionally serves the server configuration.

    ``GET /config.yaml`` is answered with the contents of the configured YAML
    file; every other path falls through to the default static-file behavior
    of :class:`SimpleHTTPRequestHandler`.
    """
    def __init__(self, *args, config=None, **kwargs):
        # Store the config path before the parent constructor runs, since the
        # parent constructor already handles the request.
        self.config = config
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path != "/config.yaml":
            super().do_GET()
            return
        # Read the configured YAML file and return it verbatim.
        with open(self.config) as config_file:
            payload = config_file.read()
        self.send_response(200)
        self.send_header("Content-type", "text/yaml; charset=utf-8")
        self.end_headers()
        self.wfile.write(payload.encode("utf-8"))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def cli_main():
    """
    Simulstream HTTP server command-line interface (CLI) entry point.

    This function parses command-line arguments, then serves the web demo directory (and the
    server configuration at ``/config.yaml``) synchronously until interrupted.

    Example usage::

        $ python http_server.py --config config/server.yaml --directory webdemo

    Command-line arguments:

    - ``bind`` (str): IP/address on which to serve [default: 127.0.0.1].
    - ``port`` (int): Port on which to serve [default: 8000].
    - ``--config`` (str): Path to the server configuration file.
    - ``--directory`` (str): Path to the directory containing the HTML web demo
      [default: ./webdemo].

    .. note::
        The server currently does not support secure connection through HTTPS
    """
    LOGGER.info(f"HTTP server version: {simulstream.__version__}")
    parser = argparse.ArgumentParser(description="Simulstream http.server")
    parser.add_argument(
        "--bind", "-b", default="127.0.0.1",
        help="Specify alternate bind address [default: 127.0.0.1]")
    parser.add_argument(
        "--port", "-p", type=int, default=8000,
        help="Specify alternate port [default: 8000]")
    parser.add_argument(
        "--config", "-c", required=True,
        help="Path to configuration file (YAML)")
    parser.add_argument(
        "--directory", "-d", default="./webdemo",
        help="Path to the directory containing the HTML web demo [default: ./webdemo]")
    args = parser.parse_args()

    # partial pre-binds the handler's extra kwargs; ThreadingHTTPServer instantiates
    # one handler per incoming request.
    custom_handler = partial(CustomHandler, config=args.config, directory=args.directory)
    httpd = ThreadingHTTPServer((args.bind, args.port), custom_handler)
    LOGGER.info(f"Serving directory {args.directory}")
    LOGGER.info(f"Serving on http://{args.bind}:{args.port}")
    httpd.serve_forever()


if __name__ == "__main__":
    cli_main()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
import time
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
import librosa
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
from simulstream.metrics.logger import METRICS_LOGGER
|
|
24
|
+
from simulstream.server.speech_processors import SpeechProcessor, SAMPLE_RATE
|
|
25
|
+
from simulstream.server.speech_processors.incremental_output import merge_incremental_outputs, \
|
|
26
|
+
IncrementalOutput
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
LOGGER = logging.getLogger('simulstream.message_processor')
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class MessageProcessor:
    """
    This class is responsible for processing the messages incoming from a client, which include
    control messages (e.g., configurations about languages to use, or signal that the stream is
    over).
    """
    def __init__(self, client_id: int, speech_processor: SpeechProcessor):
        # Raw PCM bytes accumulated until a full speech chunk is available.
        self.client_buffer = b''
        # Total seconds of audio handed to the speech processor so far.
        self.processed_audio_seconds = 0
        # Client sample rate; may be overridden via process_metadata.
        self.sample_rate = SAMPLE_RATE
        self.client_id = client_id
        self.speech_processor = speech_processor

    def process_speech(self, speech_data: bytes) -> Optional[IncrementalOutput]:
        """
        Process an audio chunk and return incremental transcription/translation.
        Namely, it receives and buffers raw audio chunks (``bytes``) and processes
        audio incrementally with the configured
        :class:`~simulstream.server.speech_processors.SpeechProcessor`.

        Args:
            speech_data (bytes): Raw PCM audio bytes (16-bit little endian).

        Returns:
            IncrementalOutput: incremental processing results, if any. None otherwise.
        """
        self.client_buffer += speech_data
        # we have SAMPLE_RATE * 2 bytes (int16) samples every second
        buffer_len_seconds = len(self.client_buffer) / 2 / self.sample_rate
        if buffer_len_seconds >= self.speech_processor.speech_chunk_size:
            self.processed_audio_seconds += buffer_len_seconds
            start_time = time.time()
            incremental_output = self._run_speech_processor()
            end_time = time.time()
            # One JSONL record per processed chunk; consumed by the metrics readers.
            METRICS_LOGGER.info(json.dumps({
                "id": self.client_id,
                "total_audio_processed": self.processed_audio_seconds,
                "computation_time": end_time - start_time,
                "generated_tokens": incremental_output.new_tokens,
                "deleted_tokens": incremental_output.deleted_tokens,
            }))
            return incremental_output
        else:
            # Not enough buffered audio yet: keep accumulating.
            return None

    def _run_speech_processor(self) -> IncrementalOutput:
        """
        This function converts raw ``int16`` PCM audio to normalized ``float32``,
        resamples it if necessary to :data:`~simulstream.server.speech_processors.SAMPLE_RATE`,
        and forwards it to the given
        :class:`~simulstream.server.speech_processors.SpeechProcessor`.
        The internal buffer is emptied once the audio has been forwarded.
        """
        int16_waveform = np.frombuffer(self.client_buffer, dtype=np.int16)
        # Normalize int16 samples to [-1.0, 1.0].
        float32_waveform = int16_waveform.astype(np.float32) / 2 ** 15
        if self.sample_rate != SAMPLE_RATE:
            float32_waveform = librosa.resample(
                float32_waveform, orig_sr=self.sample_rate, target_sr=SAMPLE_RATE)
        incremental_output = self.speech_processor.process_chunk(float32_waveform)
        self.client_buffer = b''
        return incremental_output

    def process_metadata(self, metadata: dict):
        """
        Takes a dictionary of metadata regarding the incoming speech and desired output, and
        interacts with the configured speech_processor to set it up accordingly.

        Args:
            metadata (dict): Dictionary of metadata regarding the incoming speech.
                Recognized keys: ``sample_rate``, ``target_lang``, ``source_lang``,
                ``metrics_metadata``; all are optional.
        """
        if 'sample_rate' in metadata:
            self.sample_rate = int(metadata['sample_rate'])
        if 'target_lang' in metadata:
            self.speech_processor.set_target_language(metadata["target_lang"])
            LOGGER.debug(
                f"Client {self.client_id} target language set to: "
                f"{metadata['target_lang']}")
        if 'source_lang' in metadata:
            self.speech_processor.set_source_language(metadata["source_lang"])
            LOGGER.debug(
                f"Client {self.client_id} source language set to: {metadata['source_lang']}")
        if 'metrics_metadata' in metadata:
            # Forwarded opaquely to the metrics log for offline evaluation.
            METRICS_LOGGER.info(json.dumps({
                "id": self.client_id,
                "metadata": metadata["metrics_metadata"]
            }))
            LOGGER.debug(
                f"Logged client {self.client_id} metrics metadata: {metadata['metrics_metadata']}")

    def end_of_stream(self) -> IncrementalOutput:
        """
        Performs the last operations to conclude the processing of the stream of audio by the
        speech processor and cleans up everything to be ready for the next stream.

        Returns:
            IncrementalOutput: last output at the end of the stream.
        """
        outputs = []
        start_time = time.time()
        if self.client_buffer:
            # process remaining audio after last chunk
            self.processed_audio_seconds += len(self.client_buffer) / 2 / self.sample_rate
            outputs.append(self._run_speech_processor())

        outputs.append(self.speech_processor.end_of_stream())
        incremental_output = merge_incremental_outputs(
            outputs, self.speech_processor.tokens_to_string)
        end_time = time.time()
        METRICS_LOGGER.info(json.dumps({
            "id": self.client_id,
            "total_audio_processed": self.processed_audio_seconds,
            "computation_time": end_time - start_time,
            "generated_tokens": incremental_output.new_tokens,
            "deleted_tokens": incremental_output.deleted_tokens,
        }))
        self.clear()
        return incremental_output

    def clear(self):
        """
        Clear the internal states to be ready for a new input stream.
        """
        self.speech_processor.clear()
        self.client_buffer = b''
        self.processed_audio_seconds = 0
        self.sample_rate = SAMPLE_RATE
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Copyright 2025 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import importlib
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from types import SimpleNamespace
|
|
18
|
+
from typing import List, Any
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
from simulstream.server.speech_processors.incremental_output import IncrementalOutput
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Audio format expected by all speech processors:
# mono (1 channel), 16-bit PCM (2 bytes per sample), sampled at 16 kHz.
CHANNELS = 1
SAMPLE_WIDTH = 2
SAMPLE_RATE = 16_000
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SpeechProcessor(ABC):
    """
    Abstract base class for speech processors.

    A speech processor consumes normalized audio chunks and incrementally emits
    text. Concrete implementations must provide model loading, chunk processing,
    language configuration, end-of-stream handling, token detokenization, and
    state reset.
    """

    def __init__(self, config: SimpleNamespace):
        """
        Store the processor configuration.

        Args:
            config (SimpleNamespace): Configuration loaded from a YAML file.
        """
        self.config = config

    @property
    def speech_chunk_size(self) -> float:
        """Size, in seconds, of the speech chunks this processor consumes."""
        return self.config.speech_chunk_size

    @classmethod
    @abstractmethod
    def load_model(cls, config: SimpleNamespace):
        """
        Load and initialize the underlying speech model.

        Args:
            config (SimpleNamespace): Configuration of the speech processor.
        """
        ...

    @abstractmethod
    def process_chunk(self, waveform: np.float32) -> IncrementalOutput:
        """
        Process one chunk of waveform and produce incremental output.

        Args:
            waveform (np.float32): A 1D NumPy array of the audio chunk. The array is
                PCM audio normalized to the range ``[-1.0, 1.0]`` sampled at
                :attr:`simulstream.server.speech_processors.SAMPLE_RATE`.

        Returns:
            IncrementalOutput: The incremental output (new and deleted tokens/strings).
        """
        ...

    @abstractmethod
    def set_source_language(self, language: str) -> None:
        """
        Configure the language of the incoming speech.

        Args:
            language (str): Language code (e.g., ``"en"``, ``"it"``).
        """
        ...

    @abstractmethod
    def set_target_language(self, language: str) -> None:
        """
        Configure the language of the produced text (for translation).

        Args:
            language (str): Language code (e.g., ``"en"``, ``"it"``).
        """
        ...

    @abstractmethod
    def end_of_stream(self) -> IncrementalOutput:
        """
        Conclude the output once audio chunk processing is over; implementations may
        emit their last hypotheses here.

        Returns:
            IncrementalOutput: The incremental output (new and deleted tokens/strings).
        """
        ...

    @abstractmethod
    def tokens_to_string(self, tokens: List[str]) -> str:
        """
        Convert a token sequence into a human-readable string.

        Returns:
            str: The textual representation of the tokens.
        """
        ...

    @abstractmethod
    def clear(self) -> None:
        """
        Reset internal state (e.g., cached audio and/or token history) in
        preparation for a new stream or conversation.
        """
        ...
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def build_speech_processor(speech_processor_config: SimpleNamespace) -> SpeechProcessor:
    """
    Create a ready-to-use speech processor from its configuration.

    The ``type`` field of the configuration must hold the fully-qualified class
    name (e.g. ``"simulstream.server.speech_processors.MyProcessor"``).

    Args:
        speech_processor_config (SimpleNamespace): Configuration for the speech processor.

    Returns:
        SpeechProcessor: An instance of the configured speech processor.

    Raises:
        AssertionError: If the specified class is not a subclass of SpeechProcessor.
    """
    processor_cls = speech_processor_class_load(speech_processor_config.type)
    # Model loading happens once at class level before the instance is created.
    processor_cls.load_model(speech_processor_config)
    return processor_cls(speech_processor_config)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def speech_processor_class_load(speech_processor_class_string: str) -> type[SpeechProcessor]:
    """
    Resolve a fully-qualified class name into a SpeechProcessor subclass.

    Args:
        speech_processor_class_string (str): Full name of the speech processor class to load.

    Returns:
        type[SpeechProcessor]: A class object for the speech processor class.

    Raises:
        AssertionError: If the specified class is not a subclass of SpeechProcessor.
    """
    loaded_cls = class_load(speech_processor_class_string)
    assert issubclass(loaded_cls, SpeechProcessor), \
        f"{speech_processor_class_string} must be a subclass of SpeechProcessor"
    return loaded_cls
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def class_load(class_string: str) -> type[Any]:
    """Resolve a fully-qualified ``package.module.Name`` string to the named object."""
    module_name, _, attribute_name = class_string.rpartition('.')
    return getattr(importlib.import_module(module_name), attribute_name)
|