simulstream 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. docs/source/conf.py +47 -0
  2. simulstream/__init__.py +15 -0
  3. simulstream/client/__init__.py +0 -0
  4. simulstream/client/wav_reader_client.py +228 -0
  5. simulstream/config.py +31 -0
  6. simulstream/inference.py +170 -0
  7. simulstream/metrics/__init__.py +0 -0
  8. simulstream/metrics/detokenizers.py +71 -0
  9. simulstream/metrics/logger.py +32 -0
  10. simulstream/metrics/readers.py +348 -0
  11. simulstream/metrics/score_latency.py +130 -0
  12. simulstream/metrics/score_quality.py +169 -0
  13. simulstream/metrics/scorers/__init__.py +0 -0
  14. simulstream/metrics/scorers/latency/__init__.py +115 -0
  15. simulstream/metrics/scorers/latency/mwersegmenter.py +136 -0
  16. simulstream/metrics/scorers/latency/stream_laal.py +119 -0
  17. simulstream/metrics/scorers/quality/__init__.py +132 -0
  18. simulstream/metrics/scorers/quality/comet.py +57 -0
  19. simulstream/metrics/scorers/quality/mwersegmenter.py +93 -0
  20. simulstream/metrics/scorers/quality/sacrebleu.py +59 -0
  21. simulstream/metrics/stats.py +184 -0
  22. simulstream/server/__init__.py +0 -0
  23. simulstream/server/http_server.py +95 -0
  24. simulstream/server/message_processor.py +156 -0
  25. simulstream/server/speech_processors/__init__.py +173 -0
  26. simulstream/server/speech_processors/base.py +135 -0
  27. simulstream/server/speech_processors/base_streamatt.py +320 -0
  28. simulstream/server/speech_processors/canary_sliding_window_retranslation.py +73 -0
  29. simulstream/server/speech_processors/hf_sliding_window_retranslation.py +87 -0
  30. simulstream/server/speech_processors/incremental_output.py +85 -0
  31. simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +84 -0
  32. simulstream/server/speech_processors/seamless_streamatt.py +268 -0
  33. simulstream/server/speech_processors/simuleval_wrapper.py +165 -0
  34. simulstream/server/speech_processors/sliding_window_retranslation.py +135 -0
  35. simulstream/server/speech_processors/vad_wrapper.py +180 -0
  36. simulstream/server/websocket_server.py +236 -0
  37. simulstream-0.1.0.dist-info/METADATA +465 -0
  38. simulstream-0.1.0.dist-info/RECORD +48 -0
  39. simulstream-0.1.0.dist-info/WHEEL +5 -0
  40. simulstream-0.1.0.dist-info/entry_points.txt +8 -0
  41. simulstream-0.1.0.dist-info/licenses/LICENSE +201 -0
  42. simulstream-0.1.0.dist-info/top_level.txt +3 -0
  43. uts/__init__.py +0 -0
  44. uts/metrics/__init__.py +0 -0
  45. uts/metrics/log_reader.py +50 -0
  46. uts/speech_processors/__init__.py +0 -0
  47. uts/speech_processors/test_simuleval_wrapper.py +88 -0
  48. uts/utils.py +5 -0
@@ -0,0 +1,59 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import argparse
16
+ import logging
17
+ import sys
18
+ from typing import List
19
+
20
from simulstream.metrics.scorers.quality import register_quality_scorer
from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
    ResegmentedQualityScoringSample

# Everything coming from sacrebleu is imported inside the guard: importing ``BLEU`` before the
# ``try`` would raise a raw ImportError and the friendly message below would never be shown.
try:
    import sacrebleu
    from sacrebleu import BLEU
except ImportError:
    sys.exit("Please install sacrebleu first with `pip install sacrebleu`.")


# Logger named after this module (simulstream/metrics/scorers/quality/sacrebleu.py).
LOGGER = logging.getLogger('simulstream.metrics.scorers.quality.sacrebleu')
33
+
34
+
35
@register_quality_scorer("sacrebleu")
class SacreBLEUScorer(MWERSegmenterBasedQualityScorer):
    """
    Quality scorer, registered as ``"sacrebleu"``, that returns the corpus-level BLEU score
    computed with sacreBLEU over all the hypotheses and references of the scored samples.
    """

    def __init__(self, args: argparse.Namespace):
        super().__init__(args)
        # The tokenizer is selected through the ``--tokenizer`` option (see add_arguments).
        self.bleu = BLEU(tokenize=args.tokenizer)

    def _do_score(self, samples: List[ResegmentedQualityScoringSample]) -> float:
        """
        Compute corpus BLEU over the concatenation of the items of all samples.

        Args:
            samples (List[ResegmentedQualityScoringSample]): samples whose ``hypothesis`` and
                ``reference`` items are flattened, in order, into two parallel lists.

        Returns:
            float: the ``score`` field of the sacreBLEU corpus score.
        """
        hypotheses = [hyp for sample in samples for hyp in sample.hypothesis]
        references = [ref for sample in samples for ref in sample.reference]
        # A single reference stream is used, hence the one-element list of references.
        bleu_result = self.bleu.corpus_score(hypotheses, [references])
        LOGGER.info(f"SacreBLEU signature: {self.bleu.get_signature()}")
        LOGGER.info(f"SacreBLEU detailed score: {bleu_result}")
        return bleu_result.score

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser) -> None:
        """Register the ``--tokenizer`` option, restricted to the tokenizers of sacreBLEU."""
        bleu_metric = sacrebleu.metrics.METRICS['BLEU']
        parser.add_argument(
            "--tokenizer",
            choices=bleu_metric.TOKENIZERS,
            default=bleu_metric.TOKENIZER_DEFAULT)

    def requires_source(self) -> bool:
        """This scorer always reports that the source text is not required."""
        return False
@@ -0,0 +1,184 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
import argparse
import json
import logging
from abc import ABC, abstractmethod

import simulstream
from simulstream.config import yaml_config
from simulstream.metrics.readers import LogReader, text_items
23
+
24
+
25
# Configure logging when this module is imported: records at INFO level or above are
# formatted as "<timestamp> | <level> | <logger name> | <message>".
logging.basicConfig(
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
# Logger used by this module.
LOGGER = logging.getLogger('simulstream.stats')
31
+
32
+
33
class Stats(ABC):
    """
    Abstract base class for defining evaluation statistics.

    The class derives from :class:`abc.ABC` so that the ``@abstractmethod`` markers below are
    actually enforced: a subclass that does not implement all of them cannot be instantiated.

    Subclasses must implement:
      - :meth:`name`: unique identifier of the statistic.
      - :meth:`description`: a human-readable explanation.
      - :meth:`compute`: logic to compute the metric from a :class:`LogReader`.
    """
    @abstractmethod
    def name(self) -> str:
        """The unique name of the statistic."""
        ...

    @abstractmethod
    def description(self) -> str:
        """The human-readable explanation of the statistic."""
        ...

    @abstractmethod
    def compute(self, log_reader: LogReader) -> float:
        """
        Compute the value of the statistic.

        Args:
            log_reader (LogReader): Reader object encapsulating log data.

        Returns:
            float: The computed value of the statistic.
        """
        ...
64
+
65
+
66
class NormalizedErasure(Stats):
    """
    Compute the **Normalized Erasure** metric.

    This measures the amount of flickering in retranslation, as defined in
    `Arivazhagan et al., "Re-translation versus Streaming for Simultaneous Translation"
    IWSLT 2020 <https://aclanthology.org/2020.iwslt-1.27/>`_.

    It is defined as the ratio:

    .. math::

        \\text{Normalized Erasure} =
        \\frac{\\text{# Deleted Tokens}}{\\text{# Final Generated Tokens}}
    """

    def name(self) -> str:
        return "normalized_erasure"

    def description(self) -> str:
        return "Normalized erasure, defined in https://aclanthology.org/2020.iwslt-1.27/, " \
               "measures flickering in retranslation. It is defined as the ratio between the " \
               "number of tokens that have been deleted and the number of final generated tokens."

    def compute(self, log_reader: LogReader) -> float:
        """
        Divide the number of deleted tokens by the total length of the final outputs.

        Args:
            log_reader (LogReader): Reader object encapsulating log data.

        Returns:
            float: The normalized erasure.

        Raises:
            ZeroDivisionError: If the final outputs contain no items, so the ratio is undefined.
        """
        # Lengths are measured in the unit (word/char) the log reader was configured with.
        total_length = sum(
            len(text_items(final_text, latency_unit=log_reader.latency_unit))
            for final_text in log_reader.final_outputs().values())
        if total_length == 0:
            # Same exception type as the bare division, but with an actionable message.
            raise ZeroDivisionError(
                "Cannot compute the normalized erasure: the log contains no final generated "
                "tokens.")
        return log_reader.num_deleted_tokens() / total_length
95
+
96
+
97
class RealTimeFactor(Stats):
    """
    Compute the **Real Time Factor**.

    This measures how many seconds of computation are required on average
    for each second of input audio.

    Values greater than 1 indicate that the system is slower than real time
    and cannot process input before the next audio chunk arrives.
    """
    def name(self) -> str:
        return "real_time_factor"

    def description(self) -> str:
        return "The Real Time Factor measures the average computational cost, ie. time in " \
               "seconds spent in computation, for each input audio second. Values higher than 1 " \
               "mean that the system is not able to process the input in time before the next " \
               "input arrives."

    def compute(self, log_reader: LogReader) -> float:
        """
        Divide the total computation time by the total amount of processed audio.

        Args:
            log_reader (LogReader): Reader object encapsulating log data.

        Returns:
            float: The real time factor.

        Raises:
            ZeroDivisionError: If the total amount of processed audio is zero.
        """
        total_audio_lengths = 0
        total_computational_cost = 0
        for logs in log_reader.outputs_by_audio.values():
            # The last log entry of each audio holds the total audio processed for it.
            total_audio_lengths += logs[-1]["total_audio_processed"]
            total_computational_cost += sum(log["computation_time"] for log in logs)
        if total_audio_lengths == 0:
            # Same exception type as the bare division, but with an actionable message.
            raise ZeroDivisionError(
                "Cannot compute the real time factor: the log contains no processed audio.")
        return total_computational_cost / total_audio_lengths
123
+
124
+
125
def main(args: argparse.Namespace):
    """
    Compute every available statistic for a log file and print them.

    The evaluation configuration and the log file are loaded first; then each statistic is
    computed and the whole report is written to standard output as indented JSON.

    Args:
        args (argparse.Namespace): Parsed command-line arguments (``eval_config``,
            ``log_file`` and ``latency_unit`` are read).
    """
    LOGGER.info(f"Loading evaluation configuration from {args.eval_config}")
    eval_config = yaml_config(args.eval_config)
    LOGGER.info(f"Loading evaluation log file from {args.log_file}")
    log_reader = LogReader(eval_config, args.log_file, latency_unit=args.latency_unit)

    LOGGER.info("Computing stats")
    report = {}
    for stat in (NormalizedErasure(), RealTimeFactor()):
        stat_name = stat.name()
        report[stat_name] = {
            "description": stat.description(),
            "value": stat.compute(log_reader),
        }
    print(f"Stats: {json.dumps(report, indent=4)}")
147
+
148
+
149
def cli_main():
    """
    Command-line entry point for computing evaluation statistics from Simulstream logs.

    This script provides a CLI interface to compute metrics that describe the behavior of
    streaming systems. Metrics are computed from JSONL log files generated during evaluation and
    include:

    - **Normalized Erasure**: measures flickering in retranslation processors.
    - **Real Time Factor**: measures average computation time per second of audio.

    The output is printed on standard output in JSON format.

    Typical usage from the command line:

        $ python -m simulstream.metrics.stats --eval-config config/speech_processor.yaml \\
            --log-file metrics.jsonl
    """
    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
    parser = argparse.ArgumentParser("stats")
    parser.add_argument(
        "--eval-config", type=str, required=True,
        help="Path to the yaml config file containing information about the tokenizer to be used.")
    parser.add_argument(
        "--log-file", type=str, required=True,
        help="Path to the log file with the metrics to be used for the evaluation.")
    parser.add_argument(
        "--latency-unit", choices=["word", "char"], default="word",
        help="Whether to compute stats based on words or characters. Default: word.")

    args = parser.parse_args()
    main(args)


if __name__ == "__main__":
    cli_main()
File without changes
@@ -0,0 +1,95 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import argparse
16
+ import logging
17
+ from functools import partial
18
+ from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
19
+
20
+ import simulstream
21
+
22
+
23
# Configure logging when this module is imported: records at INFO level or above are
# formatted as "<timestamp> | <level> | <logger name> | <message>".
logging.basicConfig(
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
# Logger used by this module.
LOGGER = logging.getLogger('simulstream.http_server')
29
+
30
+
31
class CustomHandler(SimpleHTTPRequestHandler):
    """
    Static file handler that additionally serves a configuration file at ``/config.yaml``.

    Every other path is delegated to :class:`http.server.SimpleHTTPRequestHandler`.

    Args:
        config (str): Path of the file returned for ``GET /config.yaml``.
    """

    def __init__(self, *args, config=None, **kwargs):
        # The attribute must be set before calling the base constructor, because the base
        # constructor already handles the request (and therefore calls ``do_GET``).
        self.config = config
        super().__init__(*args, **kwargs)

    def do_GET(self):
        """
        Serve the configuration file for ``/config.yaml``, static files for any other path.

        The configuration is read as UTF-8, matching the charset declared in the response.
        If no configuration was given or the file cannot be read, a 404 error is returned
        instead of letting the exception escape from the handler.
        """
        if self.path != "/config.yaml":
            super().do_GET()
            return

        if self.config is None:
            self.send_error(404, "Configuration file not available")
            return
        try:
            with open(self.config, encoding="utf-8") as f:
                payload = f.read().encode("utf-8")
        except OSError:
            self.send_error(404, "Configuration file not found")
            return

        # Send response
        self.send_response(200)
        self.send_header("Content-type", "text/yaml; charset=utf-8")
        # Explicit length, so that clients know where the body ends.
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)
49
+
50
+
51
def cli_main():
    """
    Simulstream HTTP server command-line interface (CLI) entry point.

    This function parses command-line arguments and serves requests until interrupted
    (e.g. with Ctrl-C), at which point the server socket is closed.

    Example usage::

        $ python http_server.py --config config/server.yaml --directory webdemo

    Command-line arguments:

    - ``--bind`` (str): IP/address on which to serve [default: 127.0.0.1].
    - ``--port`` (int): Port on which to serve [default: 8000].
    - ``--config`` (str): Path to the server configuration file.
    - ``--directory`` (str): Path to the directory containing the HTML web demo
      [default: ./webdemo].

    .. note::
        The server currently does not support secure connection through HTTPS
    """
    LOGGER.info(f"HTTP server version: {simulstream.__version__}")
    parser = argparse.ArgumentParser(description="Simulstream http.server")
    parser.add_argument(
        "--bind", "-b", default="127.0.0.1",
        help="Specify alternate bind address [default: 127.0.0.1]")
    parser.add_argument(
        "--port", "-p", type=int, default=8000,
        help="Specify alternate port [default: 8000]")
    parser.add_argument(
        "--config", "-c", required=True,
        help="Path to configuration file (YAML)")
    parser.add_argument(
        "--directory", "-d", default="./webdemo",
        help="Path to the directory containing the HTML web demo [default: ./webdemo]")
    args = parser.parse_args()

    custom_handler = partial(CustomHandler, config=args.config, directory=args.directory)
    # The context manager closes the listening socket when serving stops for any reason.
    with ThreadingHTTPServer((args.bind, args.port), custom_handler) as httpd:
        LOGGER.info(f"Serving directory {args.directory}")
        LOGGER.info(f"Serving on http://{args.bind}:{args.port}")
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to stop the server: exit without a traceback.
            LOGGER.info("Keyboard interrupt received, shutting down the HTTP server")


if __name__ == "__main__":
    cli_main()
@@ -0,0 +1,156 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import json
16
+ import logging
17
+ import time
18
+ from typing import Optional
19
+
20
+ import librosa
21
+ import numpy as np
22
+
23
+ from simulstream.metrics.logger import METRICS_LOGGER
24
+ from simulstream.server.speech_processors import SpeechProcessor, SAMPLE_RATE
25
+ from simulstream.server.speech_processors.incremental_output import merge_incremental_outputs, \
26
+ IncrementalOutput
27
+
28
+
29
+ LOGGER = logging.getLogger('simulstream.message_processor')
30
+
31
+
32
class MessageProcessor:
    """
    This class is responsible for processing the messages incoming from a client, which include
    audio chunks and control messages (e.g., configurations about languages to use, or signal
    that the stream is over).
    """
    # Incoming audio is 16-bit PCM: two bytes per sample.
    _BYTES_PER_SAMPLE = 2
    # Explicit little-endian int16, so that decoding does not depend on the host byte order.
    _PCM_DTYPE = np.dtype("<i2")

    def __init__(self, client_id: int, speech_processor: SpeechProcessor):
        """
        Args:
            client_id (int): Identifier of the client, reported in the metrics log.
            speech_processor (SpeechProcessor): Processor that the buffered audio is sent to.
        """
        self.client_buffer = b''
        self.processed_audio_seconds = 0
        self.sample_rate = SAMPLE_RATE
        self.client_id = client_id
        self.speech_processor = speech_processor

    def _complete_sample_bytes(self) -> int:
        """
        Return how many buffered bytes form complete 16-bit samples.

        A chunk may end in the middle of a sample; the dangling byte has to wait for the next
        chunk, as decoding a buffer with an odd number of bytes as int16 raises an error.
        """
        return len(self.client_buffer) - len(self.client_buffer) % self._BYTES_PER_SAMPLE

    def _log_metrics(self, incremental_output: IncrementalOutput, computation_time: float):
        """Write one JSON line with the processing statistics to the metrics logger."""
        METRICS_LOGGER.info(json.dumps({
            "id": self.client_id,
            "total_audio_processed": self.processed_audio_seconds,
            "computation_time": computation_time,
            "generated_tokens": incremental_output.new_tokens,
            "deleted_tokens": incremental_output.deleted_tokens,
        }))

    def process_speech(self, speech_data: bytes) -> Optional[IncrementalOutput]:
        """
        Process an audio chunk and return incremental transcription/translation.
        Namely, it receives and buffers raw audio chunks (``bytes``) and processes the buffered
        audio incrementally with the configured
        :class:`~simulstream.server.speech_processors.SpeechProcessor`, once at least
        ``speech_chunk_size`` seconds of audio have been buffered.

        Args:
            speech_data (bytes): Raw PCM audio bytes (16-bit little endian).

        Returns:
            IncrementalOutput: incremental processing results, if any. None otherwise.
        """
        self.client_buffer += speech_data
        # we have SAMPLE_RATE * 2 bytes (int16) samples every second
        buffer_len_seconds = \
            self._complete_sample_bytes() / self._BYTES_PER_SAMPLE / self.sample_rate
        if buffer_len_seconds < self.speech_processor.speech_chunk_size:
            return None

        self.processed_audio_seconds += buffer_len_seconds
        start_time = time.time()
        incremental_output = self._run_speech_processor()
        self._log_metrics(incremental_output, time.time() - start_time)
        return incremental_output

    def _run_speech_processor(self) -> IncrementalOutput:
        """
        This function converts raw ``int16`` PCM audio to normalized ``float32``,
        resamples it if necessary to :data:`~simulstream.server.speech_processors.SAMPLE_RATE`,
        and forwards it to the given
        :class:`~simulstream.server.speech_processors.SpeechProcessor`.
        The processed bytes are then removed from the buffer; a trailing incomplete sample,
        if any, is kept for the next call.
        """
        usable_bytes = self._complete_sample_bytes()
        int16_waveform = np.frombuffer(self.client_buffer[:usable_bytes], dtype=self._PCM_DTYPE)
        # Scale the int16 range to [-1.0, 1.0).
        float32_waveform = int16_waveform.astype(np.float32) / 2 ** 15
        if self.sample_rate != SAMPLE_RATE:
            float32_waveform = librosa.resample(
                float32_waveform, orig_sr=self.sample_rate, target_sr=SAMPLE_RATE)
        incremental_output = self.speech_processor.process_chunk(float32_waveform)
        self.client_buffer = self.client_buffer[usable_bytes:]
        return incremental_output

    def process_metadata(self, metadata: dict):
        """
        Takes a dictionary of metadata regarding the incoming speech and desired output, and
        interacts with the configured speech_processor to set it up accordingly.

        The recognized keys are ``sample_rate``, ``target_lang``, ``source_lang`` and
        ``metrics_metadata`` (the latter is only written to the metrics log).

        Args:
            metadata (dict): Dictionary of metadata regarding the incoming speech.
        """
        if 'sample_rate' in metadata:
            self.sample_rate = int(metadata['sample_rate'])
        if 'target_lang' in metadata:
            self.speech_processor.set_target_language(metadata["target_lang"])
            LOGGER.debug(
                f"Client {self.client_id} target language set to: "
                f"{metadata['target_lang']}")
        if 'source_lang' in metadata:
            self.speech_processor.set_source_language(metadata["source_lang"])
            LOGGER.debug(
                f"Client {self.client_id} source language set to: {metadata['source_lang']}")
        if 'metrics_metadata' in metadata:
            METRICS_LOGGER.info(json.dumps({
                "id": self.client_id,
                "metadata": metadata["metrics_metadata"]
            }))
            LOGGER.debug(
                f"Logged client {self.client_id} metrics metadata: {metadata['metrics_metadata']}")

    def end_of_stream(self) -> IncrementalOutput:
        """
        Performs the last operations to conclude the processing of the stream of audio by the
        speech processor and cleans up everything to be ready for the next stream.

        Returns:
            IncrementalOutput: last output at the end of the stream.
        """
        outputs = []
        start_time = time.time()
        complete_bytes = self._complete_sample_bytes()
        if complete_bytes:
            # process remaining audio after last chunk; a dangling incomplete sample cannot
            # be decoded and is discarded by clear() below
            self.processed_audio_seconds += \
                complete_bytes / self._BYTES_PER_SAMPLE / self.sample_rate
            outputs.append(self._run_speech_processor())

        outputs.append(self.speech_processor.end_of_stream())
        incremental_output = merge_incremental_outputs(
            outputs, self.speech_processor.tokens_to_string)
        self._log_metrics(incremental_output, time.time() - start_time)
        self.clear()
        return incremental_output

    def clear(self):
        """
        Clear the internal states to be ready for a new input stream.
        """
        self.speech_processor.clear()
        self.client_buffer = b''
        self.processed_audio_seconds = 0
        self.sample_rate = SAMPLE_RATE
@@ -0,0 +1,173 @@
1
+ # Copyright 2025 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import importlib
16
+ from abc import ABC, abstractmethod
17
+ from types import SimpleNamespace
18
+ from typing import List, Any
19
+
20
+ import numpy as np
21
+
22
+ from simulstream.server.speech_processors.incremental_output import IncrementalOutput
23
+
24
+
25
# Audio format constants shared by the speech processors.
# NOTE(review): presumably the number of audio channels (1 = mono) — confirm against callers.
CHANNELS = 1
# NOTE(review): presumably the sample width in bytes (2 = 16-bit samples) — confirm.
SAMPLE_WIDTH = 2
# Sample rate of the waveforms passed to SpeechProcessor.process_chunk.
SAMPLE_RATE = 16_000
28
+
29
+
30
class SpeechProcessor(ABC):
    """
    Abstract base class for speech processors.

    Subclasses must implement methods to load models, process audio chunks,
    set source/target languages, conclude a stream, convert tokens to text,
    and clear internal states.
    """

    def __init__(self, config: SimpleNamespace) -> None:
        """
        Initialize the speech processor with a given configuration.

        Args:
            config (SimpleNamespace): Configuration loaded from a YAML file.
        """
        self.config = config

    @property
    def speech_chunk_size(self) -> float:
        """
        Return the size of the speech chunks to be processed (in seconds).

        The value is read from the ``speech_chunk_size`` field of the configuration.
        """
        return self.config.speech_chunk_size

    @classmethod
    @abstractmethod
    def load_model(cls, config: SimpleNamespace):
        """
        Load and initialize the underlying speech model.

        Args:
            config (SimpleNamespace): Configuration of the speech processor.
        """
        ...

    @abstractmethod
    def process_chunk(self, waveform: np.ndarray) -> IncrementalOutput:
        """
        Process a chunk of waveform and produce incremental output.

        Args:
            waveform (np.ndarray): A 1D NumPy ``float32`` array of the audio chunk. The array
                is PCM audio normalized to the range ``[-1.0, 1.0]`` sampled at
                :attr:`simulstream.server.speech_processors.SAMPLE_RATE`.

        Returns:
            IncrementalOutput: The incremental output (new and deleted tokens/strings).
        """
        ...

    @abstractmethod
    def set_source_language(self, language: str) -> None:
        """
        Set the source language for the speech processor.

        Args:
            language (str): Language code (e.g., ``"en"``, ``"it"``).
        """
        ...

    @abstractmethod
    def set_target_language(self, language: str) -> None:
        """
        Set the target language for the speech processor (for translation).

        Args:
            language (str): Language code (e.g., ``"en"``, ``"it"``).
        """
        ...

    @abstractmethod
    def end_of_stream(self) -> IncrementalOutput:
        """
        This method is called at the end of audio chunk processing. It can be used to emit
        hypotheses at the end of the speech to conclude the output.

        Returns:
            IncrementalOutput: The incremental output (new and deleted tokens/strings).
        """
        ...

    @abstractmethod
    def tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts token sequences into human-readable strings.

        Args:
            tokens (List[str]): The tokens to be converted.

        Returns:
            str: The textual representation of the tokens.
        """
        ...

    @abstractmethod
    def clear(self) -> None:
        """
        Clear internal states, such as history of cached audio and/or tokens,
        in preparation for a new stream or conversation.
        """
        ...
128
+
129
+
130
def build_speech_processor(speech_processor_config: SimpleNamespace) -> SpeechProcessor:
    """
    Create the speech processor described by a configuration.

    The ``type`` field of the configuration holds the fully-qualified name of the class to
    use (e.g. ``"simulstream.server.speech_processors.MyProcessor"``). The model is loaded
    through the ``load_model`` class method before the processor is instantiated.

    Args:
        speech_processor_config (SimpleNamespace): Configuration for the speech processor.

    Returns:
        SpeechProcessor: An instance of the configured speech processor.

    Raises:
        AssertionError: If the specified class is not a subclass of SpeechProcessor.
    """
    processor_cls = speech_processor_class_load(speech_processor_config.type)
    # Load the model first, then build the instance from the same configuration.
    processor_cls.load_model(speech_processor_config)
    processor = processor_cls(speech_processor_config)
    return processor
149
+
150
+
151
def speech_processor_class_load(speech_processor_class_string: str) -> type[SpeechProcessor]:
    """
    Import the speech processor class from its string definition.

    Args:
        speech_processor_class_string (str): Full name of the speech processor class to load.

    Returns:
        SpeechProcessorClass: A class object for the speech processor class.

    Raises:
        AssertionError: If the specified name does not refer to a subclass of SpeechProcessor.
    """
    cls = class_load(speech_processor_class_string)
    # Explicit raise instead of an ``assert`` statement: the check must not disappear when
    # Python runs with -O. ``isinstance(cls, type)`` guards ``issubclass`` against attributes
    # that are not classes at all.
    if not (isinstance(cls, type) and issubclass(cls, SpeechProcessor)):
        raise AssertionError(
            f"{speech_processor_class_string} must be a subclass of SpeechProcessor")
    return cls
168
+
169
+
170
def class_load(class_string: str) -> type[Any]:
    """
    Import and return the object identified by a fully-qualified dotted name.

    Args:
        class_string (str): Dotted path in the form ``"package.module.ClassName"``.

    Returns:
        The attribute named ``ClassName`` of the imported module.

    Raises:
        ValueError: If ``class_string`` does not contain a module part.
        ImportError: If the module cannot be imported.
        AttributeError: If the module does not define the requested name.
    """
    module_path, separator, class_name = class_string.rpartition('.')
    if not separator or not module_path:
        # Fail with a message naming the offending value rather than an unpacking error.
        raise ValueError(
            f"Invalid class path '{class_string}': expected '<module>.<ClassName>'")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)