simulstream 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {simulstream-0.1.0/simulstream.egg-info → simulstream-0.3.0}/PKG-INFO +19 -5
- {simulstream-0.1.0 → simulstream-0.3.0}/README.md +18 -4
- {simulstream-0.1.0 → simulstream-0.3.0}/docs/source/conf.py +2 -1
- {simulstream-0.1.0 → simulstream-0.3.0}/pyproject.toml +1 -1
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/__init__.py +5 -1
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/inference.py +4 -6
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/score_quality.py +21 -2
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/mwersegmenter.py +36 -3
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/comet.py +1 -5
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/mwersegmenter.py +41 -2
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base_streamatt.py +12 -2
- simulstream-0.3.0/simulstream/server/speech_processors/remote/http_proxy_speech_processor.py +115 -0
- simulstream-0.3.0/simulstream/server/speech_processors/remote/http_speech_processor_server.py +221 -0
- simulstream-0.3.0/simulstream/version.txt +1 -0
- {simulstream-0.1.0 → simulstream-0.3.0/simulstream.egg-info}/PKG-INFO +19 -5
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream.egg-info/SOURCES.txt +10 -1
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream.egg-info/top_level.txt +1 -3
- simulstream-0.3.0/uts/metrics/test_stream_laal.py +91 -0
- simulstream-0.3.0/uts/metrics/test_tokenize_no_inplace.py +124 -0
- simulstream-0.3.0/uts/speech_processors/__init__.py +0 -0
- simulstream-0.3.0/uts/speech_processors/test_streamatt.py +64 -0
- simulstream-0.3.0/uts/test_inference.py +93 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/LICENSE +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/setup.cfg +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/client/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/client/wav_reader_client.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/config.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/detokenizers.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/logger.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/readers.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/score_latency.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/stream_laal.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/sacrebleu.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/stats.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/http_server.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/message_processor.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/canary_sliding_window_retranslation.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/hf_sliding_window_retranslation.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/incremental_output.py +0 -0
- {simulstream-0.1.0/uts → simulstream-0.3.0/simulstream/server/speech_processors/remote}/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/seamless_streamatt.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/simuleval_wrapper.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/sliding_window_retranslation.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/vad_wrapper.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/websocket_server.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream.egg-info/dependency_links.txt +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream.egg-info/entry_points.txt +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/simulstream.egg-info/requires.txt +0 -0
- {simulstream-0.1.0/uts/metrics → simulstream-0.3.0/uts}/__init__.py +0 -0
- {simulstream-0.1.0/uts/speech_processors → simulstream-0.3.0/uts/metrics}/__init__.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/uts/metrics/log_reader.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/uts/speech_processors/test_simuleval_wrapper.py +0 -0
- {simulstream-0.1.0 → simulstream-0.3.0}/uts/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: simulstream
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: A server to run simultaneous/streaming experiments and demo
|
|
5
5
|
Author-email: Marco Gaido <mgaido@fbk.eu>, FBK HLT-MT <mt@fbk.eu>
|
|
6
6
|
License: Apache License
|
|
@@ -414,14 +414,15 @@ can score your speech processor by running:
|
|
|
414
414
|
simulstream_score_latency --scorer stream_laal \
|
|
415
415
|
--eval-config config/speech_processor.yaml \
|
|
416
416
|
--log-file metrics.jsonl \
|
|
417
|
-
--reference
|
|
417
|
+
--reference REFERENCES_FILE.tgt \
|
|
418
418
|
--audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
|
|
419
419
|
|
|
420
420
|
simulstream_score_quality --scorer comet \
|
|
421
421
|
--eval-config config/speech_processor.yaml \
|
|
422
422
|
--log-file metrics.jsonl \
|
|
423
|
-
--references REFERENCES_FILE.
|
|
424
|
-
--transcripts TRANSCRIPTS_FILE.
|
|
423
|
+
--references REFERENCES_FILE.tgt \
|
|
424
|
+
--transcripts TRANSCRIPTS_FILE.src \
|
|
425
|
+
--audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
|
|
425
426
|
|
|
426
427
|
simulstream_stats --eval-config config/speech_processor.yaml \
|
|
427
428
|
--log-file metrics.jsonl
|
|
@@ -435,7 +436,20 @@ the selected metric (``--scorer``).
|
|
|
435
436
|
|
|
436
437
|
Similarly, ``simulstream_score_quality`` evaluates the quality
|
|
437
438
|
of the generated outputs against one (or more) reference (and transcript, only for metrics
|
|
438
|
-
requiring them) file(s).
|
|
439
|
+
requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
|
|
440
|
+
in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
|
|
441
|
+
|
|
442
|
+
As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
|
|
443
|
+
files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
|
|
444
|
+
**must be the same** as those of the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
|
|
445
|
+
|
|
446
|
+
```
|
|
447
|
+
simulstream_score_quality --scorer comet \
|
|
448
|
+
--eval-config config/speech_processor.yaml \
|
|
449
|
+
--log-file metrics.jsonl \
|
|
450
|
+
--references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
|
|
451
|
+
--transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
|
|
452
|
+
```
|
|
439
453
|
|
|
440
454
|
Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
|
|
441
455
|
|
|
@@ -177,14 +177,15 @@ can score your speech processor by running:
|
|
|
177
177
|
simulstream_score_latency --scorer stream_laal \
|
|
178
178
|
--eval-config config/speech_processor.yaml \
|
|
179
179
|
--log-file metrics.jsonl \
|
|
180
|
-
--reference
|
|
180
|
+
--reference REFERENCES_FILE.tgt \
|
|
181
181
|
--audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
|
|
182
182
|
|
|
183
183
|
simulstream_score_quality --scorer comet \
|
|
184
184
|
--eval-config config/speech_processor.yaml \
|
|
185
185
|
--log-file metrics.jsonl \
|
|
186
|
-
--references REFERENCES_FILE.
|
|
187
|
-
--transcripts TRANSCRIPTS_FILE.
|
|
186
|
+
--references REFERENCES_FILE.tgt \
|
|
187
|
+
--transcripts TRANSCRIPTS_FILE.src \
|
|
188
|
+
--audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
|
|
188
189
|
|
|
189
190
|
simulstream_stats --eval-config config/speech_processor.yaml \
|
|
190
191
|
--log-file metrics.jsonl
|
|
@@ -198,7 +199,20 @@ the selected metric (``--scorer``).
|
|
|
198
199
|
|
|
199
200
|
Similarly, ``simulstream_score_quality`` evaluates the quality
|
|
200
201
|
of the generated outputs against one (or more) reference (and transcript, only for metrics
|
|
201
|
-
requiring them) file(s).
|
|
202
|
+
requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
|
|
203
|
+
in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
|
|
204
|
+
|
|
205
|
+
As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
|
|
206
|
+
files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
|
|
207
|
+
**must be the same** as those of the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
simulstream_score_quality --scorer comet \
|
|
211
|
+
--eval-config config/speech_processor.yaml \
|
|
212
|
+
--log-file metrics.jsonl \
|
|
213
|
+
--references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
|
|
214
|
+
--transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
|
|
215
|
+
```
|
|
202
216
|
|
|
203
217
|
Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
|
|
204
218
|
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
|
+
from simulstream import __version__
|
|
7
8
|
|
|
8
9
|
sys.path.insert(0, os.path.abspath('../../'))
|
|
9
10
|
|
|
@@ -13,7 +14,7 @@ sys.path.insert(0, os.path.abspath('../../'))
|
|
|
13
14
|
project = 'simulstream'
|
|
14
15
|
copyright = '2025, FBK'
|
|
15
16
|
author = 'Marco Gaido, FBK MT Unit'
|
|
16
|
-
release =
|
|
17
|
+
release = __version__
|
|
17
18
|
|
|
18
19
|
# -- General configuration ---------------------------------------------------
|
|
19
20
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
@@ -12,4 +12,8 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
with Path(__file__).with_name('VERSION.txt').open('r') as f:
|
|
19
|
+
__version__ = f.read().strip()
|
|
@@ -49,16 +49,14 @@ def process_audio(
|
|
|
49
49
|
sample_rate (int): Audio sample rate (Hz).
|
|
50
50
|
data (np.ndarray): Audio samples as int16 array.
|
|
51
51
|
"""
|
|
52
|
+
# speech_chunk_size is expressed in seconds, so the number of samples corresponding to
|
|
53
|
+
# one speech chunk is the following
|
|
52
54
|
samples_per_chunk = int(
|
|
53
|
-
sample_rate * message_processor.speech_processor.speech_chunk_size
|
|
54
|
-
|
|
55
|
+
sample_rate * message_processor.speech_processor.speech_chunk_size)
|
|
56
|
+
|
|
55
57
|
for i in range(0, len(data), samples_per_chunk):
|
|
56
58
|
output = message_processor.process_speech(data[i:i + samples_per_chunk].tobytes())
|
|
57
59
|
LOGGER.debug(f"response: {output}")
|
|
58
|
-
# send last part of the audio
|
|
59
|
-
if i < len(data):
|
|
60
|
-
output = message_processor.process_speech(data[i:].tobytes())
|
|
61
|
-
LOGGER.debug(f"response: {output}")
|
|
62
60
|
|
|
63
61
|
|
|
64
62
|
def run_inference(
|
|
@@ -124,6 +124,19 @@ def cli_main():
|
|
|
124
124
|
--log-file metrics.jsonl \\
|
|
125
125
|
--references ref.en \\
|
|
126
126
|
--transcripts src.it \\
|
|
127
|
+
--audio-definition audio_def.yaml \\
|
|
128
|
+
--scorer sacrebleu
|
|
129
|
+
|
|
130
|
+
Otherwise, the script can be invoked without specifying the `--audio-definition`,
|
|
131
|
+
but in this case the name of the reference and transcript files (trimmed of
|
|
132
|
+
the extension) must be the same as that of the audio files used (i.e. the names present
|
|
133
|
+
in `metrics.jsonl`), e.g.:
|
|
134
|
+
|
|
135
|
+
$ python -m simulstream.metrics.score_quality \\
|
|
136
|
+
--eval-config config/speech-processor.yaml \\
|
|
137
|
+
--log-file metrics.jsonl \\
|
|
138
|
+
--references 1.en,2.en \\
|
|
139
|
+
--transcripts 1.it,2.it \\
|
|
127
140
|
--scorer sacrebleu
|
|
128
141
|
"""
|
|
129
142
|
LOGGER.info(f"Simulstream version: {simulstream.__version__}")
|
|
@@ -140,17 +153,23 @@ def cli_main():
|
|
|
140
153
|
"specified, this should be a single file containing all the lines of the audios in "
|
|
141
154
|
"the reference, which should be of the same length of the audio definition. "
|
|
142
155
|
"Otherwise, this should be a list of files, where each contains the lines "
|
|
143
|
-
"corresponding to an audio file."
|
|
156
|
+
"corresponding to an audio file. In the case of being a list of files, the file "
|
|
157
|
+
"stem must match a corresponding transcript for an audio file (if applicable "
|
|
158
|
+
"to the quality metric).")
|
|
144
159
|
parser.add_argument(
|
|
145
160
|
"--transcripts", nargs="+", type=str,
|
|
146
161
|
help="Path to the textual files containing reference transcripts. If `--audio-definition` "
|
|
147
162
|
"is specified, this should be a single file containing all the lines of the audios "
|
|
148
163
|
"in the reference, which should be of the same length of the audio definition. "
|
|
149
164
|
"Otherwise, this should be a list of files, where each contains the lines "
|
|
150
|
-
"corresponding to an audio file."
|
|
165
|
+
"corresponding to an audio file. In the case of being a list of files, the file "
|
|
166
|
+
"stem must match a corresponding reference for an audio file.")
|
|
151
167
|
parser.add_argument(
|
|
152
168
|
"--audio-definition", "-a", type=str, default=None,
|
|
153
169
|
help="Path to the yaml file containing the segment-level audio information.")
|
|
170
|
+
parser.add_argument(
|
|
171
|
+
"--latency-unit", choices=["char", "word"], default="word",
|
|
172
|
+
help="Whether to computed stats based on words or characters. Default: word.")
|
|
154
173
|
parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
|
|
155
174
|
args, _ = parser.parse_known_args()
|
|
156
175
|
|
{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/mwersegmenter.py
RENAMED
|
@@ -17,6 +17,7 @@ from dataclasses import dataclass
|
|
|
17
17
|
from typing import List
|
|
18
18
|
|
|
19
19
|
from mweralign import mweralign
|
|
20
|
+
from mweralign.segmenter import CJSegmenter
|
|
20
21
|
|
|
21
22
|
from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
|
|
22
23
|
from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
|
|
@@ -58,6 +59,7 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
|
|
|
58
59
|
def __init__(self, args):
|
|
59
60
|
super().__init__(args)
|
|
60
61
|
self.latency_unit = args.latency_unit
|
|
62
|
+
self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
|
|
61
63
|
|
|
62
64
|
def requires_reference(self) -> bool:
|
|
63
65
|
return True
|
|
@@ -101,19 +103,50 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
|
|
|
101
103
|
f"Index {index} should have reached end of delays ({len(delays)})"
|
|
102
104
|
return segmented_delays
|
|
103
105
|
|
|
106
|
+
def _tokenize(self, text: List[str]) -> List[str]:
|
|
107
|
+
"""
|
|
108
|
+
Tokenize text using the segmenter.
|
|
109
|
+
|
|
110
|
+
Borrowed from
|
|
111
|
+
https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
|
|
112
|
+
"""
|
|
113
|
+
if self.segmenter is not None:
|
|
114
|
+
tokenized_text = []
|
|
115
|
+
for i in range(len(text)):
|
|
116
|
+
if " ### " in text[i]:
|
|
117
|
+
pieces = text[i].strip().split(" ### ")
|
|
118
|
+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
|
|
119
|
+
tokenized_text.append(" ### ".join(encoded))
|
|
120
|
+
elif "\t" in text[i]:
|
|
121
|
+
pieces = text[i].strip().split("\t")
|
|
122
|
+
# underlying C++ binary still uses ###
|
|
123
|
+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
|
|
124
|
+
tokenized_text.append(" ### ".join(encoded))
|
|
125
|
+
else:
|
|
126
|
+
tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
|
|
127
|
+
return "\n".join(tokenized_text)
|
|
128
|
+
else:
|
|
129
|
+
return "\n".join(text)
|
|
130
|
+
|
|
104
131
|
def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
|
|
105
132
|
resegmented_samples = []
|
|
106
133
|
for sample in samples:
|
|
107
134
|
assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
|
|
108
135
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
sample.
|
|
136
|
+
hypo = self._tokenize([sample.hypothesis.final_text])
|
|
137
|
+
refs = self._tokenize(
|
|
138
|
+
[sentence_def.content for sentence_def in sample.reference])
|
|
139
|
+
resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
|
|
112
140
|
|
|
113
141
|
assert len(resegmented_hypos) == len(sample.reference), \
|
|
114
142
|
f"Reference ({sample.audio_name}) has mismatched number of target " \
|
|
115
143
|
f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
|
|
116
144
|
|
|
145
|
+
if self.segmenter is not None:
|
|
146
|
+
# segmenter.decode will strip() the spaces, but we need them to align with delays
|
|
147
|
+
resegmented_hypos = [
|
|
148
|
+
hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
|
|
149
|
+
|
|
117
150
|
ideal_delays_splits = self._split_delays_by_segmented_text(
|
|
118
151
|
sample.hypothesis.ideal_delays,
|
|
119
152
|
resegmented_hypos)
|
|
@@ -13,17 +13,13 @@
|
|
|
13
13
|
# limitations under the License
|
|
14
14
|
|
|
15
15
|
import argparse
|
|
16
|
-
import sys
|
|
17
16
|
from typing import List
|
|
18
17
|
|
|
19
18
|
from simulstream.metrics.scorers.quality import register_quality_scorer
|
|
20
19
|
from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
|
|
21
20
|
ResegmentedQualityScoringSample
|
|
22
21
|
|
|
23
|
-
|
|
24
|
-
from comet import download_model, load_from_checkpoint
|
|
25
|
-
except ImportError:
|
|
26
|
-
sys.exit("Please install comet first with `pip install unbabel-comet`.")
|
|
22
|
+
from comet import download_model, load_from_checkpoint
|
|
27
23
|
|
|
28
24
|
|
|
29
25
|
@register_quality_scorer("comet")
|
{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/mwersegmenter.py
RENAMED
|
@@ -17,6 +17,7 @@ from dataclasses import dataclass
|
|
|
17
17
|
from typing import List, Optional
|
|
18
18
|
|
|
19
19
|
from mweralign import mweralign
|
|
20
|
+
from mweralign.segmenter import CJSegmenter
|
|
20
21
|
|
|
21
22
|
from simulstream.metrics.scorers.quality import QualityScorer, QualityScoringSample
|
|
22
23
|
|
|
@@ -56,6 +57,11 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
|
|
|
56
57
|
... # Compute a custom quality score
|
|
57
58
|
... return ...
|
|
58
59
|
"""
|
|
60
|
+
|
|
61
|
+
def __init__(self, args):
|
|
62
|
+
super().__init__(args)
|
|
63
|
+
self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
|
|
64
|
+
|
|
59
65
|
def requires_reference(self) -> bool:
|
|
60
66
|
return True
|
|
61
67
|
|
|
@@ -75,15 +81,48 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
|
|
|
75
81
|
"""
|
|
76
82
|
...
|
|
77
83
|
|
|
84
|
+
def _tokenize(self, text: List[str]) -> List[str]:
|
|
85
|
+
"""
|
|
86
|
+
Tokenize text using the segmenter.
|
|
87
|
+
|
|
88
|
+
Borrowed from
|
|
89
|
+
https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
|
|
90
|
+
"""
|
|
91
|
+
if self.segmenter is not None:
|
|
92
|
+
tokenized_text = []
|
|
93
|
+
for i in range(len(text)):
|
|
94
|
+
if " ### " in text[i]:
|
|
95
|
+
pieces = text[i].strip().split(" ### ")
|
|
96
|
+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
|
|
97
|
+
tokenized_text.append(" ### ".join(encoded))
|
|
98
|
+
elif "\t" in text[i]:
|
|
99
|
+
pieces = text[i].strip().split("\t")
|
|
100
|
+
# underlying C++ binary still uses ###
|
|
101
|
+
encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
|
|
102
|
+
tokenized_text.append(" ### ".join(encoded))
|
|
103
|
+
else:
|
|
104
|
+
tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
|
|
105
|
+
return "\n".join(tokenized_text)
|
|
106
|
+
else:
|
|
107
|
+
return "\n".join(text)
|
|
108
|
+
|
|
78
109
|
def score(self, samples: List[QualityScoringSample]) -> float:
|
|
79
110
|
resegmented_samples = []
|
|
80
111
|
for sample in samples:
|
|
81
112
|
assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
|
|
82
|
-
|
|
83
|
-
|
|
113
|
+
hypo = self._tokenize([sample.hypothesis])
|
|
114
|
+
refs = self._tokenize(sample.reference)
|
|
115
|
+
resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
|
|
116
|
+
|
|
84
117
|
assert len(sample.reference) == len(resegmented_hypos), \
|
|
85
118
|
f"Reference ({sample.audio_name}) has mismatched number of target " \
|
|
86
119
|
f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
|
|
120
|
+
|
|
121
|
+
if self.segmenter is not None:
|
|
122
|
+
# segmenter.decode will strip() the spaces, but we need them to align with delays
|
|
123
|
+
resegmented_hypos = [
|
|
124
|
+
hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
|
|
125
|
+
|
|
87
126
|
resegmented_samples.append(ResegmentedQualityScoringSample(
|
|
88
127
|
sample.audio_name,
|
|
89
128
|
resegmented_hypos,
|
{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base_streamatt.py
RENAMED
|
@@ -151,6 +151,12 @@ class BaseStreamAtt(BaseSpeechProcessor):
|
|
|
151
151
|
self._cut_audio_exceeding_maxlen()
|
|
152
152
|
return
|
|
153
153
|
|
|
154
|
+
assert len(self.text_history) > 0, \
|
|
155
|
+
"If text history is empty after selection, audio cannot be aligned. " \
|
|
156
|
+
"If you see this message, it indicates a bug, so please open an issue at " \
|
|
157
|
+
"https://github.com/hlt-mt/simulstream/issues and include the steps that " \
|
|
158
|
+
"led to this state."
|
|
159
|
+
|
|
154
160
|
# Trim the cross-attention by excluding the discarded new generated tokens and the
|
|
155
161
|
# discarded textual history. Output shape: (text_history_len, n_audio_features)
|
|
156
162
|
cross_attn = cross_attn[discarded_text:discarded_text + len(self.text_history), :]
|
|
@@ -299,13 +305,15 @@ class PunctuationTextHistory:
|
|
|
299
305
|
The current implementation supports only SentencePiece.
|
|
300
306
|
"""
|
|
301
307
|
|
|
302
|
-
STRONG_PUNCTUATION = [".", "!", "?", ":", ";"]
|
|
308
|
+
STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"]
|
|
303
309
|
|
|
304
310
|
def __init__(self, config: SimpleNamespace):
|
|
305
311
|
self.config = config
|
|
306
312
|
|
|
307
313
|
def select_text_history(self, text_history):
|
|
308
314
|
new_history = []
|
|
315
|
+
seen_punctuation = False
|
|
316
|
+
|
|
309
317
|
for token in reversed(text_history):
|
|
310
318
|
prefix_token = token
|
|
311
319
|
contains_punctuation = False
|
|
@@ -314,7 +322,9 @@ class PunctuationTextHistory:
|
|
|
314
322
|
contains_punctuation = True
|
|
315
323
|
break
|
|
316
324
|
if contains_punctuation:
|
|
317
|
-
|
|
325
|
+
if seen_punctuation:
|
|
326
|
+
break
|
|
327
|
+
seen_punctuation = True
|
|
318
328
|
new_history.append(token)
|
|
319
329
|
# Reverse the list
|
|
320
330
|
return new_history[::-1]
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Copyright 2026 FBK
|
|
2
|
+
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import json
|
|
17
|
+
from http import HTTPStatus
|
|
18
|
+
from typing import List, Any, Dict, Optional
|
|
19
|
+
import uuid
|
|
20
|
+
import urllib.request
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
from simulstream.server.speech_processors import SpeechProcessor, IncrementalOutput
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class HttpProxySpeechProcessor(SpeechProcessor):
|
|
28
|
+
"""
|
|
29
|
+
HTTP-based proxy implementation of :class:`SpeechProcessor`.
|
|
30
|
+
|
|
31
|
+
This class does not perform speech processing locally. Instead, it forwards
|
|
32
|
+
all method calls to a remote speech processor exposed via HTTP, maintaining
|
|
33
|
+
a dedicated session on the server side.
|
|
34
|
+
|
|
35
|
+
Each instance of this class corresponds to exactly one remote session.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
@classmethod
|
|
39
|
+
def load_model(cls, config):
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
def __init__(self, config):
|
|
43
|
+
super().__init__(config)
|
|
44
|
+
self.base_url = f"http://{config.hostname}:{config.port}/"
|
|
45
|
+
self.session_id = uuid.uuid4().hex
|
|
46
|
+
self._cached_speech_chunk_size = None
|
|
47
|
+
|
|
48
|
+
def _http_request(
|
|
49
|
+
self, path: str, method: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
50
|
+
data = json.dumps(payload).encode("utf-8")
|
|
51
|
+
req = urllib.request.Request(
|
|
52
|
+
self.base_url + path,
|
|
53
|
+
data=data,
|
|
54
|
+
headers={"Content-Type": "application/json"},
|
|
55
|
+
method=method,
|
|
56
|
+
)
|
|
57
|
+
with urllib.request.urlopen(req) as resp:
|
|
58
|
+
if resp.status == HTTPStatus.NO_CONTENT:
|
|
59
|
+
return None
|
|
60
|
+
return json.loads(resp.read())
|
|
61
|
+
|
|
62
|
+
@staticmethod
|
|
63
|
+
def _to_incremental_outputs(json_dict: Dict[str, Any]):
|
|
64
|
+
return IncrementalOutput(
|
|
65
|
+
new_tokens=json_dict["new_tokens"],
|
|
66
|
+
new_string=json_dict["new_string"],
|
|
67
|
+
deleted_tokens=json_dict["deleted_tokens"],
|
|
68
|
+
deleted_string=json_dict["deleted_string"]
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def speech_chunk_size(self) -> float:
|
|
73
|
+
if self._cached_speech_chunk_size is None:
|
|
74
|
+
response = self._http_request("speech_chunk_size", "GET", {
|
|
75
|
+
"session_id": self.session_id
|
|
76
|
+
})
|
|
77
|
+
self._cached_speech_chunk_size = response["speech_chunk_size"]
|
|
78
|
+
return self._cached_speech_chunk_size
|
|
79
|
+
|
|
80
|
+
def process_chunk(self, waveform: np.float32) -> IncrementalOutput:
|
|
81
|
+
response = self._http_request("process_chunk", "POST", {
|
|
82
|
+
"session_id": self.session_id,
|
|
83
|
+
"waveform": base64.b64encode(waveform.tobytes()).decode("utf-8"),
|
|
84
|
+
})
|
|
85
|
+
return self._to_incremental_outputs(response)
|
|
86
|
+
|
|
87
|
+
def set_source_language(self, language):
|
|
88
|
+
self._http_request("source_language", "PUT", {
|
|
89
|
+
"session_id": self.session_id,
|
|
90
|
+
"language": language,
|
|
91
|
+
})
|
|
92
|
+
|
|
93
|
+
def set_target_language(self, language):
|
|
94
|
+
self._http_request("target_language", "PUT", {
|
|
95
|
+
"session_id": self.session_id,
|
|
96
|
+
"language": language,
|
|
97
|
+
})
|
|
98
|
+
|
|
99
|
+
def end_of_stream(self) -> IncrementalOutput:
|
|
100
|
+
response = self._http_request("end_of_stream", "POST", {
|
|
101
|
+
"session_id": self.session_id,
|
|
102
|
+
})
|
|
103
|
+
return self._to_incremental_outputs(response)
|
|
104
|
+
|
|
105
|
+
def clear(self):
|
|
106
|
+
self._http_request("clear", "POST", {
|
|
107
|
+
"session_id": self.session_id,
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
def tokens_to_string(self, tokens: List[str]) -> str:
|
|
111
|
+
response = self._http_request("tokens_to_string", "GET", {
|
|
112
|
+
"session_id": self.session_id,
|
|
113
|
+
"tokens": tokens,
|
|
114
|
+
})
|
|
115
|
+
return response["tokens_as_string"]
|