simulstream 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {simulstream-0.2.0/simulstream.egg-info → simulstream-0.3.0}/PKG-INFO +19 -5
  2. {simulstream-0.2.0 → simulstream-0.3.0}/README.md +18 -4
  3. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/inference.py +1 -5
  4. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/score_quality.py +21 -2
  5. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/mwersegmenter.py +36 -3
  6. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/comet.py +1 -5
  7. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/mwersegmenter.py +41 -2
  8. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base_streamatt.py +12 -2
  9. simulstream-0.3.0/simulstream/version.txt +1 -0
  10. {simulstream-0.2.0 → simulstream-0.3.0/simulstream.egg-info}/PKG-INFO +19 -5
  11. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/SOURCES.txt +5 -1
  12. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/top_level.txt +0 -3
  13. simulstream-0.3.0/uts/metrics/test_stream_laal.py +91 -0
  14. simulstream-0.3.0/uts/metrics/test_tokenize_no_inplace.py +124 -0
  15. simulstream-0.3.0/uts/speech_processors/test_streamatt.py +64 -0
  16. simulstream-0.3.0/uts/test_inference.py +93 -0
  17. simulstream-0.2.0/simulstream/version.txt +0 -1
  18. {simulstream-0.2.0 → simulstream-0.3.0}/LICENSE +0 -0
  19. {simulstream-0.2.0 → simulstream-0.3.0}/docs/source/conf.py +0 -0
  20. {simulstream-0.2.0 → simulstream-0.3.0}/pyproject.toml +0 -0
  21. {simulstream-0.2.0 → simulstream-0.3.0}/setup.cfg +0 -0
  22. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/__init__.py +0 -0
  23. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/client/__init__.py +0 -0
  24. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/client/wav_reader_client.py +0 -0
  25. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/config.py +0 -0
  26. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/__init__.py +0 -0
  27. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/detokenizers.py +0 -0
  28. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/logger.py +0 -0
  29. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/readers.py +0 -0
  30. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/score_latency.py +0 -0
  31. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/__init__.py +0 -0
  32. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/__init__.py +0 -0
  33. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/stream_laal.py +0 -0
  34. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/__init__.py +0 -0
  35. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/sacrebleu.py +0 -0
  36. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/stats.py +0 -0
  37. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/__init__.py +0 -0
  38. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/http_server.py +0 -0
  39. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/message_processor.py +0 -0
  40. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/__init__.py +0 -0
  41. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base.py +0 -0
  42. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/canary_sliding_window_retranslation.py +0 -0
  43. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/hf_sliding_window_retranslation.py +0 -0
  44. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/incremental_output.py +0 -0
  45. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/remote/__init__.py +0 -0
  46. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/remote/http_proxy_speech_processor.py +0 -0
  47. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/remote/http_speech_processor_server.py +0 -0
  48. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +0 -0
  49. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/seamless_streamatt.py +0 -0
  50. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/simuleval_wrapper.py +0 -0
  51. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/sliding_window_retranslation.py +0 -0
  52. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/vad_wrapper.py +0 -0
  53. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/websocket_server.py +0 -0
  54. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/dependency_links.txt +0 -0
  55. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/entry_points.txt +0 -0
  56. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/requires.txt +0 -0
  57. {simulstream-0.2.0 → simulstream-0.3.0}/uts/__init__.py +0 -0
  58. {simulstream-0.2.0 → simulstream-0.3.0}/uts/metrics/__init__.py +0 -0
  59. {simulstream-0.2.0 → simulstream-0.3.0}/uts/metrics/log_reader.py +0 -0
  60. {simulstream-0.2.0 → simulstream-0.3.0}/uts/speech_processors/__init__.py +0 -0
  61. {simulstream-0.2.0 → simulstream-0.3.0}/uts/speech_processors/test_simuleval_wrapper.py +0 -0
  62. {simulstream-0.2.0 → simulstream-0.3.0}/uts/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simulstream
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: A server to run simultaneous/streaming experiments and demo
5
5
  Author-email: Marco Gaido <mgaido@fbk.eu>, FBK HLT-MT <mt@fbk.eu>
6
6
  License: Apache License
@@ -414,14 +414,15 @@ can score your speech processor by running:
414
414
  simulstream_score_latency --scorer stream_laal \
415
415
  --eval-config config/speech_processor.yaml \
416
416
  --log-file metrics.jsonl \
417
- --reference REFERENCE_FILE.txt \
417
+ --reference REFERENCES_FILE.tgt \
418
418
  --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
419
419
 
420
420
  simulstream_score_quality --scorer comet \
421
421
  --eval-config config/speech_processor.yaml \
422
422
  --log-file metrics.jsonl \
423
- --references REFERENCES_FILE.txt \
424
- --transcripts TRANSCRIPTS_FILE.txt
423
+ --references REFERENCES_FILE.tgt \
424
+ --transcripts TRANSCRIPTS_FILE.src \
425
+ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
425
426
 
426
427
  simulstream_stats --eval-config config/speech_processor.yaml \
427
428
  --log-file metrics.jsonl
@@ -435,7 +436,20 @@ the selected metric (``--scorer``).
435
436
 
436
437
  Similarly, ``simulstream_score_quality`` evaluates the quality
437
438
  of the generated outputs against one (or more) reference (and transcript, only for metrics
438
- requiring them) file(s).
439
+ requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
440
+ in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
441
+
442
+ As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
443
+ files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
444
+ **must be the same** as the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
445
+
446
+ ```
447
+ simulstream_score_quality --scorer comet \
448
+ --eval-config config/speech_processor.yaml \
449
+ --log-file metrics.jsonl \
450
+ --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
451
+ --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
452
+ ```
439
453
 
440
454
  Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
441
455
 
@@ -177,14 +177,15 @@ can score your speech processor by running:
177
177
  simulstream_score_latency --scorer stream_laal \
178
178
  --eval-config config/speech_processor.yaml \
179
179
  --log-file metrics.jsonl \
180
- --reference REFERENCE_FILE.txt \
180
+ --reference REFERENCES_FILE.tgt \
181
181
  --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
182
182
 
183
183
  simulstream_score_quality --scorer comet \
184
184
  --eval-config config/speech_processor.yaml \
185
185
  --log-file metrics.jsonl \
186
- --references REFERENCES_FILE.txt \
187
- --transcripts TRANSCRIPTS_FILE.txt
186
+ --references REFERENCES_FILE.tgt \
187
+ --transcripts TRANSCRIPTS_FILE.src \
188
+ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
188
189
 
189
190
  simulstream_stats --eval-config config/speech_processor.yaml \
190
191
  --log-file metrics.jsonl
@@ -198,7 +199,20 @@ the selected metric (``--scorer``).
198
199
 
199
200
  Similarly, ``simulstream_score_quality`` evaluates the quality
200
201
  of the generated outputs against one (or more) reference (and transcript, only for metrics
201
- requiring them) file(s).
202
+ requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
203
+ in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
204
+
205
+ As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
206
+ files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
207
+ **must be the same** as the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
208
+
209
+ ```
210
+ simulstream_score_quality --scorer comet \
211
+ --eval-config config/speech_processor.yaml \
212
+ --log-file metrics.jsonl \
213
+ --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
214
+ --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
215
+ ```
202
216
 
203
217
  Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
204
218
 
@@ -53,14 +53,10 @@ def process_audio(
53
53
  # one speech chunk is the following
54
54
  samples_per_chunk = int(
55
55
  sample_rate * message_processor.speech_processor.speech_chunk_size)
56
- i = 0
56
+
57
57
  for i in range(0, len(data), samples_per_chunk):
58
58
  output = message_processor.process_speech(data[i:i + samples_per_chunk].tobytes())
59
59
  LOGGER.debug(f"response: {output}")
60
- # send last part of the audio
61
- if i < len(data):
62
- output = message_processor.process_speech(data[i:].tobytes())
63
- LOGGER.debug(f"response: {output}")
64
60
 
65
61
 
66
62
  def run_inference(
@@ -124,6 +124,19 @@ def cli_main():
124
124
  --log-file metrics.jsonl \\
125
125
  --references ref.en \\
126
126
  --transcripts src.it \\
127
+ --audio-definition audio_def.yaml \\
128
+ --scorer sacrebleu
129
+
130
+ Otherwise, the script can be invoked without specifying the `--audio-definition`,
131
+ but in this case the name of the reference and transcript files (trimmed of
132
+ the extension) must be the same as the audio files used (i.e. the names present
133
+ in `metrics.jsonl`), e.g.:
134
+
135
+ $ python -m simulstream.metrics.score_quality \\
136
+ --eval-config config/speech-processor.yaml \\
137
+ --log-file metrics.jsonl \\
138
+ --references 1.en,2.en \\
139
+ --transcripts 1.it,2.it \\
127
140
  --scorer sacrebleu
128
141
  """
129
142
  LOGGER.info(f"Simulstream version: {simulstream.__version__}")
@@ -140,17 +153,23 @@ def cli_main():
140
153
  "specified, this should be a single file containing all the lines of the audios in "
141
154
  "the reference, which should be of the same length of the audio definition. "
142
155
  "Otherwise, this should be a list of files, where each contains the lines "
143
- "corresponding to an audio file.")
156
+ "corresponding to an audio file. In the case of being a list of files, the file "
157
+ "stem must match a corresponding transcript for an audio file (if applicable "
158
+ "to the quality metric).")
144
159
  parser.add_argument(
145
160
  "--transcripts", nargs="+", type=str,
146
161
  help="Path to the textual files containing reference transcripts. If `--audio-definition` "
147
162
  "is specified, this should be a single file containing all the lines of the audios "
148
163
  "in the reference, which should be of the same length of the audio definition. "
149
164
  "Otherwise, this should be a list of files, where each contains the lines "
150
- "corresponding to an audio file.")
165
+ "corresponding to an audio file. In the case of being a list of files, the file "
166
+ "stem must match a corresponding reference for an audio file.")
151
167
  parser.add_argument(
152
168
  "--audio-definition", "-a", type=str, default=None,
153
169
  help="Path to the yaml file containing the segment-level audio information.")
170
+ parser.add_argument(
171
+ "--latency-unit", choices=["char", "word"], default="word",
172
+ help="Whether to compute stats based on words or characters. Default: word.")
154
173
  parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
155
174
  args, _ = parser.parse_known_args()
156
175
 
@@ -17,6 +17,7 @@ from dataclasses import dataclass
17
17
  from typing import List
18
18
 
19
19
  from mweralign import mweralign
20
+ from mweralign.segmenter import CJSegmenter
20
21
 
21
22
  from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
22
23
  from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
@@ -58,6 +59,7 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
58
59
  def __init__(self, args):
59
60
  super().__init__(args)
60
61
  self.latency_unit = args.latency_unit
62
+ self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
61
63
 
62
64
  def requires_reference(self) -> bool:
63
65
  return True
@@ -101,19 +103,50 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
101
103
  f"Index {index} should have reached end of delays ({len(delays)})"
102
104
  return segmented_delays
103
105
 
106
+ def _tokenize(self, text: List[str]) -> List[str]:
107
+ """
108
+ Tokenize text using the segmenter.
109
+
110
+ Borrowed from
111
+ https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
112
+ """
113
+ if self.segmenter is not None:
114
+ tokenized_text = []
115
+ for i in range(len(text)):
116
+ if " ### " in text[i]:
117
+ pieces = text[i].strip().split(" ### ")
118
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
119
+ tokenized_text.append(" ### ".join(encoded))
120
+ elif "\t" in text[i]:
121
+ pieces = text[i].strip().split("\t")
122
+ # underlying C++ binary still uses ###
123
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
124
+ tokenized_text.append(" ### ".join(encoded))
125
+ else:
126
+ tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
127
+ return "\n".join(tokenized_text)
128
+ else:
129
+ return "\n".join(text)
130
+
104
131
  def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
105
132
  resegmented_samples = []
106
133
  for sample in samples:
107
134
  assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
108
135
 
109
- resegmented_hypos = mweralign.align_texts(
110
- "\n".join([sentence_def.content for sentence_def in sample.reference]),
111
- sample.hypothesis.final_text).split("\n")
136
+ hypo = self._tokenize([sample.hypothesis.final_text])
137
+ refs = self._tokenize(
138
+ [sentence_def.content for sentence_def in sample.reference])
139
+ resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
112
140
 
113
141
  assert len(resegmented_hypos) == len(sample.reference), \
114
142
  f"Reference ({sample.audio_name}) has mismatched number of target " \
115
143
  f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
116
144
 
145
+ if self.segmenter is not None:
146
+ # segmenter.decode will strip() the spaces, but we need them to align with delays
147
+ resegmented_hypos = [
148
+ hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
149
+
117
150
  ideal_delays_splits = self._split_delays_by_segmented_text(
118
151
  sample.hypothesis.ideal_delays,
119
152
  resegmented_hypos)
@@ -13,17 +13,13 @@
13
13
  # limitations under the License
14
14
 
15
15
  import argparse
16
- import sys
17
16
  from typing import List
18
17
 
19
18
  from simulstream.metrics.scorers.quality import register_quality_scorer
20
19
  from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
21
20
  ResegmentedQualityScoringSample
22
21
 
23
- try:
24
- from comet import download_model, load_from_checkpoint
25
- except ImportError:
26
- sys.exit("Please install comet first with `pip install unbabel-comet`.")
22
+ from comet import download_model, load_from_checkpoint
27
23
 
28
24
 
29
25
  @register_quality_scorer("comet")
@@ -17,6 +17,7 @@ from dataclasses import dataclass
17
17
  from typing import List, Optional
18
18
 
19
19
  from mweralign import mweralign
20
+ from mweralign.segmenter import CJSegmenter
20
21
 
21
22
  from simulstream.metrics.scorers.quality import QualityScorer, QualityScoringSample
22
23
 
@@ -56,6 +57,11 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
56
57
  ... # Compute a custom quality score
57
58
  ... return ...
58
59
  """
60
+
61
+ def __init__(self, args):
62
+ super().__init__(args)
63
+ self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
64
+
59
65
  def requires_reference(self) -> bool:
60
66
  return True
61
67
 
@@ -75,15 +81,48 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
75
81
  """
76
82
  ...
77
83
 
84
+ def _tokenize(self, text: List[str]) -> List[str]:
85
+ """
86
+ Tokenize text using the segmenter.
87
+
88
+ Borrowed from
89
+ https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
90
+ """
91
+ if self.segmenter is not None:
92
+ tokenized_text = []
93
+ for i in range(len(text)):
94
+ if " ### " in text[i]:
95
+ pieces = text[i].strip().split(" ### ")
96
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
97
+ tokenized_text.append(" ### ".join(encoded))
98
+ elif "\t" in text[i]:
99
+ pieces = text[i].strip().split("\t")
100
+ # underlying C++ binary still uses ###
101
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
102
+ tokenized_text.append(" ### ".join(encoded))
103
+ else:
104
+ tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
105
+ return "\n".join(tokenized_text)
106
+ else:
107
+ return "\n".join(text)
108
+
78
109
  def score(self, samples: List[QualityScoringSample]) -> float:
79
110
  resegmented_samples = []
80
111
  for sample in samples:
81
112
  assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
82
- resegmented_hypos = mweralign.align_texts(
83
- "\n".join(sample.reference), sample.hypothesis).split("\n")
113
+ hypo = self._tokenize([sample.hypothesis])
114
+ refs = self._tokenize(sample.reference)
115
+ resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
116
+
84
117
  assert len(sample.reference) == len(resegmented_hypos), \
85
118
  f"Reference ({sample.audio_name}) has mismatched number of target " \
86
119
  f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
120
+
121
+ if self.segmenter is not None:
122
+ # segmenter.decode will strip() the spaces, but we need them to align with delays
123
+ resegmented_hypos = [
124
+ hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
125
+
87
126
  resegmented_samples.append(ResegmentedQualityScoringSample(
88
127
  sample.audio_name,
89
128
  resegmented_hypos,
@@ -151,6 +151,12 @@ class BaseStreamAtt(BaseSpeechProcessor):
151
151
  self._cut_audio_exceeding_maxlen()
152
152
  return
153
153
 
154
+ assert len(self.text_history) > 0, \
155
+ "If text history is empty after selection, audio cannot be aligned. " \
156
+ "If you see this message, it indicates a bug, so please open an issue at " \
157
+ "https://github.com/hlt-mt/simulstream/issues and include the steps that " \
158
+ "led to this state."
159
+
154
160
  # Trim the cross-attention by excluding the discarded new generated tokens and the
155
161
  # discarded textual history. Output shape: (text_history_len, n_audio_features)
156
162
  cross_attn = cross_attn[discarded_text:discarded_text + len(self.text_history), :]
@@ -299,13 +305,15 @@ class PunctuationTextHistory:
299
305
  The current implementation supports only SentencePiece.
300
306
  """
301
307
 
302
- STRONG_PUNCTUATION = [".", "!", "?", ":", ";"]
308
+ STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"]
303
309
 
304
310
  def __init__(self, config: SimpleNamespace):
305
311
  self.config = config
306
312
 
307
313
  def select_text_history(self, text_history):
308
314
  new_history = []
315
+ seen_punctuation = False
316
+
309
317
  for token in reversed(text_history):
310
318
  prefix_token = token
311
319
  contains_punctuation = False
@@ -314,7 +322,9 @@ class PunctuationTextHistory:
314
322
  contains_punctuation = True
315
323
  break
316
324
  if contains_punctuation:
317
- break
325
+ if seen_punctuation:
326
+ break
327
+ seen_punctuation = True
318
328
  new_history.append(token)
319
329
  # Reverse the list
320
330
  return new_history[::-1]
@@ -0,0 +1 @@
1
+ 0.3.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simulstream
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: A server to run simultaneous/streaming experiments and demo
5
5
  Author-email: Marco Gaido <mgaido@fbk.eu>, FBK HLT-MT <mt@fbk.eu>
6
6
  License: Apache License
@@ -414,14 +414,15 @@ can score your speech processor by running:
414
414
  simulstream_score_latency --scorer stream_laal \
415
415
  --eval-config config/speech_processor.yaml \
416
416
  --log-file metrics.jsonl \
417
- --reference REFERENCE_FILE.txt \
417
+ --reference REFERENCES_FILE.tgt \
418
418
  --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
419
419
 
420
420
  simulstream_score_quality --scorer comet \
421
421
  --eval-config config/speech_processor.yaml \
422
422
  --log-file metrics.jsonl \
423
- --references REFERENCES_FILE.txt \
424
- --transcripts TRANSCRIPTS_FILE.txt
423
+ --references REFERENCES_FILE.tgt \
424
+ --transcripts TRANSCRIPTS_FILE.src \
425
+ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
425
426
 
426
427
  simulstream_stats --eval-config config/speech_processor.yaml \
427
428
  --log-file metrics.jsonl
@@ -435,7 +436,20 @@ the selected metric (``--scorer``).
435
436
 
436
437
  Similarly, ``simulstream_score_quality`` evaluates the quality
437
438
  of the generated outputs against one (or more) reference (and transcript, only for metrics
438
- requiring them) file(s).
439
+ requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
440
+ in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
441
+
442
+ As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
443
+ files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
444
+ **must be the same** as the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
445
+
446
+ ```
447
+ simulstream_score_quality --scorer comet \
448
+ --eval-config config/speech_processor.yaml \
449
+ --log-file metrics.jsonl \
450
+ --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
451
+ --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
452
+ ```
439
453
 
440
454
  Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
441
455
 
@@ -49,8 +49,12 @@ simulstream/server/speech_processors/remote/__init__.py
49
49
  simulstream/server/speech_processors/remote/http_proxy_speech_processor.py
50
50
  simulstream/server/speech_processors/remote/http_speech_processor_server.py
51
51
  uts/__init__.py
52
+ uts/test_inference.py
52
53
  uts/utils.py
53
54
  uts/metrics/__init__.py
54
55
  uts/metrics/log_reader.py
56
+ uts/metrics/test_stream_laal.py
57
+ uts/metrics/test_tokenize_no_inplace.py
55
58
  uts/speech_processors/__init__.py
56
- uts/speech_processors/test_simuleval_wrapper.py
59
+ uts/speech_processors/test_simuleval_wrapper.py
60
+ uts/speech_processors/test_streamatt.py
@@ -1,6 +1,3 @@
1
- _build
2
- _static
3
- _templates
4
1
  config
5
2
  dist
6
3
  docs
@@ -0,0 +1,91 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import unittest
16
+ from argparse import Namespace
17
+
18
+ from simulstream.metrics.readers import OutputWithDelays, ReferenceSentenceDefinition
19
+ from simulstream.metrics.scorers.latency import LatencyScoringSample
20
+ from simulstream.metrics.scorers.latency.stream_laal import StreamLaal
21
+
22
+
23
+ class StreamLaalTestCase(unittest.TestCase):
24
+ def test_basic(self):
25
+ reference = [
26
+ ReferenceSentenceDefinition(
27
+ "A New York, sono a capo di un'associazione no profit, chiamata Robin Hood.",
28
+ 12.61,
29
+ 4.07,
30
+ ),
31
+ ReferenceSentenceDefinition(
32
+ "Quando non combatto la povertà, combatto gli incendi come assistente capitano di "
33
+ "una brigata di pompieri volontari.",
34
+ 16.9,
35
+ 5.14,
36
+ )
37
+ ]
38
+ hypothesis = OutputWithDelays(
39
+ "Tornando a New York, sono il capo dello sviluppo per un non-profit chiamato Robin "
40
+ "Hood. Quando non sto combattendo la povertà, sto combattendo i fuochi.",
41
+ [14.0, 14.0, 14.0, 14.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 18.0,
42
+ 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 20.0, 20.0, 20.0, 20.0],
43
+ [18.22, 18.22, 18.22, 18.22, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93,
44
+ 19.93, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 27.30, 27.30, 27.30,
45
+ 27.30,]
46
+ )
47
+ scorer = StreamLaal(Namespace(latency_unit="word"))
48
+ score = scorer.score([LatencyScoringSample("a", hypothesis, reference)])
49
+ self.assertAlmostEqual(score.ideal_latency, 0.868587, 4)
50
+ self.assertAlmostEqual(score.computational_aware_latency, 5.86, 4)
51
+
52
+ def test_with_characters(self):
53
+ reference = [
54
+ ReferenceSentenceDefinition(
55
+ "今天她看起很好,",
56
+ 12.61,
57
+ 3.07,
58
+ ),
59
+ ReferenceSentenceDefinition(
60
+ "我们一起去公园散步吧。",
61
+ 16.9,
62
+ 3.14,
63
+ ),
64
+ ReferenceSentenceDefinition(
65
+ "Amy",
66
+ 21.0,
67
+ 0.5,
68
+ ),
69
+ ReferenceSentenceDefinition(
70
+ "今天心情很好",
71
+ 21.5,
72
+ 2.0,
73
+ ),
74
+ ]
75
+ hypothesis = OutputWithDelays(
76
+ "今天她很漂亮,我们一起去花园跑步吧。Amy 今天心情很好",
77
+ [14.0, 14.0, 14.0, 15.0, 15.0, 16.0, 17.0,
78
+ 17.0, 17.0, 18.0, 18.0, 19.0, 19.0, 20.0, 20.0, 21.0, 21.0, 21.0,
79
+ 22.0, 22.0, 22.0, 22.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0],
80
+ [14.5, 14.5, 14.5, 15.2, 15.2, 16.8, 17.5,
81
+ 18.0, 18.5, 18.5, 18.5, 20.1, 20.1, 21.3, 21.3, 22.0, 22.0, 22.0,
82
+ 23.0, 23.0, 23.0, 23.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0],
83
+ )
84
+ scorer = StreamLaal(Namespace(latency_unit="char"))
85
+ score = scorer.score([LatencyScoringSample("a", hypothesis, reference)])
86
+ self.assertAlmostEqual(score.ideal_latency, 1.333312, 4)
87
+ self.assertAlmostEqual(score.computational_aware_latency, 2.074095, 4)
88
+
89
+
90
+ if __name__ == '__main__':
91
+ unittest.main()
@@ -0,0 +1,124 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import copy
16
+ import unittest
17
+ from argparse import Namespace
18
+
19
+ from simulstream.metrics.scorers.quality.mwersegmenter import (
20
+ MWERSegmenterBasedQualityScorer,
21
+ )
22
+ from simulstream.metrics.scorers.latency.mwersegmenter import (
23
+ MWERSegmenterBasedLatencyScorer,
24
+ )
25
+ from simulstream.metrics.scorers.latency import LatencyScores
26
+
27
+
28
+ class TokenizeNoInplaceModificationTestCase(unittest.TestCase):
29
+ """
30
+ Ensures that _tokenize does not alter the references.
31
+ See https://github.com/hlt-mt/simulstream/pull/20#issuecomment-3960951980
32
+ """
33
+
34
+ def _make_quality_scorer(self, latency_unit="char"):
35
+ """Create a concrete subclass of the abstract quality scorer."""
36
+ class _Scorer(MWERSegmenterBasedQualityScorer):
37
+ def _do_score(self, samples):
38
+ return 0.0
39
+
40
+ @classmethod
41
+ def add_arguments(cls, parser):
42
+ pass
43
+
44
+ def requires_source(self):
45
+ return False
46
+
47
+ args = Namespace(latency_unit=latency_unit)
48
+ return _Scorer(args)
49
+
50
+ def _make_latency_scorer(self, latency_unit="char"):
51
+ """Create a concrete subclass of the abstract latency scorer."""
52
+ class _Scorer(MWERSegmenterBasedLatencyScorer):
53
+ def _do_score(self, samples):
54
+ return LatencyScores(0.0, [])
55
+
56
+ @classmethod
57
+ def add_arguments(cls, parser):
58
+ pass
59
+
60
+ def requires_source(self):
61
+ return False
62
+
63
+ args = Namespace(latency_unit=latency_unit)
64
+ return _Scorer(args)
65
+
66
+ def test_quality_tokenize_does_not_modify_input(self):
67
+ scorer = self._make_quality_scorer(latency_unit="char")
68
+ text = ["你好世界", "这是测试"]
69
+ original = copy.deepcopy(text)
70
+ scorer._tokenize(text)
71
+ self.assertEqual(text, original)
72
+
73
+ def test_latency_tokenize_does_not_modify_input(self):
74
+ scorer = self._make_latency_scorer(latency_unit="char")
75
+ text = ["你好世界", "这是测试"]
76
+ original = copy.deepcopy(text)
77
+ scorer._tokenize(text)
78
+ self.assertEqual(text, original)
79
+
80
+ def test_quality_tokenize_no_modify_with_separator(self):
81
+ scorer = self._make_quality_scorer(latency_unit="char")
82
+ text = ["你好 ### 世界"]
83
+ original = copy.deepcopy(text)
84
+ scorer._tokenize(text)
85
+ self.assertEqual(text, original)
86
+
87
+ def test_quality_tokenize_no_modify_with_tab(self):
88
+ scorer = self._make_quality_scorer(latency_unit="char")
89
+ text = ["你好\t世界"]
90
+ original = copy.deepcopy(text)
91
+ scorer._tokenize(text)
92
+ self.assertEqual(text, original)
93
+
94
+ def test_quality_tokenize_does_not_modify_input_english(self):
95
+ scorer = self._make_quality_scorer(latency_unit="word")
96
+ text = ["hello world", "this is a test"]
97
+ original = copy.deepcopy(text)
98
+ scorer._tokenize(text)
99
+ self.assertEqual(text, original)
100
+
101
+ def test_latency_tokenize_does_not_modify_input_english(self):
102
+ scorer = self._make_latency_scorer(latency_unit="word")
103
+ text = ["hello world", "this is a test"]
104
+ original = copy.deepcopy(text)
105
+ scorer._tokenize(text)
106
+ self.assertEqual(text, original)
107
+
108
+ def test_quality_tokenize_no_modify_with_separator_english(self):
109
+ scorer = self._make_quality_scorer(latency_unit="word")
110
+ text = ["hello ### world"]
111
+ original = copy.deepcopy(text)
112
+ scorer._tokenize(text)
113
+ self.assertEqual(text, original)
114
+
115
+ def test_quality_tokenize_no_modify_with_tab_english(self):
116
+ scorer = self._make_quality_scorer(latency_unit="word")
117
+ text = ["hello\tworld"]
118
+ original = copy.deepcopy(text)
119
+ scorer._tokenize(text)
120
+ self.assertEqual(text, original)
121
+
122
+
123
+ if __name__ == '__main__':
124
+ unittest.main()
@@ -0,0 +1,64 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import unittest
16
+ from types import SimpleNamespace
17
+
18
+ from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory
19
+
20
+
21
class TestPunctuationTextHistory(unittest.TestCase):
    """Unit tests for PunctuationTextHistory.select_text_history.

    Each test exercises both a word-level (English) and a character-level
    (Chinese) token history.
    """

    def setUp(self):
        # The history selector only needs a config object; an empty
        # namespace is sufficient for these tests.
        self.config = SimpleNamespace()
        self.punctuation_text_history = PunctuationTextHistory(self.config)

    def test_punctuation_last(self):
        """History ending with strong punctuation keeps the final sentence."""
        # Word level: everything after the last inner strong punctuation is kept.
        selected = self.punctuation_text_history.select_text_history(
            ["Hi", "!", "I", "am", "Sara", "."])
        self.assertEqual(selected, ["I", "am", "Sara", "."])

        # Character level: a single sentence ending in '。' is kept whole.
        selected = self.punctuation_text_history.select_text_history(
            ['担', '任', '开', '发', '主', '管', '。'])
        self.assertEqual(selected, ['担', '任', '开', '发', '主', '管', '。'])

    def test_punctuation_in_between(self):
        """Strong punctuation between sentences drops everything before it."""
        # Word level: only the sentence after '!' survives.
        selected = self.punctuation_text_history.select_text_history(
            ["Hi", "!", "I", "am", "Sara"])
        self.assertEqual(selected, ["I", "am", "Sara"])

        # Character level: only the characters after '。' survive.
        selected = self.punctuation_text_history.select_text_history(
            ['开', '发', '主', '管', '。', '担', '任'])
        self.assertEqual(selected, ['担', '任'])

    def test_no_strong_punctuation(self):
        """Without strong punctuation the whole history is returned."""
        # Word level: a comma is not strong punctuation.
        selected = self.punctuation_text_history.select_text_history(
            ["Hi", ",", "I", "am", "Sara"])
        self.assertEqual(selected, ["Hi", ",", "I", "am", "Sara"])

        # Character level: a Chinese comma is not strong punctuation either.
        selected = self.punctuation_text_history.select_text_history(
            ['回', '到', '纽', '约', '后', ',', '我'])
        self.assertEqual(selected, ['回', '到', '纽', '约', '后', ',', '我'])
61
+
62
+
63
# Run this test module directly with the standard unittest runner.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,93 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import unittest
16
+ from unittest.mock import MagicMock
17
+ import numpy as np
18
+
19
+ from simulstream.inference import process_audio
20
+ from simulstream.server.message_processor import MessageProcessor
21
+ from simulstream.server.speech_processors import SAMPLE_RATE
22
+ from simulstream.server.speech_processors.incremental_output import IncrementalOutput
23
+
24
+
25
def make_speech_processor(chunk_size_seconds=1.0):
    """Builds a mock SpeechProcessor exposing only the members process_audio uses.

    Every processing call returns an empty IncrementalOutput so the tests can
    focus purely on chunking and buffering behavior.
    """
    empty_output = IncrementalOutput(
        new_tokens=[], deleted_tokens=0, new_string="", deleted_string="")
    mock_processor = MagicMock(
        spec=["speech_chunk_size", "process_chunk", "end_of_stream", "clear", "tokens_to_string"])
    mock_processor.speech_chunk_size = chunk_size_seconds
    mock_processor.process_chunk.return_value = empty_output
    mock_processor.end_of_stream.return_value = empty_output
    mock_processor.tokens_to_string.return_value = ""
    return mock_processor
36
+
37
+
38
def make_message_processor(chunk_size_seconds=1.0):
    """Wraps a mock speech processor in a MessageProcessor for client 0."""
    return MessageProcessor(
        client_id=0,
        speech_processor=make_speech_processor(chunk_size_seconds))
41
+
42
+
43
class TestProcessAudio(unittest.TestCase):
    """Tests for process_audio chunking and remainder buffering."""

    def test_exact_multiple(self):
        """Audio that is an exact multiple of the chunk size leaves no buffer."""
        message_processor = make_message_processor(1.0)
        # Two full 1-second chunks, no remainder.
        samples = np.zeros(SAMPLE_RATE * 2, dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        self.assertEqual(message_processor.speech_processor.process_chunk.call_count, 2)
        self.assertEqual(message_processor.client_buffer, b'')

    def test_remainder_chunk_not_sent_twice(self):
        """A partial trailing chunk is buffered rather than processed."""
        message_processor = make_message_processor(1.0)
        # Two full chunks plus a 0.5 s remainder.
        samples = np.zeros(int(SAMPLE_RATE * 2.5), dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        # Only full chunks go through process_chunk; the remainder stays
        # buffered for end_of_stream.
        self.assertEqual(message_processor.speech_processor.process_chunk.call_count, 2)
        # int16 samples occupy 2 bytes each, so buffer bytes = samples * 2.
        self.assertEqual(len(message_processor.client_buffer), int(SAMPLE_RATE * 0.5) * 2)

    def test_single_chunk(self):
        """Audio shorter than one chunk is buffered and not processed."""
        message_processor = make_message_processor(1.0)
        # Half a second of audio: less than one full chunk.
        samples = np.zeros(SAMPLE_RATE // 2, dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        message_processor.speech_processor.process_chunk.assert_not_called()
        # int16 samples occupy 2 bytes each, so buffer bytes = samples * 2.
        self.assertEqual(len(message_processor.client_buffer), int(SAMPLE_RATE * 0.5) * 2)

    def test_empty_data(self):
        """Empty audio neither triggers processing nor fills the buffer."""
        message_processor = make_message_processor()
        samples = np.array([], dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        message_processor.speech_processor.process_chunk.assert_not_called()
        self.assertEqual(message_processor.client_buffer, b'')
+
91
+
92
# Run this test module directly with the standard unittest runner.
if __name__ == "__main__":
    unittest.main()
@@ -1 +0,0 @@
1
- 0.2.0
File without changes
File without changes
File without changes
File without changes
File without changes