simulstream 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {simulstream-0.2.0/simulstream.egg-info → simulstream-0.3.0}/PKG-INFO +19 -5
  2. {simulstream-0.2.0 → simulstream-0.3.0}/README.md +18 -4
  3. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/inference.py +1 -5
  4. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/score_quality.py +21 -2
  5. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/mwersegmenter.py +36 -3
  6. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/comet.py +1 -5
  7. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/mwersegmenter.py +41 -2
  8. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base_streamatt.py +12 -2
  9. simulstream-0.3.0/simulstream/version.txt +1 -0
  10. {simulstream-0.2.0 → simulstream-0.3.0/simulstream.egg-info}/PKG-INFO +19 -5
  11. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/SOURCES.txt +5 -1
  12. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/top_level.txt +0 -3
  13. simulstream-0.3.0/uts/metrics/test_stream_laal.py +91 -0
  14. simulstream-0.3.0/uts/metrics/test_tokenize_no_inplace.py +124 -0
  15. simulstream-0.3.0/uts/speech_processors/test_streamatt.py +64 -0
  16. simulstream-0.3.0/uts/test_inference.py +93 -0
  17. simulstream-0.2.0/simulstream/version.txt +0 -1
  18. {simulstream-0.2.0 → simulstream-0.3.0}/LICENSE +0 -0
  19. {simulstream-0.2.0 → simulstream-0.3.0}/docs/source/conf.py +0 -0
  20. {simulstream-0.2.0 → simulstream-0.3.0}/pyproject.toml +0 -0
  21. {simulstream-0.2.0 → simulstream-0.3.0}/setup.cfg +0 -0
  22. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/__init__.py +0 -0
  23. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/client/__init__.py +0 -0
  24. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/client/wav_reader_client.py +0 -0
  25. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/config.py +0 -0
  26. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/__init__.py +0 -0
  27. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/detokenizers.py +0 -0
  28. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/logger.py +0 -0
  29. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/readers.py +0 -0
  30. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/score_latency.py +0 -0
  31. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/__init__.py +0 -0
  32. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/__init__.py +0 -0
  33. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/stream_laal.py +0 -0
  34. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/__init__.py +0 -0
  35. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/sacrebleu.py +0 -0
  36. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/metrics/stats.py +0 -0
  37. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/__init__.py +0 -0
  38. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/http_server.py +0 -0
  39. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/message_processor.py +0 -0
  40. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/__init__.py +0 -0
  41. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base.py +0 -0
  42. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/canary_sliding_window_retranslation.py +0 -0
  43. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/hf_sliding_window_retranslation.py +0 -0
  44. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/incremental_output.py +0 -0
  45. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/remote/__init__.py +0 -0
  46. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/remote/http_proxy_speech_processor.py +0 -0
  47. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/remote/http_speech_processor_server.py +0 -0
  48. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/seamless_sliding_window_retranslation.py +0 -0
  49. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/seamless_streamatt.py +0 -0
  50. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/simuleval_wrapper.py +0 -0
  51. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/sliding_window_retranslation.py +0 -0
  52. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/speech_processors/vad_wrapper.py +0 -0
  53. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream/server/websocket_server.py +0 -0
  54. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/dependency_links.txt +0 -0
  55. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/entry_points.txt +0 -0
  56. {simulstream-0.2.0 → simulstream-0.3.0}/simulstream.egg-info/requires.txt +0 -0
  57. {simulstream-0.2.0 → simulstream-0.3.0}/uts/__init__.py +0 -0
  58. {simulstream-0.2.0 → simulstream-0.3.0}/uts/metrics/__init__.py +0 -0
  59. {simulstream-0.2.0 → simulstream-0.3.0}/uts/metrics/log_reader.py +0 -0
  60. {simulstream-0.2.0 → simulstream-0.3.0}/uts/speech_processors/__init__.py +0 -0
  61. {simulstream-0.2.0 → simulstream-0.3.0}/uts/speech_processors/test_simuleval_wrapper.py +0 -0
  62. {simulstream-0.2.0 → simulstream-0.3.0}/uts/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simulstream
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: A server to run simultaneous/streaming experiments and demo
5
5
  Author-email: Marco Gaido <mgaido@fbk.eu>, FBK HLT-MT <mt@fbk.eu>
6
6
  License: Apache License
@@ -414,14 +414,15 @@ can score your speech processor by running:
414
414
  simulstream_score_latency --scorer stream_laal \
415
415
  --eval-config config/speech_processor.yaml \
416
416
  --log-file metrics.jsonl \
417
- --reference REFERENCE_FILE.txt \
417
+ --reference REFERENCES_FILE.tgt \
418
418
  --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
419
419
 
420
420
  simulstream_score_quality --scorer comet \
421
421
  --eval-config config/speech_processor.yaml \
422
422
  --log-file metrics.jsonl \
423
- --references REFERENCES_FILE.txt \
424
- --transcripts TRANSCRIPTS_FILE.txt
423
+ --references REFERENCES_FILE.tgt \
424
+ --transcripts TRANSCRIPTS_FILE.src \
425
+ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
425
426
 
426
427
  simulstream_stats --eval-config config/speech_processor.yaml \
427
428
  --log-file metrics.jsonl
@@ -435,7 +436,20 @@ the selected metric (``--scorer``).
435
436
 
436
437
  Similarly, ``simulstream_score_quality`` evaluates the quality
437
438
  of the generated outputs against one (or more) reference (and transcript, only for metrics
438
- requiring them) file(s).
439
+ requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
440
+ in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
441
+
442
+ As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
443
+ files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
444
+ **must be the same** as the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
445
+
446
+ ```
447
+ simulstream_score_quality --scorer comet \
448
+ --eval-config config/speech_processor.yaml \
449
+ --log-file metrics.jsonl \
450
+ --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
451
+ --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
452
+ ```
439
453
 
440
454
  Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
441
455
 
@@ -177,14 +177,15 @@ can score your speech processor by running:
177
177
  simulstream_score_latency --scorer stream_laal \
178
178
  --eval-config config/speech_processor.yaml \
179
179
  --log-file metrics.jsonl \
180
- --reference REFERENCE_FILE.txt \
180
+ --reference REFERENCES_FILE.tgt \
181
181
  --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
182
182
 
183
183
  simulstream_score_quality --scorer comet \
184
184
  --eval-config config/speech_processor.yaml \
185
185
  --log-file metrics.jsonl \
186
- --references REFERENCES_FILE.txt \
187
- --transcripts TRANSCRIPTS_FILE.txt
186
+ --references REFERENCES_FILE.tgt \
187
+ --transcripts TRANSCRIPTS_FILE.src \
188
+ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
188
189
 
189
190
  simulstream_stats --eval-config config/speech_processor.yaml \
190
191
  --log-file metrics.jsonl
@@ -198,7 +199,20 @@ the selected metric (``--scorer``).
198
199
 
199
200
  Similarly, ``simulstream_score_quality`` evaluates the quality
200
201
  of the generated outputs against one (or more) reference (and transcript, only for metrics
201
- requiring them) file(s).
202
+ requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
203
+ in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
204
+
205
+ As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
206
+ files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
207
+ **must be the same** as the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
208
+
209
+ ```
210
+ simulstream_score_quality --scorer comet \
211
+ --eval-config config/speech_processor.yaml \
212
+ --log-file metrics.jsonl \
213
+ --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
214
+ --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
215
+ ```
202
216
 
203
217
  Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
204
218
 
@@ -53,14 +53,10 @@ def process_audio(
53
53
  # one speech chunk is the following
54
54
  samples_per_chunk = int(
55
55
  sample_rate * message_processor.speech_processor.speech_chunk_size)
56
- i = 0
56
+
57
57
  for i in range(0, len(data), samples_per_chunk):
58
58
  output = message_processor.process_speech(data[i:i + samples_per_chunk].tobytes())
59
59
  LOGGER.debug(f"response: {output}")
60
- # send last part of the audio
61
- if i < len(data):
62
- output = message_processor.process_speech(data[i:].tobytes())
63
- LOGGER.debug(f"response: {output}")
64
60
 
65
61
 
66
62
  def run_inference(
@@ -124,6 +124,19 @@ def cli_main():
124
124
  --log-file metrics.jsonl \\
125
125
  --references ref.en \\
126
126
  --transcripts src.it \\
127
+ --audio-definition audio_def.yaml \\
128
+ --scorer sacrebleu
129
+
130
+ Otherwise, the script can be invoked without specifying the `--audio-definition`,
131
+ but in this case the name of the reference and transcript files (trimmed of
132
+ the extension) must be the same as the audio files used (i.e. the names present
133
+ in `metrics.jsonl`), e.g.:
134
+
135
+ $ python -m simulstream.metrics.score_quality \\
136
+ --eval-config config/speech-processor.yaml \\
137
+ --log-file metrics.jsonl \\
138
+ --references 1.en,2.en \\
139
+ --transcripts 1.it,2.it \\
127
140
  --scorer sacrebleu
128
141
  """
129
142
  LOGGER.info(f"Simulstream version: {simulstream.__version__}")
@@ -140,17 +153,23 @@ def cli_main():
140
153
  "specified, this should be a single file containing all the lines of the audios in "
141
154
  "the reference, which should be of the same length of the audio definition. "
142
155
  "Otherwise, this should be a list of files, where each contains the lines "
143
- "corresponding to an audio file.")
156
+ "corresponding to an audio file. In the case of being a list of files, the file "
157
+ "stem must match a corresponding transcript for an audio file (if applicable "
158
+ "to the quality metric).")
144
159
  parser.add_argument(
145
160
  "--transcripts", nargs="+", type=str,
146
161
  help="Path to the textual files containing reference transcripts. If `--audio-definition` "
147
162
  "is specified, this should be a single file containing all the lines of the audios "
148
163
  "in the reference, which should be of the same length of the audio definition. "
149
164
  "Otherwise, this should be a list of files, where each contains the lines "
150
- "corresponding to an audio file.")
165
+ "corresponding to an audio file. In the case of being a list of files, the file "
166
+ "stem must match a corresponding reference for an audio file.")
151
167
  parser.add_argument(
152
168
  "--audio-definition", "-a", type=str, default=None,
153
169
  help="Path to the yaml file containing the segment-level audio information.")
170
+ parser.add_argument(
171
+ "--latency-unit", choices=["char", "word"], default="word",
172
+ help="Whether to compute stats based on words or characters. Default: word.")
154
173
  parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
155
174
  args, _ = parser.parse_known_args()
156
175
 
@@ -17,6 +17,7 @@ from dataclasses import dataclass
17
17
  from typing import List
18
18
 
19
19
  from mweralign import mweralign
20
+ from mweralign.segmenter import CJSegmenter
20
21
 
21
22
  from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
22
23
  from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
@@ -58,6 +59,7 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
58
59
  def __init__(self, args):
59
60
  super().__init__(args)
60
61
  self.latency_unit = args.latency_unit
62
+ self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
61
63
 
62
64
  def requires_reference(self) -> bool:
63
65
  return True
@@ -101,19 +103,50 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
101
103
  f"Index {index} should have reached end of delays ({len(delays)})"
102
104
  return segmented_delays
103
105
 
106
+ def _tokenize(self, text: List[str]) -> List[str]:
107
+ """
108
+ Tokenize text using the segmenter.
109
+
110
+ Borrowed from
111
+ https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
112
+ """
113
+ if self.segmenter is not None:
114
+ tokenized_text = []
115
+ for i in range(len(text)):
116
+ if " ### " in text[i]:
117
+ pieces = text[i].strip().split(" ### ")
118
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
119
+ tokenized_text.append(" ### ".join(encoded))
120
+ elif "\t" in text[i]:
121
+ pieces = text[i].strip().split("\t")
122
+ # underlying C++ binary still uses ###
123
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
124
+ tokenized_text.append(" ### ".join(encoded))
125
+ else:
126
+ tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
127
+ return "\n".join(tokenized_text)
128
+ else:
129
+ return "\n".join(text)
130
+
104
131
  def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
105
132
  resegmented_samples = []
106
133
  for sample in samples:
107
134
  assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
108
135
 
109
- resegmented_hypos = mweralign.align_texts(
110
- "\n".join([sentence_def.content for sentence_def in sample.reference]),
111
- sample.hypothesis.final_text).split("\n")
136
+ hypo = self._tokenize([sample.hypothesis.final_text])
137
+ refs = self._tokenize(
138
+ [sentence_def.content for sentence_def in sample.reference])
139
+ resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
112
140
 
113
141
  assert len(resegmented_hypos) == len(sample.reference), \
114
142
  f"Reference ({sample.audio_name}) has mismatched number of target " \
115
143
  f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
116
144
 
145
+ if self.segmenter is not None:
146
+ # segmenter.decode will strip() the spaces, but we need them to align with delays
147
+ resegmented_hypos = [
148
+ hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
149
+
117
150
  ideal_delays_splits = self._split_delays_by_segmented_text(
118
151
  sample.hypothesis.ideal_delays,
119
152
  resegmented_hypos)
@@ -13,17 +13,13 @@
13
13
  # limitations under the License
14
14
 
15
15
  import argparse
16
- import sys
17
16
  from typing import List
18
17
 
19
18
  from simulstream.metrics.scorers.quality import register_quality_scorer
20
19
  from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
21
20
  ResegmentedQualityScoringSample
22
21
 
23
- try:
24
- from comet import download_model, load_from_checkpoint
25
- except ImportError:
26
- sys.exit("Please install comet first with `pip install unbabel-comet`.")
22
+ from comet import download_model, load_from_checkpoint
27
23
 
28
24
 
29
25
  @register_quality_scorer("comet")
@@ -17,6 +17,7 @@ from dataclasses import dataclass
17
17
  from typing import List, Optional
18
18
 
19
19
  from mweralign import mweralign
20
+ from mweralign.segmenter import CJSegmenter
20
21
 
21
22
  from simulstream.metrics.scorers.quality import QualityScorer, QualityScoringSample
22
23
 
@@ -56,6 +57,11 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
56
57
  ... # Compute a custom quality score
57
58
  ... return ...
58
59
  """
60
+
61
+ def __init__(self, args):
62
+ super().__init__(args)
63
+ self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
64
+
59
65
  def requires_reference(self) -> bool:
60
66
  return True
61
67
 
@@ -75,15 +81,48 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
75
81
  """
76
82
  ...
77
83
 
84
+ def _tokenize(self, text: List[str]) -> List[str]:
85
+ """
86
+ Tokenize text using the segmenter.
87
+
88
+ Borrowed from
89
+ https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
90
+ """
91
+ if self.segmenter is not None:
92
+ tokenized_text = []
93
+ for i in range(len(text)):
94
+ if " ### " in text[i]:
95
+ pieces = text[i].strip().split(" ### ")
96
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
97
+ tokenized_text.append(" ### ".join(encoded))
98
+ elif "\t" in text[i]:
99
+ pieces = text[i].strip().split("\t")
100
+ # underlying C++ binary still uses ###
101
+ encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
102
+ tokenized_text.append(" ### ".join(encoded))
103
+ else:
104
+ tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
105
+ return "\n".join(tokenized_text)
106
+ else:
107
+ return "\n".join(text)
108
+
78
109
  def score(self, samples: List[QualityScoringSample]) -> float:
79
110
  resegmented_samples = []
80
111
  for sample in samples:
81
112
  assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
82
- resegmented_hypos = mweralign.align_texts(
83
- "\n".join(sample.reference), sample.hypothesis).split("\n")
113
+ hypo = self._tokenize([sample.hypothesis])
114
+ refs = self._tokenize(sample.reference)
115
+ resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
116
+
84
117
  assert len(sample.reference) == len(resegmented_hypos), \
85
118
  f"Reference ({sample.audio_name}) has mismatched number of target " \
86
119
  f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
120
+
121
+ if self.segmenter is not None:
122
+ # segmenter.decode will strip() the spaces, but we need them to align with delays
123
+ resegmented_hypos = [
124
+ hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
125
+
87
126
  resegmented_samples.append(ResegmentedQualityScoringSample(
88
127
  sample.audio_name,
89
128
  resegmented_hypos,
@@ -151,6 +151,12 @@ class BaseStreamAtt(BaseSpeechProcessor):
151
151
  self._cut_audio_exceeding_maxlen()
152
152
  return
153
153
 
154
+ assert len(self.text_history) > 0, \
155
+ "If text history is empty after selection, audio cannot be aligned. " \
156
+ "If you see this message, it indicates a bug, so please open an issue at " \
157
+ "https://github.com/hlt-mt/simulstream/issues and include the steps that " \
158
+ "led to this state."
159
+
154
160
  # Trim the cross-attention by excluding the discarded new generated tokens and the
155
161
  # discarded textual history. Output shape: (text_history_len, n_audio_features)
156
162
  cross_attn = cross_attn[discarded_text:discarded_text + len(self.text_history), :]
@@ -299,13 +305,15 @@ class PunctuationTextHistory:
299
305
  The current implementation supports only SentencePiece.
300
306
  """
301
307
 
302
- STRONG_PUNCTUATION = [".", "!", "?", ":", ";"]
308
+ STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"]
303
309
 
304
310
  def __init__(self, config: SimpleNamespace):
305
311
  self.config = config
306
312
 
307
313
  def select_text_history(self, text_history):
308
314
  new_history = []
315
+ seen_punctuation = False
316
+
309
317
  for token in reversed(text_history):
310
318
  prefix_token = token
311
319
  contains_punctuation = False
@@ -314,7 +322,9 @@ class PunctuationTextHistory:
314
322
  contains_punctuation = True
315
323
  break
316
324
  if contains_punctuation:
317
- break
325
+ if seen_punctuation:
326
+ break
327
+ seen_punctuation = True
318
328
  new_history.append(token)
319
329
  # Reverse the list
320
330
  return new_history[::-1]
@@ -0,0 +1 @@
1
+ 0.3.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: simulstream
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: A server to run simultaneous/streaming experiments and demo
5
5
  Author-email: Marco Gaido <mgaido@fbk.eu>, FBK HLT-MT <mt@fbk.eu>
6
6
  License: Apache License
@@ -414,14 +414,15 @@ can score your speech processor by running:
414
414
  simulstream_score_latency --scorer stream_laal \
415
415
  --eval-config config/speech_processor.yaml \
416
416
  --log-file metrics.jsonl \
417
- --reference REFERENCE_FILE.txt \
417
+ --reference REFERENCES_FILE.tgt \
418
418
  --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
419
419
 
420
420
  simulstream_score_quality --scorer comet \
421
421
  --eval-config config/speech_processor.yaml \
422
422
  --log-file metrics.jsonl \
423
- --references REFERENCES_FILE.txt \
424
- --transcripts TRANSCRIPTS_FILE.txt
423
+ --references REFERENCES_FILE.tgt \
424
+ --transcripts TRANSCRIPTS_FILE.src \
425
+ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
425
426
 
426
427
  simulstream_stats --eval-config config/speech_processor.yaml \
427
428
  --log-file metrics.jsonl
@@ -435,7 +436,20 @@ the selected metric (``--scorer``).
435
436
 
436
437
  Similarly, ``simulstream_score_quality`` evaluates the quality
437
438
  of the generated outputs against one (or more) reference (and transcript, only for metrics
438
- requiring them) file(s).
439
+ requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
440
+ in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
441
+
442
+ As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of
443
+ files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension)
444
+ **must be the same** as the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
445
+
446
+ ```
447
+ simulstream_score_quality --scorer comet \
448
+ --eval-config config/speech_processor.yaml \
449
+ --log-file metrics.jsonl \
450
+ --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
451
+ --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
452
+ ```
439
453
 
440
454
  Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
441
455
 
@@ -49,8 +49,12 @@ simulstream/server/speech_processors/remote/__init__.py
49
49
  simulstream/server/speech_processors/remote/http_proxy_speech_processor.py
50
50
  simulstream/server/speech_processors/remote/http_speech_processor_server.py
51
51
  uts/__init__.py
52
+ uts/test_inference.py
52
53
  uts/utils.py
53
54
  uts/metrics/__init__.py
54
55
  uts/metrics/log_reader.py
56
+ uts/metrics/test_stream_laal.py
57
+ uts/metrics/test_tokenize_no_inplace.py
55
58
  uts/speech_processors/__init__.py
56
- uts/speech_processors/test_simuleval_wrapper.py
59
+ uts/speech_processors/test_simuleval_wrapper.py
60
+ uts/speech_processors/test_streamatt.py
@@ -1,6 +1,3 @@
1
- _build
2
- _static
3
- _templates
4
1
  config
5
2
  dist
6
3
  docs
@@ -0,0 +1,91 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import unittest
16
+ from argparse import Namespace
17
+
18
+ from simulstream.metrics.readers import OutputWithDelays, ReferenceSentenceDefinition
19
+ from simulstream.metrics.scorers.latency import LatencyScoringSample
20
+ from simulstream.metrics.scorers.latency.stream_laal import StreamLaal
21
+
22
+
23
+ class StreamLaalTestCase(unittest.TestCase):
24
+ def test_basic(self):
25
+ reference = [
26
+ ReferenceSentenceDefinition(
27
+ "A New York, sono a capo di un'associazione no profit, chiamata Robin Hood.",
28
+ 12.61,
29
+ 4.07,
30
+ ),
31
+ ReferenceSentenceDefinition(
32
+ "Quando non combatto la povertà, combatto gli incendi come assistente capitano di "
33
+ "una brigata di pompieri volontari.",
34
+ 16.9,
35
+ 5.14,
36
+ )
37
+ ]
38
+ hypothesis = OutputWithDelays(
39
+ "Tornando a New York, sono il capo dello sviluppo per un non-profit chiamato Robin "
40
+ "Hood. Quando non sto combattendo la povertà, sto combattendo i fuochi.",
41
+ [14.0, 14.0, 14.0, 14.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 18.0,
42
+ 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 20.0, 20.0, 20.0, 20.0],
43
+ [18.22, 18.22, 18.22, 18.22, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93, 19.93,
44
+ 19.93, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 23.01, 27.30, 27.30, 27.30,
45
+ 27.30,]
46
+ )
47
+ scorer = StreamLaal(Namespace(latency_unit="word"))
48
+ score = scorer.score([LatencyScoringSample("a", hypothesis, reference)])
49
+ self.assertAlmostEqual(score.ideal_latency, 0.868587, 4)
50
+ self.assertAlmostEqual(score.computational_aware_latency, 5.86, 4)
51
+
52
+ def test_with_characters(self):
53
+ reference = [
54
+ ReferenceSentenceDefinition(
55
+ "今天她看起很好,",
56
+ 12.61,
57
+ 3.07,
58
+ ),
59
+ ReferenceSentenceDefinition(
60
+ "我们一起去公园散步吧。",
61
+ 16.9,
62
+ 3.14,
63
+ ),
64
+ ReferenceSentenceDefinition(
65
+ "Amy",
66
+ 21.0,
67
+ 0.5,
68
+ ),
69
+ ReferenceSentenceDefinition(
70
+ "今天心情很好",
71
+ 21.5,
72
+ 2.0,
73
+ ),
74
+ ]
75
+ hypothesis = OutputWithDelays(
76
+ "今天她很漂亮,我们一起去花园跑步吧。Amy 今天心情很好",
77
+ [14.0, 14.0, 14.0, 15.0, 15.0, 16.0, 17.0,
78
+ 17.0, 17.0, 18.0, 18.0, 19.0, 19.0, 20.0, 20.0, 21.0, 21.0, 21.0,
79
+ 22.0, 22.0, 22.0, 22.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0],
80
+ [14.5, 14.5, 14.5, 15.2, 15.2, 16.8, 17.5,
81
+ 18.0, 18.5, 18.5, 18.5, 20.1, 20.1, 21.3, 21.3, 22.0, 22.0, 22.0,
82
+ 23.0, 23.0, 23.0, 23.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0],
83
+ )
84
+ scorer = StreamLaal(Namespace(latency_unit="char"))
85
+ score = scorer.score([LatencyScoringSample("a", hypothesis, reference)])
86
+ self.assertAlmostEqual(score.ideal_latency, 1.333312, 4)
87
+ self.assertAlmostEqual(score.computational_aware_latency, 2.074095, 4)
88
+
89
+
90
+ if __name__ == '__main__':
91
+ unittest.main()
@@ -0,0 +1,124 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import copy
16
+ import unittest
17
+ from argparse import Namespace
18
+
19
+ from simulstream.metrics.scorers.quality.mwersegmenter import (
20
+ MWERSegmenterBasedQualityScorer,
21
+ )
22
+ from simulstream.metrics.scorers.latency.mwersegmenter import (
23
+ MWERSegmenterBasedLatencyScorer,
24
+ )
25
+ from simulstream.metrics.scorers.latency import LatencyScores
26
+
27
+
28
+ class TokenizeNoInplaceModificationTestCase(unittest.TestCase):
29
+ """
30
+ Ensures that _tokenize does not alter the references.
31
+ See https://github.com/hlt-mt/simulstream/pull/20#issuecomment-3960951980
32
+ """
33
+
34
+ def _make_quality_scorer(self, latency_unit="char"):
35
+ """Create a concrete subclass of the abstract quality scorer."""
36
+ class _Scorer(MWERSegmenterBasedQualityScorer):
37
+ def _do_score(self, samples):
38
+ return 0.0
39
+
40
+ @classmethod
41
+ def add_arguments(cls, parser):
42
+ pass
43
+
44
+ def requires_source(self):
45
+ return False
46
+
47
+ args = Namespace(latency_unit=latency_unit)
48
+ return _Scorer(args)
49
+
50
+ def _make_latency_scorer(self, latency_unit="char"):
51
+ """Create a concrete subclass of the abstract latency scorer."""
52
+ class _Scorer(MWERSegmenterBasedLatencyScorer):
53
+ def _do_score(self, samples):
54
+ return LatencyScores(0.0, [])
55
+
56
+ @classmethod
57
+ def add_arguments(cls, parser):
58
+ pass
59
+
60
+ def requires_source(self):
61
+ return False
62
+
63
+ args = Namespace(latency_unit=latency_unit)
64
+ return _Scorer(args)
65
+
66
+ def test_quality_tokenize_does_not_modify_input(self):
67
+ scorer = self._make_quality_scorer(latency_unit="char")
68
+ text = ["你好世界", "这是测试"]
69
+ original = copy.deepcopy(text)
70
+ scorer._tokenize(text)
71
+ self.assertEqual(text, original)
72
+
73
+ def test_latency_tokenize_does_not_modify_input(self):
74
+ scorer = self._make_latency_scorer(latency_unit="char")
75
+ text = ["你好世界", "这是测试"]
76
+ original = copy.deepcopy(text)
77
+ scorer._tokenize(text)
78
+ self.assertEqual(text, original)
79
+
80
+ def test_quality_tokenize_no_modify_with_separator(self):
81
+ scorer = self._make_quality_scorer(latency_unit="char")
82
+ text = ["你好 ### 世界"]
83
+ original = copy.deepcopy(text)
84
+ scorer._tokenize(text)
85
+ self.assertEqual(text, original)
86
+
87
+ def test_quality_tokenize_no_modify_with_tab(self):
88
+ scorer = self._make_quality_scorer(latency_unit="char")
89
+ text = ["你好\t世界"]
90
+ original = copy.deepcopy(text)
91
+ scorer._tokenize(text)
92
+ self.assertEqual(text, original)
93
+
94
+ def test_quality_tokenize_does_not_modify_input_english(self):
95
+ scorer = self._make_quality_scorer(latency_unit="word")
96
+ text = ["hello world", "this is a test"]
97
+ original = copy.deepcopy(text)
98
+ scorer._tokenize(text)
99
+ self.assertEqual(text, original)
100
+
101
+ def test_latency_tokenize_does_not_modify_input_english(self):
102
+ scorer = self._make_latency_scorer(latency_unit="word")
103
+ text = ["hello world", "this is a test"]
104
+ original = copy.deepcopy(text)
105
+ scorer._tokenize(text)
106
+ self.assertEqual(text, original)
107
+
108
+ def test_quality_tokenize_no_modify_with_separator_english(self):
109
+ scorer = self._make_quality_scorer(latency_unit="word")
110
+ text = ["hello ### world"]
111
+ original = copy.deepcopy(text)
112
+ scorer._tokenize(text)
113
+ self.assertEqual(text, original)
114
+
115
+ def test_quality_tokenize_no_modify_with_tab_english(self):
116
+ scorer = self._make_quality_scorer(latency_unit="word")
117
+ text = ["hello\tworld"]
118
+ original = copy.deepcopy(text)
119
+ scorer._tokenize(text)
120
+ self.assertEqual(text, original)
121
+
122
+
123
+ if __name__ == '__main__':
124
+ unittest.main()
@@ -0,0 +1,64 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import unittest
16
+ from types import SimpleNamespace
17
+
18
+ from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory
19
+
20
+
21
class TestPunctuationTextHistory(unittest.TestCase):
    """Unit tests for PunctuationTextHistory.select_text_history.

    Each test exercises both a word-level (English) and a character-level
    (Chinese) token history.
    """

    def setUp(self):
        # The history selector only needs a config object; an empty
        # namespace is sufficient for these tests.
        self.config = SimpleNamespace()
        self.punctuation_text_history = PunctuationTextHistory(self.config)

    def test_punctuation_last(self):
        """History ending with strong punctuation keeps the final sentence."""
        # Word level: everything after the last inner strong punctuation is kept.
        selected = self.punctuation_text_history.select_text_history(
            ["Hi", "!", "I", "am", "Sara", "."])
        self.assertEqual(selected, ["I", "am", "Sara", "."])

        # Character level: a single sentence ending in '。' is kept whole.
        selected = self.punctuation_text_history.select_text_history(
            ['担', '任', '开', '发', '主', '管', '。'])
        self.assertEqual(selected, ['担', '任', '开', '发', '主', '管', '。'])

    def test_punctuation_in_between(self):
        """Strong punctuation between sentences drops everything before it."""
        # Word level: only the sentence after '!' survives.
        selected = self.punctuation_text_history.select_text_history(
            ["Hi", "!", "I", "am", "Sara"])
        self.assertEqual(selected, ["I", "am", "Sara"])

        # Character level: only the characters after '。' survive.
        selected = self.punctuation_text_history.select_text_history(
            ['开', '发', '主', '管', '。', '担', '任'])
        self.assertEqual(selected, ['担', '任'])

    def test_no_strong_punctuation(self):
        """Without strong punctuation the whole history is returned."""
        # Word level: a comma is not strong punctuation.
        selected = self.punctuation_text_history.select_text_history(
            ["Hi", ",", "I", "am", "Sara"])
        self.assertEqual(selected, ["Hi", ",", "I", "am", "Sara"])

        # Character level: a Chinese comma is not strong punctuation either.
        selected = self.punctuation_text_history.select_text_history(
            ['回', '到', '纽', '约', '后', ',', '我'])
        self.assertEqual(selected, ['回', '到', '纽', '约', '后', ',', '我'])
61
+
62
+
63
# Run this test module directly with the standard unittest runner.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,93 @@
1
+ # Copyright 2026 FBK
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License
14
+
15
+ import unittest
16
+ from unittest.mock import MagicMock
17
+ import numpy as np
18
+
19
+ from simulstream.inference import process_audio
20
+ from simulstream.server.message_processor import MessageProcessor
21
+ from simulstream.server.speech_processors import SAMPLE_RATE
22
+ from simulstream.server.speech_processors.incremental_output import IncrementalOutput
23
+
24
+
25
def make_speech_processor(chunk_size_seconds=1.0):
    """Builds a mock SpeechProcessor exposing only the members process_audio uses.

    Every processing call returns an empty IncrementalOutput so the tests can
    focus purely on chunking and buffering behavior.
    """
    empty_output = IncrementalOutput(
        new_tokens=[], deleted_tokens=0, new_string="", deleted_string="")
    mock_processor = MagicMock(
        spec=["speech_chunk_size", "process_chunk", "end_of_stream", "clear", "tokens_to_string"])
    mock_processor.speech_chunk_size = chunk_size_seconds
    mock_processor.process_chunk.return_value = empty_output
    mock_processor.end_of_stream.return_value = empty_output
    mock_processor.tokens_to_string.return_value = ""
    return mock_processor
36
+
37
+
38
def make_message_processor(chunk_size_seconds=1.0):
    """Wraps a mock speech processor in a MessageProcessor for client 0."""
    return MessageProcessor(
        client_id=0,
        speech_processor=make_speech_processor(chunk_size_seconds))
41
+
42
+
43
class TestProcessAudio(unittest.TestCase):
    """Tests for process_audio chunking and remainder buffering."""

    def test_exact_multiple(self):
        """Audio that is an exact multiple of the chunk size leaves no buffer."""
        message_processor = make_message_processor(1.0)
        # Two full 1-second chunks, no remainder.
        samples = np.zeros(SAMPLE_RATE * 2, dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        self.assertEqual(message_processor.speech_processor.process_chunk.call_count, 2)
        self.assertEqual(message_processor.client_buffer, b'')

    def test_remainder_chunk_not_sent_twice(self):
        """A partial trailing chunk is buffered rather than processed."""
        message_processor = make_message_processor(1.0)
        # Two full chunks plus a 0.5 s remainder.
        samples = np.zeros(int(SAMPLE_RATE * 2.5), dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        # Only full chunks go through process_chunk; the remainder stays
        # buffered for end_of_stream.
        self.assertEqual(message_processor.speech_processor.process_chunk.call_count, 2)
        # int16 samples occupy 2 bytes each, so buffer bytes = samples * 2.
        self.assertEqual(len(message_processor.client_buffer), int(SAMPLE_RATE * 0.5) * 2)

    def test_single_chunk(self):
        """Audio shorter than one chunk is buffered and not processed."""
        message_processor = make_message_processor(1.0)
        # Half a second of audio: less than one full chunk.
        samples = np.zeros(SAMPLE_RATE // 2, dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        message_processor.speech_processor.process_chunk.assert_not_called()
        # int16 samples occupy 2 bytes each, so buffer bytes = samples * 2.
        self.assertEqual(len(message_processor.client_buffer), int(SAMPLE_RATE * 0.5) * 2)

    def test_empty_data(self):
        """Empty audio neither triggers processing nor fills the buffer."""
        message_processor = make_message_processor()
        samples = np.array([], dtype=np.int16)

        process_audio(message_processor, SAMPLE_RATE, samples)

        message_processor.speech_processor.process_chunk.assert_not_called()
        self.assertEqual(message_processor.client_buffer, b'')
+
91
+
92
# Run this test module directly with the standard unittest runner.
if __name__ == "__main__":
    unittest.main()
@@ -1 +0,0 @@
1
- 0.2.0
File without changes
File without changes
File without changes
File without changes
File without changes