PyPI - simulstream - Versions diffs - 0.1.0__tar.gz → 0.3.0__tar.gz - Mend

simulstream 0.1.0tar.gz → 0.3.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{simulstream-0.1.0/simulstream.egg-info → simulstream-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: simulstream
-Version: 0.1.0
+Version: 0.3.0
 Summary: A server to run simultaneous/streaming experiments and demo
 Author-email: Marco Gaido <mgaido@fbk.eu>, FBK HLT-MT <mt@fbk.eu>
 License:                                  Apache License
@@ -414,14 +414,15 @@ can score your speech processor by running:
 simulstream_score_latency --scorer stream_laal \
     --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl \
-    --reference REFERENCE_FILE.txt \
+    --reference REFERENCES_FILE.tgt \
     --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 simulstream_score_quality --scorer comet \
     --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl \
-    --references REFERENCES_FILE.txt \
-    --transcripts TRANSCRIPTS_FILE.txt
+    --references REFERENCES_FILE.tgt \
+    --transcripts TRANSCRIPTS_FILE.src \
+    --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 simulstream_stats --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl
@@ -435,7 +436,20 @@ the selected metric (``--scorer``).
 Similarly, ``simulstream_score_quality`` evaluates the quality
 of the generated outputs against one (or more) reference (and transcript, only for metrics
-requiring them) file(s).
+requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
+in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
+As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` option, by passing a list of
+files to `--references` and `--transcripts`. In this case, the names of the files (without the extension)
+**must be the same** as those of the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
+```
+simulstream_score_quality --scorer comet \
+    --eval-config config/speech_processor.yaml \
+    --log-file metrics.jsonl \
+    --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
+    --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
+```
 Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.

{simulstream-0.1.0 → simulstream-0.3.0}/README.md RENAMED Viewed

@@ -177,14 +177,15 @@ can score your speech processor by running:
 simulstream_score_latency --scorer stream_laal \
     --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl \
-    --reference REFERENCE_FILE.txt \
+    --reference REFERENCES_FILE.tgt \
     --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 simulstream_score_quality --scorer comet \
     --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl \
-    --references REFERENCES_FILE.txt \
-    --transcripts TRANSCRIPTS_FILE.txt
+    --references REFERENCES_FILE.tgt \
+    --transcripts TRANSCRIPTS_FILE.src \
+    --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 simulstream_stats --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl
@@ -198,7 +199,20 @@ the selected metric (``--scorer``).
 Similarly, ``simulstream_score_quality`` evaluates the quality
 of the generated outputs against one (or more) reference (and transcript, only for metrics
-requiring them) file(s).
+requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions
+in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
+As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` option, by passing a list of
+files to `--references` and `--transcripts`. In this case, the names of the files (without the extension)
+**must be the same** as those of the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
+```
+simulstream_score_quality --scorer comet \
+    --eval-config config/speech_processor.yaml \
+    --log-file metrics.jsonl \
+    --references AUDIO1.tgt,AUDIO2.tgt,AUDIO3.tgt \
+    --transcripts AUDIO1.src,AUDIO2.src,AUDIO3.src
+```
 Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.

{simulstream-0.1.0 → simulstream-0.3.0}/docs/source/conf.py RENAMED Viewed

@@ -4,6 +4,7 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 import os
 import sys
+from simulstream import __version__
 sys.path.insert(0, os.path.abspath('../../'))
@@ -13,7 +14,7 @@ sys.path.insert(0, os.path.abspath('../../'))
 project = 'simulstream'
 copyright = '2025, FBK'
 author = 'Marco Gaido, FBK MT Unit'
-release = '0.1.0'
+release = __version__
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

{simulstream-0.1.0 → simulstream-0.3.0}/pyproject.toml RENAMED Viewed

@@ -66,7 +66,7 @@ eval = [
 ]
 [tool.setuptools.dynamic]
-version = {attr = "simulstream.__version__"}
+version = {file = "simulstream/VERSION.txt"}
 # ---- Explicit project build information ---- #

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/__init__.py RENAMED Viewed

@@ -12,4 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
-__version__ = '0.1.0'
+from pathlib import Path
+with Path(__file__).with_name('VERSION.txt').open('r') as f:
+    __version__ = f.read().strip()

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/inference.py RENAMED Viewed

@@ -49,16 +49,14 @@ def process_audio(
         sample_rate (int): Audio sample rate (Hz).
         data (np.ndarray): Audio samples as int16 array.
     """
+    # speech_chunk_size is expressed in seconds, so the number of samples corresponding to
+    # one speech chunk is the following
     samples_per_chunk = int(
-        sample_rate * message_processor.speech_processor.speech_chunk_size / 1000.0)
-    i = 0
+        sample_rate * message_processor.speech_processor.speech_chunk_size)
     for i in range(0, len(data), samples_per_chunk):
         output = message_processor.process_speech(data[i:i + samples_per_chunk].tobytes())
         LOGGER.debug(f"response: {output}")
-    # send last part of the audio
-    if i < len(data):
-        output = message_processor.process_speech(data[i:].tobytes())
-        LOGGER.debug(f"response: {output}")
 def run_inference(

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/score_quality.py RENAMED Viewed

@@ -124,6 +124,19 @@ def cli_main():
             --log-file metrics.jsonl \\
             --references ref.en \\
             --transcripts src.it \\
+            --audio-definition audio_def.yaml \\
+            --scorer sacrebleu
+    Otherwise, the script can be invoked without specifying the `--audio-definition`,
+    but in this case the name of the refererence and transcript files (trimmed of
+    the extension) must be the same of the audio files used (i.e. the names present
+    in `metrics.jsonl`), e.g.:
+        $ python -m simulstream.metrics.score_quality \\
+            --eval-config config/speech-processor.yaml \\
+            --log-file metrics.jsonl \\
+            --references 1.en,2.en \\
+            --transcripts 1.it,2.it \\
             --scorer sacrebleu
     """
     LOGGER.info(f"Simulstream version: {simulstream.__version__}")
@@ -140,17 +153,23 @@ def cli_main():
              "specified, this should be a single file containing all the lines of the audios in "
              "the reference, which should be of the same length of the audio definition. "
              "Otherwise, this should be a list of files, where each contains the lines "
-             "corresponding to an audio file.")
+             "corresponding to an audio file. In the case of being a list of files, the file "
+             "stem must match a corresponding transcript for an audio file (if applicable "
+             "to the quality metric).")
     parser.add_argument(
         "--transcripts", nargs="+", type=str,
         help="Path to the textual files containing reference transcripts. If `--audio-definition` "
              "is specified, this should be a single file containing all the lines of the audios "
              "in the reference, which should be of the same length of the audio definition. "
              "Otherwise, this should be a list of files, where each contains the lines "
-             "corresponding to an audio file.")
+             "corresponding to an audio file. In the case of being a list of files, the file "
+             "stem must match a corresponding reference for an audio file.")
     parser.add_argument(
         "--audio-definition", "-a", type=str, default=None,
         help="Path to the yaml file containing the segment-level audio information.")
+    parser.add_argument(
+        "--latency-unit", choices=["char", "word"], default="word",
+        help="Whether to computed stats based on words or characters. Default: word.")
     parser.add_argument("--scorer", choices=QUALITY_SCORER_REGISTRY.keys(), required=True)
     args, _ = parser.parse_known_args()

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/latency/mwersegmenter.py RENAMED Viewed

@@ -17,6 +17,7 @@ from dataclasses import dataclass
 from typing import List
 from mweralign import mweralign
+from mweralign.segmenter import CJSegmenter
 from simulstream.metrics.readers import ReferenceSentenceDefinition, OutputWithDelays, text_items
 from simulstream.metrics.scorers.latency import LatencyScorer, LatencyScoringSample, LatencyScores
@@ -58,6 +59,7 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
     def __init__(self, args):
         super().__init__(args)
         self.latency_unit = args.latency_unit
+        # A CJSegmenter is created only when the latency unit is "char"; for any other unit
+        # the attribute is None, which ``_tokenize`` treats as "join the lines unchanged".
+        self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
     def requires_reference(self) -> bool:
         return True
@@ -101,19 +103,50 @@ class MWERSegmenterBasedLatencyScorer(LatencyScorer):
             f"Index {index} should have reached end of delays ({len(delays)})"
         return segmented_delays
+    def _tokenize(self, text: List[str]) -> List[str]:
+        """
+        Tokenize text using the segmenter.
+        Borrowed from
+        https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
+        """
+        if self.segmenter is not None:
+            tokenized_text = []
+            for i in range(len(text)):
+                if " ### " in text[i]:
+                    pieces = text[i].strip().split(" ### ")
+                    encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
+                    tokenized_text.append(" ### ".join(encoded))
+                elif "\t" in text[i]:
+                    pieces = text[i].strip().split("\t")
+                    # underlying C++ binary still uses ###
+                    encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
+                    tokenized_text.append(" ### ".join(encoded))
+                else:
+                    tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
+            return "\n".join(tokenized_text)
+        else:
+            return "\n".join(text)
     def score(self, samples: List[LatencyScoringSample]) -> LatencyScores:
         resegmented_samples = []
         for sample in samples:
             assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
-            resegmented_hypos = mweralign.align_texts(
-                "\n".join([sentence_def.content for sentence_def in sample.reference]),
-                sample.hypothesis.final_text).split("\n")
+            hypo = self._tokenize([sample.hypothesis.final_text])
+            refs = self._tokenize(
+                [sentence_def.content for sentence_def in sample.reference])
+            resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
             assert len(resegmented_hypos) == len(sample.reference), \
                 f"Reference ({sample.audio_name}) has mismatched number of target " \
                 f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
+            if self.segmenter is not None:
+                # segmenter.decode will strip() the spaces, but we need them to align with delays
+                resegmented_hypos = [
+                    hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
             ideal_delays_splits = self._split_delays_by_segmented_text(
                 sample.hypothesis.ideal_delays,
                 resegmented_hypos)

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/comet.py RENAMED Viewed

@@ -13,17 +13,13 @@
 # limitations under the License
 import argparse
-import sys
 from typing import List
 from simulstream.metrics.scorers.quality import register_quality_scorer
 from simulstream.metrics.scorers.quality.mwersegmenter import MWERSegmenterBasedQualityScorer, \
     ResegmentedQualityScoringSample
-try:
-    from comet import download_model, load_from_checkpoint
-except ImportError:
-    sys.exit("Please install comet first with `pip install unbabel-comet`.")
+from comet import download_model, load_from_checkpoint
 @register_quality_scorer("comet")

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/metrics/scorers/quality/mwersegmenter.py RENAMED Viewed

@@ -17,6 +17,7 @@ from dataclasses import dataclass
 from typing import List, Optional
 from mweralign import mweralign
+from mweralign.segmenter import CJSegmenter
 from simulstream.metrics.scorers.quality import QualityScorer, QualityScoringSample
@@ -56,6 +57,11 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
         ...         # Compute a custom quality score
         ...         return ...
     """
+    def __init__(self, args):
+        # ``args.latency_unit`` selects the unit used for re-segmentation: with "char" a
+        # CJSegmenter is created, otherwise ``segmenter`` is None and ``_tokenize`` only
+        # joins the lines with newlines, leaving their content unchanged.
+        super().__init__(args)
+        self.segmenter = CJSegmenter() if args.latency_unit == "char" else None
     def requires_reference(self) -> bool:
         return True
@@ -75,15 +81,48 @@ class MWERSegmenterBasedQualityScorer(QualityScorer):
         """
         ...
+    def _tokenize(self, text: List[str]) -> List[str]:
+        """
+        Tokenize text using the segmenter.
+        Borrowed from
+        https://github.com/mjpost/mweralign/blob/d23a5479/mweralign/mweralign.py#L147
+        """
+        if self.segmenter is not None:
+            tokenized_text = []
+            for i in range(len(text)):
+                if " ### " in text[i]:
+                    pieces = text[i].strip().split(" ### ")
+                    encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
+                    tokenized_text.append(" ### ".join(encoded))
+                elif "\t" in text[i]:
+                    pieces = text[i].strip().split("\t")
+                    # underlying C++ binary still uses ###
+                    encoded = [" ".join(self.segmenter.encode(p)) for p in pieces]
+                    tokenized_text.append(" ### ".join(encoded))
+                else:
+                    tokenized_text.append(" ".join(self.segmenter.encode(text[i].strip())))
+            return "\n".join(tokenized_text)
+        else:
+            return "\n".join(text)
     def score(self, samples: List[QualityScoringSample]) -> float:
         resegmented_samples = []
         for sample in samples:
             assert sample.reference is not None, "Cannot realign hypothesis to missing reference"
-            resegmented_hypos = mweralign.align_texts(
-                "\n".join(sample.reference), sample.hypothesis).split("\n")
+            hypo = self._tokenize([sample.hypothesis])
+            refs = self._tokenize(sample.reference)
+            resegmented_hypos = mweralign.align_texts(refs, hypo).split("\n")
             assert len(sample.reference) == len(resegmented_hypos), \
                 f"Reference ({sample.audio_name}) has mismatched number of target " \
                 f"({len(sample.reference)}) and resegmented lines ({len(resegmented_hypos)})"
+            if self.segmenter is not None:
+                # segmenter.decode will strip() the spaces, but we need them to align with delays
+                resegmented_hypos = [
+                    hypo.replace(" ", "").replace("_", " ") for hypo in resegmented_hypos]
             resegmented_samples.append(ResegmentedQualityScoringSample(
                 sample.audio_name,
                 resegmented_hypos,

{simulstream-0.1.0 → simulstream-0.3.0}/simulstream/server/speech_processors/base_streamatt.py RENAMED Viewed

@@ -151,6 +151,12 @@ class BaseStreamAtt(BaseSpeechProcessor):
             self._cut_audio_exceeding_maxlen()
             return
+        assert len(self.text_history) > 0, \
+            "If text history is empty after selection, audio cannot be aligned. " \
+            "If you see this message, it indicates a bug, so please open an issue at " \
+            "https://github.com/hlt-mt/simulstream/issues and include the steps that " \
+            "led to this state."
         # Trim the cross-attention by excluding the discarded new generated tokens and the
         # discarded textual history. Output shape: (text_history_len, n_audio_features)
         cross_attn = cross_attn[discarded_text:discarded_text + len(self.text_history), :]
@@ -299,13 +305,15 @@ class PunctuationTextHistory:
     The current implementation supports only SentencePiece.
     """
-    STRONG_PUNCTUATION = [".", "!", "?", ":", ";"]
+    STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"]
     def __init__(self, config: SimpleNamespace):
         self.config = config
     def select_text_history(self, text_history):
         new_history = []
+        seen_punctuation = False
         for token in reversed(text_history):
             prefix_token = token
             contains_punctuation = False
@@ -314,7 +322,9 @@ class PunctuationTextHistory:
                     contains_punctuation = True
                     break
             if contains_punctuation:
-                break
+                if seen_punctuation:
+                    break
+            seen_punctuation = True
             new_history.append(token)
         # Reverse the list
         return new_history[::-1]

simulstream-0.3.0/simulstream/server/speech_processors/remote/http_proxy_speech_processor.py ADDED Viewed

@@ -0,0 +1,115 @@
+# Copyright 2026 FBK
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+import base64
+import json
+from http import HTTPStatus
+from typing import List, Any, Dict, Optional
+import uuid
+import urllib.request
+import numpy as np
+from simulstream.server.speech_processors import SpeechProcessor, IncrementalOutput
+class HttpProxySpeechProcessor(SpeechProcessor):
+    """
+    HTTP-based proxy implementation of :class:`SpeechProcessor`.
+    This class does not perform speech processing locally. Instead, it forwards
+    all method calls to a remote speech processor exposed via HTTP, maintaining
+    a dedicated session on the server side.
+    Each instance of this class corresponds to exactly one remote session.
+    """
+    @classmethod
+    def load_model(cls, config):
+        pass
+    def __init__(self, config):
+        super().__init__(config)
+        self.base_url = f"http://{config.hostname}:{config.port}/"
+        self.session_id = uuid.uuid4().hex
+        self._cached_speech_chunk_size = None
+    def _http_request(
+            self, path: str, method: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        data = json.dumps(payload).encode("utf-8")
+        req = urllib.request.Request(
+            self.base_url + path,
+            data=data,
+            headers={"Content-Type": "application/json"},
+            method=method,
+        )
+        with urllib.request.urlopen(req) as resp:
+            if resp.status == HTTPStatus.NO_CONTENT:
+                return None
+            return json.loads(resp.read())
+    @staticmethod
+    def _to_incremental_outputs(json_dict: Dict[str, Any]):
+        return IncrementalOutput(
+            new_tokens=json_dict["new_tokens"],
+            new_string=json_dict["new_string"],
+            deleted_tokens=json_dict["deleted_tokens"],
+            deleted_string=json_dict["deleted_string"]
+        )
+    @property
+    def speech_chunk_size(self) -> float:
+        if self._cached_speech_chunk_size is None:
+            response = self._http_request("speech_chunk_size", "GET", {
+                "session_id": self.session_id
+            })
+            self._cached_speech_chunk_size = response["speech_chunk_size"]
+        return self._cached_speech_chunk_size
+    def process_chunk(self, waveform: np.float32) -> IncrementalOutput:
+        response = self._http_request("process_chunk", "POST", {
+            "session_id": self.session_id,
+            "waveform": base64.b64encode(waveform.tobytes()).decode("utf-8"),
+        })
+        return self._to_incremental_outputs(response)
+    def set_source_language(self, language):
+        self._http_request("source_language", "PUT", {
+            "session_id": self.session_id,
+            "language": language,
+        })
+    def set_target_language(self, language):
+        self._http_request("target_language", "PUT", {
+            "session_id": self.session_id,
+            "language": language,
+        })
+    def end_of_stream(self) -> IncrementalOutput:
+        response = self._http_request("end_of_stream", "POST", {
+            "session_id": self.session_id,
+        })
+        return self._to_incremental_outputs(response)
+    def clear(self):
+        self._http_request("clear", "POST", {
+            "session_id": self.session_id,
+        })
+    def tokens_to_string(self, tokens: List[str]) -> str:
+        response = self._http_request("tokens_to_string", "GET", {
+            "session_id": self.session_id,
+            "tokens": tokens,
+        })
+        return response["tokens_as_string"]

simulstream 0.1.0__tar.gz → 0.3.0__tar.gz

simulstream 0.1.0tar.gz → 0.3.0tar.gz