visqol-python 3.3.3 (visqol_python-3.3.3-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
visqol/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ ViSQOL - Virtual Speech Quality Objective Listener (Pure Python Implementation)
3
+
4
+ A pure Python port of Google's ViSQOL v3.3.3 for objective audio quality assessment.
5
+ Compares a reference audio signal with a degraded version and outputs a MOS-LQO score (1-5).
6
+
7
+ Usage:
8
+ from visqol import VisqolApi
9
+
10
+ api = VisqolApi()
11
+ api.create(mode="audio")
12
+ result = api.measure("reference.wav", "degraded.wav")
13
+ print(f"MOS-LQO: {result.moslqo}")
14
+ """
15
+
16
+ __version__ = "3.3.3"
17
+
18
+ from visqol.api import VisqolApi
19
+
20
+ __all__ = ["VisqolApi"]
visqol/__main__.py ADDED
@@ -0,0 +1,92 @@
1
+ """
2
+ ViSQOL command-line interface.
3
+
4
+ Usage:
5
+ python -m visqol --reference ref.wav --degraded deg.wav [--speech_mode]
6
+ """
7
+
8
+ import argparse
9
+ import sys
10
+ import logging
11
+
12
+ from visqol.api import VisqolApi
13
+
14
+
15
+ def main():
16
+ parser = argparse.ArgumentParser(
17
+ description="ViSQOL - Virtual Speech Quality Objective Listener (Python)"
18
+ )
19
+ parser.add_argument(
20
+ "--reference", "-r", required=True,
21
+ help="Path to reference audio file (WAV)"
22
+ )
23
+ parser.add_argument(
24
+ "--degraded", "-d", required=True,
25
+ help="Path to degraded audio file (WAV)"
26
+ )
27
+ parser.add_argument(
28
+ "--speech_mode", action="store_true",
29
+ help="Use speech mode (16kHz, exponential mapping)"
30
+ )
31
+ parser.add_argument(
32
+ "--model", default=None,
33
+ help="Path to SVR model file (Audio mode only)"
34
+ )
35
+ parser.add_argument(
36
+ "--search_window", type=int, default=60,
37
+ help="Search window radius (default: 60)"
38
+ )
39
+ parser.add_argument(
40
+ "--unscaled_speech", action="store_true",
41
+ help="Don't scale speech MOS to max 5.0"
42
+ )
43
+ parser.add_argument(
44
+ "--no_alignment", action="store_true",
45
+ help="Disable global alignment"
46
+ )
47
+ parser.add_argument(
48
+ "--no_realignment", action="store_true",
49
+ help="Disable fine realignment"
50
+ )
51
+ parser.add_argument(
52
+ "--verbose", "-v", action="store_true",
53
+ help="Enable verbose output"
54
+ )
55
+
56
+ args = parser.parse_args()
57
+
58
+ # Setup logging
59
+ level = logging.DEBUG if args.verbose else logging.WARNING
60
+ logging.basicConfig(level=level, format="%(levelname)s: %(message)s")
61
+
62
+ # Run ViSQOL
63
+ mode = "speech" if args.speech_mode else "audio"
64
+ api = VisqolApi()
65
+ api.create(
66
+ mode=mode,
67
+ model_path=args.model,
68
+ search_window=args.search_window,
69
+ use_unscaled_speech=args.unscaled_speech,
70
+ disable_global_alignment=args.no_alignment,
71
+ disable_realignment=args.no_realignment,
72
+ )
73
+
74
+ result = api.measure(args.reference, args.degraded)
75
+
76
+ # Output results
77
+ print(f"MOS-LQO: {result.moslqo:.6f}")
78
+ print(f"VNSIM: {result.vnsim:.6f}")
79
+ if args.verbose:
80
+ print(f"FVNSIM: {result.fvnsim}")
81
+ print(f"FVNSIM10: {result.fvnsim10}")
82
+ print(f"FSTDNSIM: {result.fstdnsim}")
83
+ print(f"FVDEGENERGY: {result.fvdegenergy}")
84
+ print(f"Patches: {len(result.patch_sims)}")
85
+ for i, p in enumerate(result.patch_sims):
86
+ print(f" Patch {i}: sim={p.similarity:.4f} "
87
+ f"ref=[{p.ref_patch_start_time:.3f}-{p.ref_patch_end_time:.3f}] "
88
+ f"deg=[{p.deg_patch_start_time:.3f}-{p.deg_patch_end_time:.3f}]")
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()
visqol/alignment.py ADDED
@@ -0,0 +1,82 @@
1
+ """
2
+ Global signal alignment using upper envelope cross-correlation.
3
+
4
+ Corresponds to C++ file: alignment.cc
5
+ """
6
+
7
+ import numpy as np
8
+ from visqol.audio_utils import AudioSignal
9
+ from visqol.signal_utils import upper_envelope, find_best_lag
10
+
11
+
12
+ def globally_align(reference: AudioSignal, degraded: AudioSignal) -> tuple:
13
+ """
14
+ Globally align degraded signal to reference signal.
15
+
16
+ Uses upper envelope cross-correlation to find the best time-domain lag,
17
+ then shifts the degraded signal accordingly.
18
+
19
+ Matches C++ Alignment::GloballyAlign.
20
+
21
+ Args:
22
+ reference: Reference audio signal.
23
+ degraded: Degraded audio signal.
24
+
25
+ Returns:
26
+ Tuple of (aligned_degraded: AudioSignal, lag_seconds: float).
27
+ """
28
+ ref_env = upper_envelope(reference.data)
29
+ deg_env = upper_envelope(degraded.data)
30
+
31
+ best_lag = find_best_lag(ref_env, deg_env)
32
+
33
+ # Limit lag to half the reference length
34
+ if best_lag == 0 or abs(best_lag) > len(reference.data) / 2.0:
35
+ return degraded, 0.0
36
+
37
+ if best_lag < 0:
38
+ # Degraded comes before reference: truncate front of degraded
39
+ new_data = degraded.data[abs(best_lag):]
40
+ else:
41
+ # Reference comes before degraded: prepend zeros to degraded
42
+ new_data = np.concatenate([np.zeros(best_lag), degraded.data])
43
+
44
+ aligned_signal = AudioSignal(new_data, degraded.sample_rate)
45
+ lag_seconds = best_lag / float(degraded.sample_rate)
46
+ return aligned_signal, lag_seconds
47
+
48
+
49
+ def align_and_truncate(reference: AudioSignal,
50
+ degraded: AudioSignal) -> tuple:
51
+ """
52
+ Align and truncate signals to the same length.
53
+
54
+ Matches C++ Alignment::AlignAndTruncate.
55
+
56
+ Returns:
57
+ Tuple of (aligned_ref: AudioSignal, aligned_deg: AudioSignal, lag_seconds: float).
58
+ """
59
+ aligned_deg, lag_seconds = globally_align(reference, degraded)
60
+
61
+ ref_data = reference.data
62
+ deg_data = aligned_deg.data
63
+
64
+ if len(ref_data) > len(deg_data):
65
+ ref_data = ref_data[:len(deg_data)]
66
+ elif len(ref_data) < len(deg_data):
67
+ # For positive lag, the beginning of ref aligns with zeros
68
+ lag_samples = int(lag_seconds * reference.sample_rate)
69
+ if lag_samples > 0:
70
+ ref_data = ref_data[lag_samples:]
71
+ deg_data = deg_data[lag_samples:lag_samples + len(ref_data)]
72
+ else:
73
+ deg_data = deg_data[:len(ref_data)]
74
+
75
+ # Ensure same length
76
+ min_len = min(len(ref_data), len(deg_data))
77
+ ref_data = ref_data[:min_len]
78
+ deg_data = deg_data[:min_len]
79
+
80
+ return (AudioSignal(ref_data, reference.sample_rate),
81
+ AudioSignal(deg_data, degraded.sample_rate),
82
+ lag_seconds)
@@ -0,0 +1,52 @@
1
+ """
2
+ Analysis window for spectrogram construction.
3
+
4
+ Corresponds to C++ files: analysis_window.cc/h
5
+ """
6
+
7
+ import numpy as np
8
+
9
+
10
+ class AnalysisWindow:
11
+ """
12
+ Analysis window used for spectrogram frame windowing.
13
+
14
+ Attributes:
15
+ size: Window size in samples.
16
+ overlap: Overlap ratio (e.g. 0.25 means 25% of window size as hop).
17
+ window_duration: Duration of window in seconds.
18
+ """
19
+
20
+ def __init__(self, sample_rate: int, overlap: float = 0.25,
21
+ window_duration: float = 0.08):
22
+ """
23
+ Args:
24
+ sample_rate: Sample rate of the audio signal.
25
+ overlap: Overlap as a fraction of window size (used as hop = size * overlap).
26
+ window_duration: Duration of the analysis window in seconds.
27
+ """
28
+ self.window_duration = window_duration
29
+ self.overlap = overlap
30
+ self.size = int(round(sample_rate * window_duration))
31
+ self._hann_window = None
32
+
33
+ @property
34
+ def hop_size(self) -> int:
35
+ """Hop size = window_size * overlap."""
36
+ return int(self.size * self.overlap)
37
+
38
+ @property
39
+ def hann_window(self) -> np.ndarray:
40
+ """Precomputed Hann window."""
41
+ if self._hann_window is None:
42
+ # Match C++ exactly: 0.5 - 0.5 * cos(2*pi*i/(size-1))
43
+ n = self.size
44
+ self._hann_window = 0.5 - 0.5 * np.cos(
45
+ 2.0 * np.pi * np.arange(n) / (n - 1)
46
+ )
47
+ return self._hann_window
48
+
49
+ def apply_hann_window(self, frame: np.ndarray) -> np.ndarray:
50
+ """Apply Hann window to a frame."""
51
+ assert len(frame) == self.size
52
+ return frame * self.hann_window
visqol/api.py ADDED
@@ -0,0 +1,110 @@
1
+ """
2
+ ViSQOL public API.
3
+
4
+ Provides a simple interface for comparing audio quality.
5
+
6
+ Corresponds to C++ file: visqol_api.cc
7
+ """
8
+
9
+ import os
10
+ import numpy as np
11
+ from typing import Optional
12
+
13
+ from visqol.audio_utils import AudioSignal
14
+ from visqol.visqol_manager import VisqolManager
15
+ from visqol.visqol_core import SimilarityResult
16
+
17
+
18
+ # Default SVR model path (bundled inside the package)
19
+ _DEFAULT_MODEL_DIR = os.path.join(os.path.dirname(__file__), "model")
20
+ _DEFAULT_SVR_MODEL = os.path.join(_DEFAULT_MODEL_DIR, "libsvm_nu_svr_model.txt")
21
+
22
+
23
+ class VisqolApi:
24
+ """
25
+ Public API for ViSQOL audio quality assessment.
26
+
27
+ Usage:
28
+ api = VisqolApi()
29
+ api.create(mode="audio")
30
+ result = api.measure("reference.wav", "degraded.wav")
31
+ print(f"MOS-LQO: {result.moslqo}")
32
+ """
33
+
34
+ def __init__(self):
35
+ self._manager = VisqolManager()
36
+ self._is_created = False
37
+
38
+ def create(self,
39
+ mode: str = "audio",
40
+ model_path: Optional[str] = None,
41
+ search_window: int = 60,
42
+ use_unscaled_speech: bool = False,
43
+ disable_global_alignment: bool = False,
44
+ disable_realignment: bool = False):
45
+ """
46
+ Initialize ViSQOL with the specified configuration.
47
+
48
+ Args:
49
+ mode: "audio" for music/general audio (48kHz, SVR model) or
50
+ "speech" for speech signals (16kHz, exponential fit).
51
+ model_path: Path to SVR model file (Audio mode only).
52
+ If None, uses the bundled default model.
53
+ search_window: Search window radius (default 60).
54
+ use_unscaled_speech: If True, don't scale speech MOS to 5.0.
55
+ disable_global_alignment: Skip global alignment step.
56
+ disable_realignment: Skip fine realignment step.
57
+ """
58
+ use_speech_mode = mode.lower() == "speech"
59
+
60
+ if not use_speech_mode and model_path is None:
61
+ model_path = _DEFAULT_SVR_MODEL
62
+
63
+ self._manager.init(
64
+ model_path=model_path or "",
65
+ use_speech_mode=use_speech_mode,
66
+ use_unscaled_speech=use_unscaled_speech,
67
+ search_window=search_window,
68
+ disable_global_alignment=disable_global_alignment,
69
+ disable_realignment=disable_realignment,
70
+ )
71
+ self._is_created = True
72
+
73
+ def measure(self, ref_path: str, deg_path: str) -> SimilarityResult:
74
+ """
75
+ Compare two audio files and return quality assessment.
76
+
77
+ Args:
78
+ ref_path: Path to reference audio file (WAV).
79
+ deg_path: Path to degraded audio file (WAV).
80
+
81
+ Returns:
82
+ SimilarityResult containing MOS-LQO score and detailed results.
83
+ """
84
+ if not self._is_created:
85
+ raise RuntimeError(
86
+ "VisqolApi must be created (call .create()) before measuring."
87
+ )
88
+ return self._manager.run(ref_path, deg_path)
89
+
90
+ def measure_from_arrays(self, ref_array: np.ndarray,
91
+ deg_array: np.ndarray,
92
+ sample_rate: int) -> SimilarityResult:
93
+ """
94
+ Compare two audio signals from numpy arrays.
95
+
96
+ Args:
97
+ ref_array: Reference audio signal (1D numpy array).
98
+ deg_array: Degraded audio signal (1D numpy array).
99
+ sample_rate: Sample rate of both signals.
100
+
101
+ Returns:
102
+ SimilarityResult containing MOS-LQO score and detailed results.
103
+ """
104
+ if not self._is_created:
105
+ raise RuntimeError(
106
+ "VisqolApi must be created (call .create()) before measuring."
107
+ )
108
+ ref_signal = AudioSignal(ref_array, sample_rate)
109
+ deg_signal = AudioSignal(deg_array, sample_rate)
110
+ return self._manager.run_from_signals(ref_signal, deg_signal)
visqol/audio_utils.py ADDED
@@ -0,0 +1,90 @@
1
+ """
2
+ Audio utilities: WAV loading, SPL calculation, mono conversion.
3
+
4
+ Corresponds to C++ files: wav_reader.cc, misc_audio.cc (partial)
5
+ """
6
+
7
+ import logging
8
+ import numpy as np
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # Sound pressure level reference point (20 µPa)
13
+ SPL_REFERENCE_POINT = 2e-5
14
+
15
+
16
+ class AudioSignal:
17
+ """Container for audio signal data."""
18
+
19
+ def __init__(self, data: np.ndarray, sample_rate: int):
20
+ """
21
+ Args:
22
+ data: 1D numpy array of audio samples (mono), float64.
23
+ sample_rate: Sample rate in Hz.
24
+ """
25
+ self.data = np.asarray(data, dtype=np.float64).ravel()
26
+ self.sample_rate = int(sample_rate)
27
+
28
+ @property
29
+ def duration(self) -> float:
30
+ """Duration in seconds."""
31
+ return len(self.data) / self.sample_rate
32
+
33
+ @property
34
+ def num_samples(self) -> int:
35
+ return len(self.data)
36
+
37
+ def __len__(self):
38
+ return len(self.data)
39
+
40
+
41
+ def load_audio(path: str):
42
+ """
43
+ Load a WAV file and return (data, sample_rate).
44
+ Data is normalized to float64 range [-1, 1].
45
+ """
46
+ import soundfile as sf
47
+ data, sr = sf.read(path, dtype='float64', always_2d=True)
48
+ return data, sr
49
+
50
+
51
+ def to_mono(data: np.ndarray) -> np.ndarray:
52
+ """Convert multi-channel audio to mono by averaging channels."""
53
+ if data.ndim == 2 and data.shape[1] > 1:
54
+ return np.mean(data, axis=1)
55
+ elif data.ndim == 2:
56
+ return data[:, 0]
57
+ return data
58
+
59
+
60
+ def load_as_mono(path: str) -> AudioSignal:
61
+ """Load a WAV file as mono AudioSignal."""
62
+ data, sr = load_audio(path)
63
+ mono_data = to_mono(data)
64
+ return AudioSignal(mono_data, sr)
65
+
66
+
67
+ def calc_sound_pressure_level(signal: AudioSignal) -> float:
68
+ """
69
+ Calculate sound pressure level (dB SPL).
70
+ SPL = 20 * log10(rms / reference_point)
71
+ """
72
+ data = signal.data
73
+ rms = np.sqrt(np.mean(data ** 2))
74
+ if rms == 0:
75
+ return -np.inf
76
+ return 20.0 * np.log10(rms / SPL_REFERENCE_POINT)
77
+
78
+
79
+ def scale_to_match_sound_pressure_level(
80
+ reference: AudioSignal, degraded: AudioSignal
81
+ ) -> AudioSignal:
82
+ """
83
+ Scale the degraded signal to match the SPL of the reference signal.
84
+ Returns a new AudioSignal with scaled data.
85
+ """
86
+ ref_spl = calc_sound_pressure_level(reference)
87
+ deg_spl = calc_sound_pressure_level(degraded)
88
+ scale_factor = 10.0 ** ((ref_spl - deg_spl) / 20.0)
89
+ scaled_data = degraded.data * scale_factor
90
+ return AudioSignal(scaled_data, degraded.sample_rate)