visqol-python 3.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
visqol/visqol_core.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Core ViSQOL algorithm: assembles all components into a similarity computation pipeline.
3
+
4
+ Corresponds to C++ file: visqol.cc
5
+ """
6
+
7
+ import math
8
+ import numpy as np
9
+ from typing import List
10
+ from dataclasses import dataclass, field
11
+
12
+ from visqol.audio_utils import AudioSignal, scale_to_match_sound_pressure_level
13
+ from visqol.analysis_window import AnalysisWindow
14
+ from visqol.gammatone import (
15
+ GammatoneSpectrogramBuilder, Spectrogram,
16
+ prepare_spectrograms_for_comparison,
17
+ )
18
+ from visqol.nsim import PatchSimilarityResult, measure_patch_similarity
19
+ from visqol.patch_selector import (
20
+ find_most_optimal_deg_patches,
21
+ finely_align_and_recreate_patches,
22
+ )
23
+ from visqol.quality_mapper import SimilarityToQualityMapper
24
+
25
+
26
def _empty_array() -> np.ndarray:
    """Return a fresh, zero-length array for use as a dataclass default."""
    return np.array([])


@dataclass
class SimilarityResult:
    """Outcome of one reference-versus-degraded ViSQOL comparison.

    Every field has a default, so ``SimilarityResult()`` is an "empty" result:
    zero scores, zero-length arrays and no patch entries. Array and list
    defaults are created per instance, never shared.
    """

    # Predicted MOS-LQO score.
    moslqo: float = 0.0
    # Mean of ``fvnsim`` over the frequency bands.
    vnsim: float = 0.0
    # Per-band mean, across patches, of the patch similarity means.
    fvnsim: np.ndarray = field(default_factory=_empty_array)
    # Per-band mean of the lowest-scoring quantile of patches.
    fvnsim10: np.ndarray = field(default_factory=_empty_array)
    # Per-band pooled standard deviation across patches.
    fstdnsim: np.ndarray = field(default_factory=_empty_array)
    # Per-band mean, across patches, of the degraded-signal energy.
    fvdegenergy: np.ndarray = field(default_factory=_empty_array)
    # Band centre frequencies, copied from the reference spectrogram.
    center_freq_bands: np.ndarray = field(default_factory=_empty_array)
    # Per-patch similarity results the statistics above were computed from.
    patch_sims: List[PatchSimilarityResult] = field(default_factory=list)
37
+
38
+
39
def calc_per_patch_mean_freq_band_means(
    sim_results: List[PatchSimilarityResult]
) -> np.ndarray:
    """
    Average each frequency band's mean similarity over all patches (fvnsim).

    Matches C++ Visqol::CalcPerPatchMeanFreqBandMeans.

    Args:
        sim_results: Per-patch results; each supplies ``freq_band_means``.

    Returns:
        One value per frequency band: the unweighted mean over patches.
    """
    # One row per patch, one column per band; averaging over axis 0
    # collapses the patch dimension.
    per_patch_rows = np.array(
        [patch.freq_band_means for patch in sim_results]
    )
    return per_patch_rows.mean(axis=0)
49
+
50
+
51
def calc_per_patch_freq_band_quantile(
    sim_results: List[PatchSimilarityResult],
    quantile: float = 0.10,
) -> np.ndarray:
    """
    Per band, average the lowest ``quantile`` share of patch means (fvnsim10).

    Matches C++ Visqol::CalcPerPatchFreqBandQuantile.

    Args:
        sim_results: Per-patch results; each supplies ``freq_band_means``.
            Must contain at least one patch (the first one is indexed).
        quantile: Fraction of the patches, taken from the low end, to average.

    Returns:
        One value per frequency band.
    """
    band_total = len(sim_results[0].freq_band_means)

    # The count is truncated towards zero but never drops below one, so a
    # short patch list still contributes its single worst patch.
    keep = max(1, int(len(sim_results) * quantile))

    lowest_means = np.zeros(band_total)
    for band_idx in range(band_total):
        ascending = sorted(
            [patch.freq_band_means[band_idx] for patch in sim_results]
        )
        lowest_means[band_idx] = np.mean(ascending[:keep])

    return lowest_means
69
+
70
+
71
def calc_per_patch_mean_freq_band_deg_energy(
    sim_results: List[PatchSimilarityResult]
) -> np.ndarray:
    """
    Average each band's degraded-signal energy over all patches (fvdegenergy).

    Matches C++ Visqol::CalcPerPatchMeanFreqBandDegradedEnergy.

    Args:
        sim_results: Per-patch results; each supplies ``freq_band_deg_energy``.

    Returns:
        One value per frequency band: the unweighted mean over patches.
    """
    # One row per patch, one column per band.
    energy_rows = np.array(
        [patch.freq_band_deg_energy for patch in sim_results]
    )
    return energy_rows.mean(axis=0)
81
+
82
+
83
def calc_per_patch_mean_freq_band_stddevs(
    sim_results: List[PatchSimilarityResult],
    frame_duration: float,
) -> np.ndarray:
    """
    Calculate the pooled per-band standard deviation across patches (fstdnsim).

    Matches C++ Visqol::CalcPerPatchMeanFreqBandStdDevs.

    Uses the pooled variance formula:
    https://en.wikipedia.org/wiki/Pooled_variance

    Args:
        sim_results: Per-patch results. Each supplies ``freq_band_means``,
            ``freq_band_stddevs``, ``ref_patch_start_time`` and
            ``ref_patch_end_time``. Must contain at least one patch.
        frame_duration: Duration of one frame, in the same time unit as the
            patch start/end times; used to turn a patch length into a frame
            count.

    Returns:
        One standard deviation per frequency band. All zeros when the patches
        cover at most one frame in total.
    """
    num_freq_bands = len(sim_results[0].freq_band_means)

    # Global per-band mean. Note that this is the *unweighted* mean over
    # patches, whereas the accumulation below weights by frame count.
    fvnsim = calc_per_patch_mean_freq_band_means(sim_results)

    total_frame_count = 0
    contribution = np.zeros(num_freq_bands)

    for patch in sim_results:
        secs_in_patch = patch.ref_patch_end_time - patch.ref_patch_start_time
        frame_count = int(math.ceil(secs_in_patch / frame_duration))
        total_frame_count += frame_count

        for band in range(num_freq_bands):
            stddev = patch.freq_band_stddevs[band]
            mean = patch.freq_band_means[band]
            contribution[band] += (frame_count - 1) * stddev * stddev
            contribution[band] += frame_count * mean * mean

    # Dividing by (total_frame_count - 1) below needs at least two frames.
    if total_frame_count <= 1:
        return np.zeros(num_freq_bands)

    variance = (contribution - fvnsim * fvnsim * total_frame_count) / (
        total_frame_count - 1
    )

    # The variance can come out negative (rounding, or patches of unequal
    # length combined with the unweighted mean). Clamp *before* taking the
    # root: np.where evaluates both branches, so sqrt-then-mask would still
    # compute sqrt of the negative entries and emit an "invalid value"
    # RuntimeWarning (or raise under np.seterr(invalid="raise")).
    return np.sqrt(np.where(variance < 0, 0.0, variance))
124
+
125
+
126
def alter_for_similarity_extremes(vnsim: float, moslqo: float) -> float:
    """
    Override the predicted score when the similarity is extremely low.

    Matches C++ Visqol::AlterForSimilarityExtremes.

    Args:
        vnsim: Mean similarity across frequency bands.
        moslqo: Score predicted by the quality mapper.

    Returns:
        1.0 when ``vnsim`` is strictly below 0.15, otherwise ``moslqo``
        unchanged.
    """
    return 1.0 if vnsim < 0.15 else moslqo
134
+
135
+
136
def calc_frame_duration(frame_size: int, sample_rate: int) -> float:
    """
    Convert a frame length in samples to seconds.

    Args:
        frame_size: Number of samples in one frame.
        sample_rate: Samples per second.

    Returns:
        ``frame_size / sample_rate`` as a float.
    """
    samples_per_second = float(sample_rate)
    return frame_size / samples_per_second
139
+
140
+
141
class VisqolCore:
    """
    Core ViSQOL algorithm.

    Chains the individual stages (level matching, spectrograms, patch
    selection and matching, statistics, quality mapping) into one call.
    """

    def calculate_similarity(
        self,
        ref_signal: AudioSignal,
        deg_signal: AudioSignal,
        spect_builder: GammatoneSpectrogramBuilder,
        window: AnalysisWindow,
        patch_creator,
        search_window: int,
        quality_mapper: SimilarityToQualityMapper,
        disable_realignment: bool = False,
    ) -> SimilarityResult:
        """
        Compare a degraded signal against its reference.

        Args:
            ref_signal: Reference audio signal.
            deg_signal: Degraded audio signal.
            spect_builder: Gammatone spectrogram builder.
            window: Analysis window.
            patch_creator: Patch creator (ImagePatchCreator or VadPatchCreator).
            search_window: Search window radius in patch units.
            quality_mapper: Similarity-to-quality mapper.
            disable_realignment: If True, skip fine realignment.

        Returns:
            SimilarityResult with MOS-LQO score and all intermediate data.
        """
        # --- Stage 1: preprocessing -------------------------------------
        # Level-match the degraded signal to the reference before analysis.
        level_matched_deg = scale_to_match_sound_pressure_level(
            ref_signal, deg_signal
        )

        ref_spect = spect_builder.build(ref_signal, window)
        deg_spect = spect_builder.build(level_matched_deg, window)

        # dB conversion + noise floor.
        ref_db, deg_db = prepare_spectrograms_for_comparison(
            ref_spect, deg_spect
        )

        # --- Stage 2: patch selection and similarity --------------------
        patch_indices = patch_creator.create_ref_patch_indices(
            ref_db, ref_signal, window
        )

        frame_size = int(window.size * window.overlap)
        frame_duration = calc_frame_duration(
            frame_size, ref_signal.sample_rate
        )

        reference_patches = patch_creator.create_patches_from_indices(
            ref_db, patch_indices
        )

        # DP patch matching.
        patch_matches = find_most_optimal_deg_patches(
            reference_patches, patch_indices, deg_db,
            frame_duration, search_window
        )

        # Optional fine realignment; works on the level-matched signal.
        if not disable_realignment:
            patch_matches = finely_align_and_recreate_patches(
                patch_matches, ref_signal, level_matched_deg,
                spect_builder, window
            )

        # --- Stage 3: aggregate per-band statistics ---------------------
        band_means = calc_per_patch_mean_freq_band_means(patch_matches)
        band_low_quantile = calc_per_patch_freq_band_quantile(
            patch_matches, 0.10
        )
        band_stddevs = calc_per_patch_mean_freq_band_stddevs(
            patch_matches, frame_duration
        )
        band_deg_energy = calc_per_patch_mean_freq_band_deg_energy(
            patch_matches
        )

        # --- Stage 4: map similarity to a quality score -----------------
        predicted_mos = quality_mapper.predict_quality(
            band_means, band_low_quantile, band_stddevs, band_deg_energy
        )

        # vnsim is the mean of fvnsim; it also drives the extreme-case
        # override of the predicted score.
        overall_similarity = float(np.mean(band_means))
        final_mos = alter_for_similarity_extremes(
            overall_similarity, predicted_mos
        )

        return SimilarityResult(
            moslqo=final_mos,
            vnsim=overall_similarity,
            fvnsim=band_means,
            fvnsim10=band_low_quantile,
            fstdnsim=band_stddevs,
            fvdegenergy=band_deg_energy,
            center_freq_bands=ref_spect.center_freq_bands,
            patch_sims=patch_matches,
        )
@@ -0,0 +1,194 @@
1
+ """
2
+ ViSQOL Manager: orchestrates the complete ViSQOL workflow.
3
+
4
+ Corresponds to C++ file: visqol_manager.cc
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ from typing import Optional
10
+
11
+ from visqol.audio_utils import AudioSignal, load_as_mono
12
+ from visqol.analysis_window import AnalysisWindow
13
+ from visqol.alignment import globally_align
14
+ from visqol.gammatone import GammatoneSpectrogramBuilder
15
+ from visqol.patch_creator import ImagePatchCreator, VadPatchCreator
16
+ from visqol.quality_mapper import (
17
+ SvrSimilarityToQualityMapper,
18
+ SpeechSimilarityToQualityMapper,
19
+ )
20
+ from visqol.visqol_core import VisqolCore, SimilarityResult
21
+
22
logger = logging.getLogger(__name__)

# Default parameters (matching C++ VisqolManager constants).

# Patch size handed to the patch creator, per mode.
PATCH_SIZE_AUDIO = 30
PATCH_SIZE_SPEECH = 20

# Number of gammatone bands handed to the spectrogram builder, per mode.
NUM_BANDS_AUDIO = 32
NUM_BANDS_SPEECH = 21

# Lowest frequency handed to the spectrogram builder, in Hz (wideband).
MINIMUM_FREQ = 50.0

# Passed to AnalysisWindow as its overlap argument (25%).
# NOTE(review): visqol_core derives the frame size as
# window.size * window.overlap -- confirm whether this fraction is the
# overlap or the step between frames.
OVERLAP = 0.25

# Reference/degraded durations differing by more than this many seconds
# trigger a warning.
DURATION_MISMATCH_TOLERANCE = 1.0

# Sample rates the speech and audio modes are checked against.
K_16K_SAMPLE_RATE = 16_000
K_48K_SAMPLE_RATE = 48_000
35
+
36
+
37
class VisqolManager:
    """
    Main manager class that configures and runs ViSQOL.

    Typical use: construct, call :meth:`init` once to pick the mode and build
    the collaborators, then call :meth:`run` or :meth:`run_from_signals` for
    each reference/degraded pair.
    """

    def __init__(self):
        self.use_speech_mode = False
        self.search_window = 60
        self.disable_global_alignment = False
        self.disable_realignment = False
        self.is_initialized = False

        # Built by init(); None until then.
        self.spect_builder = None
        self.patch_creator = None
        self.quality_mapper = None
        self.visqol_core = VisqolCore()

    def init(self,
             model_path: str = "",
             use_speech_mode: bool = False,
             use_unscaled_speech: bool = False,
             search_window: int = 60,
             disable_global_alignment: bool = False,
             disable_realignment: bool = False):
        """
        Initialize the ViSQOL manager.

        May be called again to reconfigure. The manager counts as initialized
        only after this method returns without raising.

        Args:
            model_path: Path to SVR model file (Audio mode).
            use_speech_mode: Use speech mode (16kHz, 21 bands).
            use_unscaled_speech: Don't scale speech MOS to max 5.0.
            search_window: Search window radius in patch units.
            disable_global_alignment: Skip global alignment.
            disable_realignment: Skip fine realignment.
        """
        # Invalidate any earlier successful init first: if a constructor or
        # quality_mapper.init() below raises, the manager must not keep
        # reporting itself as ready while holding half-replaced components.
        self.is_initialized = False

        self.use_speech_mode = use_speech_mode
        self.search_window = search_window
        self.disable_global_alignment = disable_global_alignment
        self.disable_realignment = disable_realignment

        # Patch creator, spectrogram builder and quality mapper all depend
        # on the mode, so they are chosen together.
        if use_speech_mode:
            self.patch_creator = VadPatchCreator(PATCH_SIZE_SPEECH)
            self.spect_builder = GammatoneSpectrogramBuilder(
                NUM_BANDS_SPEECH, MINIMUM_FREQ, speech_mode=True
            )
            self.quality_mapper = SpeechSimilarityToQualityMapper(
                scale_to_max_mos=not use_unscaled_speech
            )
        else:
            self.patch_creator = ImagePatchCreator(PATCH_SIZE_AUDIO)
            self.spect_builder = GammatoneSpectrogramBuilder(
                NUM_BANDS_AUDIO, MINIMUM_FREQ, speech_mode=False
            )
            self.quality_mapper = SvrSimilarityToQualityMapper(model_path)

        self.quality_mapper.init()
        self.is_initialized = True

    def run(self, ref_path: str, deg_path: str) -> SimilarityResult:
        """
        Run ViSQOL on two audio files.

        Args:
            ref_path: Path to reference audio file.
            deg_path: Path to degraded audio file.

        Returns:
            SimilarityResult with MOS-LQO score.

        Raises:
            RuntimeError: If init() has not completed successfully.
        """
        # Checked before loading so an unconfigured manager fails without
        # touching the files.
        self._require_initialized()

        ref_signal = load_as_mono(ref_path)
        deg_signal = load_as_mono(deg_path)

        return self.run_from_signals(ref_signal, deg_signal)

    def run_from_signals(self, ref_signal: AudioSignal,
                         deg_signal: AudioSignal) -> SimilarityResult:
        """
        Run ViSQOL on two audio signals.

        Args:
            ref_signal: Reference audio signal.
            deg_signal: Degraded audio signal.

        Returns:
            SimilarityResult with MOS-LQO score.

        Raises:
            RuntimeError: If init() has not completed successfully.
            ValueError: If the two signals have different sample rates.
        """
        self._require_initialized()

        self._validate_input(ref_signal, deg_signal)

        # Global alignment; the lag returned alongside the aligned signal
        # is not needed here.
        if not self.disable_global_alignment:
            deg_signal, _ = globally_align(ref_signal, deg_signal)

        window = AnalysisWindow(ref_signal.sample_rate, OVERLAP)

        return self.visqol_core.calculate_similarity(
            ref_signal=ref_signal,
            deg_signal=deg_signal,
            spect_builder=self.spect_builder,
            window=window,
            patch_creator=self.patch_creator,
            search_window=self.search_window,
            quality_mapper=self.quality_mapper,
            disable_realignment=self.disable_realignment,
        )

    def _require_initialized(self):
        """Raise RuntimeError unless init() has completed successfully."""
        if not self.is_initialized:
            raise RuntimeError("VisqolManager must be initialized before use.")

    def _validate_input(self, ref_signal: AudioSignal,
                        deg_signal: AudioSignal):
        """
        Validate input audio signals.

        Only a sample-rate mismatch between the two signals is fatal; a
        duration mismatch or an unexpected sample rate for the current mode
        is logged as a warning.

        Raises:
            ValueError: If the two signals have different sample rates.
        """
        # Duration mismatch: warn only.
        ref_dur = ref_signal.duration
        deg_dur = deg_signal.duration
        if abs(ref_dur - deg_dur) > DURATION_MISMATCH_TOLERANCE:
            logger.warning(
                "Duration mismatch: reference=%.2fs, degraded=%.2fs",
                ref_dur, deg_dur
            )

        # Sample rates must match exactly.
        if ref_signal.sample_rate != deg_signal.sample_rate:
            raise ValueError(
                f"Sample rate mismatch: reference={ref_signal.sample_rate}, "
                f"degraded={deg_signal.sample_rate}"
            )

        # Mode-specific sample-rate expectations: warn only.
        if self.use_speech_mode:
            if ref_signal.sample_rate > K_16K_SAMPLE_RATE:
                logger.warning(
                    "Input sample rate (%d) is above 16kHz. "
                    "Consider resampling for speech mode.",
                    ref_signal.sample_rate
                )
        elif ref_signal.sample_rate != K_48K_SAMPLE_RATE:
            logger.warning(
                "Input sample rate (%d) is not 48kHz. "
                "This may affect audio mode scoring.",
                ref_signal.sample_rate
            )
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: visqol-python
3
+ Version: 3.3.3
4
+ Summary: ViSQOL - Virtual Speech Quality Objective Listener (Pure Python)
5
+ Home-page: https://github.com/talker93/visqol-python
6
+ Author: Shan Jiang
7
+ License-Expression: Apache-2.0
8
+ Project-URL: Homepage, https://github.com/talker93/visqol-python
9
+ Project-URL: Bug Reports, https://github.com/talker93/visqol-python/issues
10
+ Project-URL: Source, https://github.com/talker93/visqol-python
11
+ Project-URL: Original C++, https://github.com/google/visqol
12
+ Keywords: audio-quality,speech-quality,MOS,PESQ,POLQA,visqol,objective-metric,perceptual-quality
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Analysis
24
+ Classifier: Topic :: Scientific/Engineering
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: numpy>=1.20
29
+ Requires-Dist: scipy>=1.7
30
+ Requires-Dist: soundfile>=0.10
31
+ Requires-Dist: libsvm-official>=3.25
32
+ Dynamic: home-page
33
+ Dynamic: license-file
34
+ Dynamic: requires-python
35
+
36
+ # ViSQOL (Python)
37
+
38
+ A pure Python implementation of [Google's ViSQOL](https://github.com/google/visqol) (Virtual Speech Quality Objective Listener) v3.3.3 for objective audio/speech quality assessment.
39
+
40
+ ViSQOL compares a reference audio signal with a degraded version and outputs a **MOS-LQO** (Mean Opinion Score - Listening Quality Objective) score on a scale of **1.0 – 5.0**.
41
+
42
+ ## Features
43
+
44
+ - **Two modes**: Audio mode (music/general audio at 48 kHz) and Speech mode (speech at 16 kHz)
45
+ - **High accuracy**: 11/11 conformance tests pass against the official C++ implementation
46
+ - Audio mode: 9/10 tests produce **identical** MOS scores (diff = 0.000000), 1 test diff = 0.000117
47
+ - Speech mode: diff = 0.006715
48
+ - **Pure Python**: no C/C++ compilation required
49
+ - **Minimal dependencies**: only 4 pip packages (`numpy`, `scipy`, `soundfile`, `libsvm-official`)
50
+ - **Faster than real-time**: Audio RTF ≈ 0.71x, Speech RTF ≈ 0.38x
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install numpy scipy soundfile libsvm-official
56
+ ```
57
+
58
+ Or install as a package:
59
+
60
+ ```bash
61
+ git clone https://github.com/talker93/visqol-python.git
62
+ cd visqol-python
63
+ pip install -e .
64
+ ```
65
+
66
+ ## Quick Start
67
+
68
+ ### Python API
69
+
70
+ ```python
71
+ from visqol import VisqolApi
72
+
73
+ # Audio mode (default) - for music and general audio
74
+ api = VisqolApi()
75
+ api.create(mode="audio")
76
+ result = api.measure("reference.wav", "degraded.wav")
77
+ print(f"MOS-LQO: {result.moslqo:.4f}")
78
+
79
+ # Speech mode - for speech signals
80
+ api = VisqolApi()
81
+ api.create(mode="speech")
82
+ result = api.measure("ref_speech.wav", "deg_speech.wav")
83
+ print(f"MOS-LQO: {result.moslqo:.4f}")
84
+ ```
85
+
86
+ ### Using NumPy Arrays
87
+
88
+ ```python
89
+ import numpy as np
90
+ import soundfile as sf
91
+ from visqol import VisqolApi
92
+
93
+ ref, sr = sf.read("reference.wav")
94
+ deg, _ = sf.read("degraded.wav")
95
+
96
+ api = VisqolApi()
97
+ api.create(mode="audio")
98
+ result = api.measure_from_arrays(ref, deg, sample_rate=sr)
99
+ print(f"MOS-LQO: {result.moslqo:.4f}")
100
+ ```
101
+
102
+ ### Command Line
103
+
104
+ ```bash
105
+ # Audio mode (default)
106
+ python -m visqol -r reference.wav -d degraded.wav
107
+
108
+ # Speech mode
109
+ python -m visqol -r reference.wav -d degraded.wav --speech_mode
110
+
111
+ # Verbose output (per-patch details)
112
+ python -m visqol -r reference.wav -d degraded.wav -v
113
+ ```
114
+
115
+ **CLI options:**
116
+
117
+ | Flag | Description |
118
+ |------|-------------|
119
+ | `-r`, `--reference` | Path to reference WAV file (required) |
120
+ | `-d`, `--degraded` | Path to degraded WAV file (required) |
121
+ | `--speech_mode` | Use speech mode (16 kHz, polynomial mapping) |
122
+ | `--model` | Custom SVR model file path (audio mode only) |
123
+ | `--search_window` | Search window radius (default: 60) |
124
+ | `--verbose`, `-v` | Show detailed per-patch results |
125
+
126
+ ## Output
127
+
128
+ The `measure()` method returns a `SimilarityResult` object. Its main fields are:
129
+
130
+ | Field | Description |
131
+ |-------|-------------|
132
+ | `moslqo` | MOS-LQO score (1.0 – 5.0) |
133
+ | `vnsim` | Mean NSIM across all patches |
134
+ | `fvnsim` | Per-frequency-band mean NSIM |
135
+ | `fstdnsim` | Per-frequency-band std of NSIM |
136
+ | `fvdegenergy` | Per-frequency-band degraded energy |
137
+ | `patch_sims` | List of per-patch similarity details |
138
+
139
+ ## Modes
140
+
141
+ ### Audio Mode (default)
142
+ - Target sample rate: **48 kHz**
143
+ - 32 Gammatone frequency bands (50 Hz – 15 000 Hz)
144
+ - Quality mapping: SVR (Support Vector Regression) model
145
+ - Best for: music, environmental audio, codecs
146
+
147
+ ### Speech Mode
148
+ - Target sample rate: **16 kHz**
149
+ - 21 Gammatone frequency bands (50 Hz – 8 000 Hz)
150
+ - Quality mapping: exponential polynomial fit
151
+ - VAD (Voice Activity Detection) based patch selection
152
+ - Best for: speech, VoIP, telephony
153
+
154
+ ## Performance
155
+
156
+ Measured on Apple M-series, Python 3.13:
157
+
158
+ | Mode | Avg RTF | Typical Time |
159
+ |------|---------|-------------|
160
+ | Audio (48 kHz) | **0.71x** | 7 – 12 s per file pair |
161
+ | Speech (16 kHz) | **0.38x** | ~1 s per file pair |
162
+
163
+ > RTF (Real-Time Factor) < 1.0 means faster than real-time.
164
+
165
+ ## Project Structure
166
+
167
+ ```
168
+ visqol-python/
169
+ ├── visqol/ # Main package
170
+ │ ├── __init__.py # Package exports
171
+ │ ├── api.py # Public API
172
+ │ ├── visqol_manager.py # Pipeline orchestrator
173
+ │ ├── visqol_core.py # Core algorithm
174
+ │ ├── audio_utils.py # Audio I/O & SPL normalization
175
+ │ ├── signal_utils.py # Envelope, cross-correlation
176
+ │ ├── analysis_window.py # Hann window
177
+ │ ├── gammatone.py # ERB + Gammatone filterbank + spectrogram
178
+ │ ├── patch_creator.py # Patch creation (Image + VAD modes)
179
+ │ ├── patch_selector.py # DP-based optimal patch matching
180
+ │ ├── alignment.py # Global alignment via cross-correlation
181
+ │ ├── nsim.py # NSIM similarity metric
182
+ │ ├── quality_mapper.py # SVR & exponential quality mapping
183
+ │ └── __main__.py # CLI entry point
184
+ ├── model/ # Bundled SVR model
185
+ │ └── libsvm_nu_svr_model.txt
186
+ ├── tests/ # Conformance tests
187
+ │ ├── test_conformance.py
188
+ │ └── test_quick.py
189
+ ├── setup.py
190
+ ├── requirements.txt
191
+ ├── LICENSE
192
+ └── README.md
193
+ ```
194
+
195
+ ## Conformance Test Results
196
+
197
+ Tested against the [official C++ ViSQOL v3.3.3](https://github.com/google/visqol) expected values:
198
+
199
+ | Test Case | Mode | Expected MOS | Python MOS | Δ |
200
+ |-----------|------|-------------|------------|---|
201
+ | strauss_lp35 | Audio | 1.3889 | 1.3889 | 0.000000 |
202
+ | steely_lp7 | Audio | 2.2502 | 2.2502 | 0.000000 |
203
+ | sopr_256aac | Audio | 4.6823 | 4.6823 | 0.000000 |
204
+ | ravel_128opus | Audio | 4.4651 | 4.4651 | 0.000000 |
205
+ | moonlight_128aac | Audio | 4.6843 | 4.6843 | 0.000000 |
206
+ | harpsichord_96mp3 | Audio | 4.2237 | 4.2237 | 0.000000 |
207
+ | guitar_64aac | Audio | 4.3497 | 4.3497 | 0.000000 |
208
+ | glock_48aac | Audio | 4.3325 | 4.3325 | 0.000000 |
209
+ | contrabassoon_24aac | Audio | 2.3469 | 2.3468 | 0.000117 |
210
+ | castanets_identity | Audio | 4.7321 | 4.7321 | 0.000000 |
211
+ | speech_CA01 | Speech | 3.3745 | 3.3678 | 0.006715 |
212
+
213
+ ## References
214
+
215
+ - [Google ViSQOL (C++)](https://github.com/google/visqol) — the original implementation this project is ported from
216
+ - Hines, A., Gillen, E., Kelly, D., Skoglund, J., Kokaram, A., & Harte, N. (2015). *ViSQOLAudio: An Objective Audio Quality Metric for Low Bitrate Codecs.* The Journal of the Acoustical Society of America.
217
+ - Chinen, M., Lim, F. S., Skoglund, J., Gureev, N., O'Gorman, F., & Hines, A. (2020). *ViSQOL v3: An Open Source Production Ready Objective Speech and Audio Metric.* 2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX).
218
+
219
+ ## License
220
+
221
+ Apache License 2.0. See [LICENSE](LICENSE) for details.
222
+
223
+ This project is a Python port of [Google's ViSQOL](https://github.com/google/visqol), which is also licensed under Apache 2.0.
@@ -0,0 +1,21 @@
1
+ visqol/__init__.py,sha256=0yqtDr50SFgAI67HA6i83B97YPvMEsjhQcYbimV6YfM,539
2
+ visqol/__main__.py,sha256=z3vXtZBKrQEnoUmvxxws58swBQuTY7CBHi5unHSSCCE,2776
3
+ visqol/alignment.py,sha256=I78OZzQ4N-zlr60WYQlYNVPDe-hFYzWfXIZGw2yfx0s,2664
4
+ visqol/analysis_window.py,sha256=j2P9ppg1PVDY5hY7c_mlLlbWn5-zePtsKO5NmQPHYuk,1659
5
+ visqol/api.py,sha256=pLqpThyr0ndq-e3UARv8FBgPJ7kN_u8FAafpu5i_078,3828
6
+ visqol/audio_utils.py,sha256=6iNrVJjKpxj7xd2LzbbLbvg0jdaSHuyhO4BJ7qR5kUY,2479
7
+ visqol/gammatone.py,sha256=fuM0zdmW5DLPxBa-iF_RamlQpPt6yClquSND6PEuBoA,13228
8
+ visqol/nsim.py,sha256=64qmZ6dHbSw1y_COJIcRK2YNcfJA4wNDopHcVLkEKHk,4789
9
+ visqol/patch_creator.py,sha256=bu6bKmBtUWn6Co9XiHsnnfQPjNmwaFhkMRB46ZlWahI,7591
10
+ visqol/patch_selector.py,sha256=p4YsV0qEaa54Eif3ENoUhtlmyEIOb5FixEdpT-rWXtw,12813
11
+ visqol/quality_mapper.py,sha256=0qgdIzwnwIqhsTScz2_6EW9hFZpUa_0-IeQnLqnhw30,3831
12
+ visqol/signal_utils.py,sha256=q3QfLb2Cy2uTYXMy9skgiASYKJe5klTrDDg8rWMalsI,2495
13
+ visqol/visqol_core.py,sha256=I9Qp2fzVe6n203t3HHPIuXThIM2PVXg5XfrQUH74PZE,8034
14
+ visqol/visqol_manager.py,sha256=roUk9hD5o613lUQ0kaxxQ3fwU4JOqqNz6OmfpN5vHfM,6527
15
+ visqol/model/libsvm_nu_svr_model.txt,sha256=HoJG7TO_NtxchZNR9xEPLNMfmGYZiXFcD8-XTsSNPi4,138117
16
+ visqol_python-3.3.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
+ visqol_python-3.3.3.dist-info/METADATA,sha256=V90XyFp1SnRj28t3BhYOVtXPe3IE7lnrzxKwGspcuvU,8217
18
+ visqol_python-3.3.3.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
19
+ visqol_python-3.3.3.dist-info/entry_points.txt,sha256=Dl7miY_7U-116DDs7JJsrD_XhQZf5r9nfdm4N3ocpBQ,48
20
+ visqol_python-3.3.3.dist-info/top_level.txt,sha256=JAzbSsJqgc6Ol-wc6r_xHonHsWVM-200RFAwuSMp0EY,7
21
+ visqol_python-3.3.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ visqol = visqol.__main__:main