sox-tensorflow 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ License
2
+ ================================================================================
3
+ The source code in this repository and the data made available through this repo / tool are licensed separately.
4
+
5
+ Source code
6
+ --------------------------------------------------------------------------------
7
+ Source code is made available under the BSD License:
8
+
9
+ Copyright 2024 (c) Regents of University of California ([The Eric and Wendy Schmidt Center for Data Science and the Environment at UC Berkeley](https://dse.berkeley.edu/), [Benioff Ocean Science Laboratory](https://bosl.ucsb.edu/)).
10
+
11
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
12
+
13
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
14
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
15
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18
+
19
+ Copyright 2024 (c) Regents of University of California ([The Eric and Wendy Schmidt Center for Data Science and the Environment at UC Berkeley](https://dse.berkeley.edu/)).
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: sox_tensorflow
3
+ Version: 0.0.1
4
+ Summary: DESCRIPTION
5
+ Author-email: Brookie Guzder-Williams <bguzder-williams@berkeley.edu>
6
+ License: CC-BY-4.0
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.11
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE.md
16
+ Dynamic: license-file
17
+
18
+ # SOX GPU
19
+
20
+ Generating SOX-style spectrograms on a gpu
21
+
22
+ - sox: https://github.com/chirlu/sox
23
+ - pysox: https://github.com/marl/pysox
24
+ - audio samples:
25
+ * https://storage.googleapis.com/dse-soundhub-public/data/sample_audio/20230522_000000.flac
26
+ * https://storage.googleapis.com/dse-soundhub-public/data/sample_audio/20230526_000000.flac
27
+ - pnw-cnet-model: https://storage.googleapis.com/dse-soundhub-public/models/pnw-cnet/PNW-Cnet_v4_TF.h5
28
+
29
+
30
+ ### PNW MODEL
31
+
32
+ **For PNW we need**
33
+
34
+ ```bash
35
+ export TF_USE_LEGACY_KERAS=1
36
+ ```
37
+
38
+ This environment variable forces TensorFlow 2.16+ to use the legacy Keras implementation instead of the new Keras 3, which maintains compatibility with H5 models saved in older TensorFlow versions. The newer Keras 3 has stricter input shape validation that can break when loading older model files.
39
+
40
+ ---
41
+
42
+ ## QUICK START
43
+
44
+ Usage example
45
+
46
+ ---
47
+
48
+ ## DOCUMENTATION
49
+
50
+ API Docs
51
+
52
+ ---
53
+
54
+ ## STYLE-GUIDE
55
+
56
+ Following PEP8. See [setup.cfg](./setup.cfg) for exceptions. Keeping honest with `pycodestyle .`
@@ -0,0 +1,39 @@
1
+ # SOX GPU
2
+
3
+ Generating SOX-style spectrograms on a gpu
4
+
5
+ - sox: https://github.com/chirlu/sox
6
+ - pysox: https://github.com/marl/pysox
7
+ - audio samples:
8
+ * https://storage.googleapis.com/dse-soundhub-public/data/sample_audio/20230522_000000.flac
9
+ * https://storage.googleapis.com/dse-soundhub-public/data/sample_audio/20230526_000000.flac
10
+ - pnw-cnet-model: https://storage.googleapis.com/dse-soundhub-public/models/pnw-cnet/PNW-Cnet_v4_TF.h5
11
+
12
+
13
+ ### PNW MODEL
14
+
15
+ **For PNW we need**
16
+
17
+ ```bash
18
+ export TF_USE_LEGACY_KERAS=1
19
+ ```
20
+
21
+ This environment variable forces TensorFlow 2.16+ to use the legacy Keras implementation instead of the new Keras 3, which maintains compatibility with H5 models saved in older TensorFlow versions. The newer Keras 3 has stricter input shape validation that can break when loading older model files.
22
+
23
+ ---
24
+
25
+ ## QUICK START
26
+
27
+ Usage example
28
+
29
+ ---
30
+
31
+ ## DOCUMENTATION
32
+
33
+ API Docs
34
+
35
+ ---
36
+
37
+ ## STYLE-GUIDE
38
+
39
+ Following PEP8. See [setup.cfg](./setup.cfg) for exceptions. Keeping honest with `pycodestyle .`
@@ -0,0 +1,72 @@
1
+ [build-system]
2
+ requires = ["setuptools", "build"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sox_tensorflow"
7
+ version = "0.0.1"
8
+ readme = "README.md"
9
+ description = "DESCRIPTION"
10
+ license = {text = "CC-BY-4.0"}
11
+ authors = [
12
+ {name = "Brookie Guzder-Williams", email = "bguzder-williams@berkeley.edu"}
13
+ ]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Topic :: Scientific/Engineering"
21
+ ]
22
+ requires-python = ">=3.11"
23
+ dependencies = []
24
+
25
+ [tool.setuptools]
26
+ packages = [
27
+ "sox_tensorflow",
28
+ ]
29
+
30
+ [tool.pixi.workspace]
31
+ channels = ["conda-forge"]
32
+ platforms = ["osx-arm64", "linux-64", "osx-64", "linux-aarch64"]
33
+
34
+ [tool.pixi.pypi-dependencies]
35
+ sox_tensorflow = { path = ".", editable = true }
36
+ soundfile = ">=0.13.1,<0.14"
37
+ soxr = ">=0.3.0"
38
+
39
+ # Dependencies
40
+ [tool.pixi.target.linux-64.pypi-dependencies]
41
+ tensorflow = ">=2.18.0"
42
+ tf_keras = "*"
43
+
44
+ [tool.pixi.target.osx-64.pypi-dependencies]
45
+ tensorflow = ">=2.16.0,<2.17.0"
46
+ tf_keras = ">=2.16.0,<3"
47
+
48
+ [tool.pixi.target.linux-aarch64.pypi-dependencies]
49
+ tensorflow = ">=2.18.0"
50
+ tf_keras = "*"
51
+
52
+ [tool.pixi.target.osx-arm64.pypi-dependencies]
53
+ tensorflow-macos = ">=2.16.0,<2.17.0"
54
+ tensorflow-metal = ">=1.2.0,<2"
55
+ tf_keras = ">=2.16.0,<3"
56
+
57
+ [tool.pixi.dependencies]
58
+ python = ">=3.11,<3.12"
59
+ numpy = ">=1.26.0,<1.27"
60
+ pillow = ">=10.0.0"
61
+
62
+
63
+ [tool.pixi.environments]
64
+ default = { features = [] }
65
+ dev = { features = ["dev"] }
66
+
67
+ [tool.pixi.feature.dev.dependencies]
68
+ twine = "*"
69
+
70
+ [tool.pixi.feature.dev.pypi-dependencies]
71
+ build = "*"
72
+ h5py = ">=3.15.1,<4"
@@ -0,0 +1,8 @@
1
+ [pycodestyle]
2
+ max-line-length = 100
3
+ ignore = E125,E128,E502,E731,E722,E402
4
+
5
+ [egg_info]
6
+ tag_build =
7
+ tag_date = 0
8
+
@@ -0,0 +1 @@
1
+ # __init__.py
@@ -0,0 +1,624 @@
1
+ """
2
+ Sox-compatible spectrogram generation using TensorFlow.
3
+
4
+ This module provides a TensorFlow implementation that generates spectrograms matching sox output
5
+ with 99.99%+ pixel accuracy. The main entry point is `spectrogram()`.
6
+
7
+ Example usage:
8
+ import tensorflow as tf
9
+ from sox_tensorflow.processor import spectrogram
10
+
11
+ # From TensorFlow tensor
12
+ audio_tensor = tf.random.normal([96000]) # 12 seconds at 8kHz
13
+ spec = spectrogram(audio_tensor, shape=(257, 1000), sample_rate=8000)
14
+
15
+ # From numpy array (converted internally)
16
+ spec = spectrogram(audio_array, shape=(257, 1000), sample_rate=48000)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ import subprocess
23
+ import tempfile
24
+ import numpy as np
25
+ from pathlib import Path
26
+ from typing import List, Optional, Tuple, Union
27
+ import soxr
28
+ import soundfile as sf
29
+ import tensorflow as tf
30
+
31
+
32
+ #
33
+ # CONSTANTS
34
+ #
35
+ DEFAULT_DURATION: int = 12
36
+ DEFAULT_SHAPE: Tuple[int, int] = (257, 1000)
37
+ DEFAULT_DB_RANGE: int = 90
38
+
39
+
40
+ #
41
+ # PUBLIC
42
+ #
43
def load_audio(
    flac_path: str,
    start_time: Optional[float] = None,
    segment: Optional[int] = None,
    duration: Optional[float] = None,
    channel: Optional[int] = 0
) -> Tuple[tf.Tensor, int]:
    """
    utility wrapper to read flac file into tf-tensor using soundfile

    Args:
        flac_path: Path to FLAC file
        start_time: Start time in seconds (defaults to 0.0 when omitted)
        segment (int): overrides <start_time> to be the <segment>-th (0-based) <duration> clip
        duration: Duration in seconds (None reads through to the end of the file)
        channel: channel to extract (0 for mono); None keeps all channels (2-D result)

    Returns (tuple):
        tf-tensor, sample-rate

    Raises:
        ValueError: if <segment> is given without <duration>
    """
    if segment is not None:
        # segment positioning is meaningless without a clip length
        if duration is None:
            raise ValueError("duration is required when segment is specified")
        start_time = segment * duration
    if start_time is None:
        # previously start_time=None with a duration raised TypeError below
        start_time = 0.0

    # BUGFIX: start_sample/num_samples used to be assigned only inside the
    # duration branch, so duration=None raised NameError at sf.read().
    sample_rate = sf.info(flac_path).samplerate
    start_sample = round(start_time * sample_rate)
    # soundfile convention: frames=-1 means "read to end of file"
    num_samples = -1 if duration is None else round(duration * sample_rate)

    samples, sr = sf.read(
        flac_path,
        start=start_sample,
        frames=num_samples,
        dtype="float64",
        always_2d=True,
    )

    if channel is not None:
        samples = samples[:, channel]

    return tf.constant(samples, dtype=tf.float64), sr
85
+
86
+
87
def spectrogram_from_flac(
    flac_path: str,
    start_time: Optional[float] = None,
    segment: Optional[int] = None,
    duration: Optional[float] = DEFAULT_DURATION,
    channel: Optional[int] = 0,
    shape: Tuple[int, int] = DEFAULT_SHAPE,
    dest: Optional[Union[str, Path]] = None,
    db_range: int = DEFAULT_DB_RANGE,
) -> Union[tf.Tensor, str]:
    """
    Generate a sox-matching spectrogram straight from a FLAC file.

    Thin convenience wrapper: reads the requested clip with `load_audio`
    and hands the resulting tensor to `spectrogram`.

    Args:
        flac_path: Path to FLAC file
        start_time: Start time in seconds
        segment (int): overrides <start_time> to be the <segment>-th (0-based) <duration> clip
        duration: Duration in seconds
        channel: channel to extract (0 for mono)
        shape: Output shape as (height, width)
        dest: Optional output path for PNG
        db_range: Dynamic range in dB

    Returns:
        If dest is None: TensorFlow tensor (uint8)
        If dest is provided: path to saved PNG
    """
    audio, sr = load_audio(
        flac_path=flac_path,
        start_time=start_time,
        segment=segment,
        duration=duration,
        channel=channel,
    )
    return spectrogram(
        audio_array=audio,
        shape=shape,
        dest=dest,
        sample_rate=sr,
        db_range=db_range,
    )
127
+
128
+
129
def spectrogram(
    audio_array: Union[tf.Tensor, np.ndarray],
    shape: Tuple[int, int],
    dest: Optional[Union[str, Path]] = None,
    segment: Optional[int] = None,
    segment_duration: Optional[float] = None,
    segment_overlap: Optional[float] = None,
    sample_rate: Optional[int] = None,
    output_sample_rate: int = 8000,
    create_parents: bool = True,
    overwrite: bool = True,
    db_range: int = 90,
) -> Union[tf.Tensor, str]:
    """
    Create a Sox-matching spectrogram using TensorFlow.

    Produces output matching the sox binary with 99.99%+ pixel accuracy,
    using TensorFlow ops for GPU acceleration where available.

    Args:
        audio_array: Audio as TF tensor or numpy array; float32/float64 in
            [-1, 1] or int32 (sox format).
        shape: (height, width). Height fixes the DFT size (2 * (height - 1));
            width fixes the number of time columns.
        dest: If given, save a PNG there and return the path; otherwise
            return the pixel tensor.
        segment: 0-indexed segment number to extract from the audio.
        segment_duration: Length of each segment in seconds.
        segment_overlap: Overlap between segments in seconds (default 0).
        sample_rate: Input sample rate in Hz (required).
        output_sample_rate: Rate used for spectrogram generation (default 8000).
        create_parents: Make missing parent directories for dest.
        overwrite: Allow replacing an existing file at dest.
        db_range: Dynamic range in dB (default 90).

    Returns:
        uint8 tensor of shape (height, width) when dest is None, else the
        string path of the written PNG.

    Raises:
        ValueError: missing sample_rate, or segment without segment_duration.
        FileExistsError: dest exists and overwrite is False.

    Example:
        >>> audio = tf.random.normal([96000], dtype=tf.float32)  # 12s at 8kHz
        >>> pixels = spectrogram(audio, shape=(257, 1000), sample_rate=8000)
        >>> pixels.shape
        TensorShape([257, 1000])
    """
    # Validate input and slice out the requested segment (if any).
    samples, sr = _extract_audio_samples(
        audio_array=audio_array,
        sample_rate=sample_rate,
        segment=segment,
        segment_duration=segment_duration,
        segment_overlap=segment_overlap,
    )

    # Normalize everything to a float64 TF tensor.
    if isinstance(samples, tf.Tensor):
        if samples.dtype != tf.float64:
            samples = tf.cast(samples, tf.float64)
    else:
        samples = tf.constant(samples, dtype=tf.float64)

    # Bring the audio to the working sample rate.
    if sr != output_sample_rate:
        samples = _resample(samples, in_rate=sr, out_rate=output_sample_rate)

    y_size, x_size = shape
    pixels = _generate_spectrogram_tf(
        samples=samples,
        sample_rate=output_sample_rate,
        x_size=x_size,
        y_size=y_size,
        db_range=db_range,
    )

    # No destination: hand back the raw pixel tensor.
    if dest is None:
        return pixels

    out_path = Path(dest)
    if out_path.exists() and not overwrite:
        raise FileExistsError(f"File already exists: {dest}")
    if create_parents:
        out_path.parent.mkdir(parents=True, exist_ok=True)
    _write_png_tf(pixels, y_size, str(out_path))
    return str(out_path)
214
+
215
+
216
+
217
+ # ==================================================================================================
218
+ # INTERNAL: Audio Processing
219
+ # ==================================================================================================
220
+
221
+
222
+ def _extract_audio_samples(
223
+ audio_array: Union[tf.Tensor, np.ndarray],
224
+ sample_rate: Optional[int],
225
+ segment: Optional[int],
226
+ segment_duration: Optional[float],
227
+ segment_overlap: Optional[float],
228
+ ) -> Tuple[Union[tf.Tensor, np.ndarray], int]:
229
+ """Extract and validate audio samples from input."""
230
+ if sample_rate is None:
231
+ raise ValueError("sample_rate is required for tensor/array input")
232
+ sr = sample_rate
233
+ samples = audio_array
234
+
235
+ # Handle segmentation
236
+ if segment is not None:
237
+ if segment_duration is None:
238
+ raise ValueError("segment_duration required when segment is specified")
239
+ overlap = segment_overlap or 0.0
240
+ start_sample = round(segment * (segment_duration - overlap) * sr)
241
+ num_samples = round(segment_duration * sr)
242
+ samples = samples[start_sample:start_sample + num_samples]
243
+
244
+ return samples, sr
245
+
246
+
247
def _resample(
    samples: tf.Tensor,
    in_rate: int,
    out_rate: int,
) -> tf.Tensor:
    """
    Resample audio with the soxr library (SoX Resampler).

    soxr delivers the same high-quality resampling as the sox binary
    (99.8%+ spectrogram pixel match) roughly 3x faster and without a
    subprocess round-trip. Quality 'HQ' corresponds to sox's -h flag.

    Args:
        samples: Audio samples as a float64 TensorFlow tensor
        in_rate: Input sample rate in Hz
        out_rate: Output sample rate in Hz

    Returns:
        Resampled float64 tensor, clipped to [-1, 1] — soxr can overshoot
        on transients, and sox clips, so we match that behavior.
    """
    resampled = soxr.resample(samples.numpy(), in_rate, out_rate, quality='HQ')
    clipped = np.clip(resampled, -1.0, 1.0)
    return tf.constant(clipped, dtype=tf.float64)
277
+
278
+
279
+ # ==================================================================================================
280
+ # INTERNAL: TensorFlow Spectrogram Generation
281
+ # ==================================================================================================
282
+
283
+
284
def _generate_spectrogram_tf(
    samples: tf.Tensor,
    sample_rate: int,
    x_size: int,
    y_size: int,
    db_range: int,
) -> tf.Tensor:
    """
    Generate spectrogram using TensorFlow operations.

    Replicates sox's spectrogram algorithm using TensorFlow for potential GPU acceleration.

    Args:
        samples: float64 audio tensor at `sample_rate`
        sample_rate: sample rate of `samples` in Hz
        x_size: number of output time columns
        y_size: number of output frequency rows (DFT size = 2 * (y_size - 1))
        db_range: dynamic range in dB passed to the pixel renderer

    Returns:
        uint8 pixel tensor of shape (y_size, x_size)
    """
    # Calculate parameters (matching sox)
    dft_size = 2 * (y_size - 1)  # 512 for y_size=257
    rows = dft_size // 2 + 1  # 257

    # How many output columns we must fit per second of audio.
    duration = tf.cast(tf.shape(samples)[0], tf.float64) / tf.cast(sample_rate, tf.float64)
    pixels_per_sec = tf.cast(x_size, tf.float64) / duration

    # Create Hann window with sox normalization
    # NOTE(review): _fft_loop_tf builds its own per-frame windows via
    # _create_windows_optimized; this tensor is passed through but appears
    # unused there — kept for interface compatibility. Confirm before removing.
    window = _make_hann_window_tf(dft_size)

    # Calculate step_size and block_steps (sox algorithm)
    # IMPORTANT: Use the UNNORMALIZED Hann window sum for step_size calculation
    # The unnormalized Hann window sum is dft_size/2
    window_sum_unnormalized = tf.cast(dft_size, tf.float64) / 2.0
    step_size = tf.cast(tf.round(window_sum_unnormalized), tf.int32)
    block_steps_float = tf.cast(sample_rate, tf.float64) / pixels_per_sec
    # Shrink step_size so an integer number of steps covers each output column
    # (sox: step = round(block / ceil(block / step))).
    step_size = tf.cast(
        tf.round(block_steps_float / tf.math.ceil(block_steps_float / tf.cast(step_size, tf.float64))),
        tf.int32
    )
    # Number of FFT steps averaged into one column, and its normalization.
    block_steps = tf.cast(tf.round(block_steps_float / tf.cast(step_size, tf.float64)), tf.int32)
    block_norm = 1.0 / tf.cast(block_steps, tf.float64)

    # Process audio through FFT loop
    all_dBfs = _fft_loop_tf(
        samples=samples,
        window=window,
        dft_size=dft_size,
        step_size=step_size,
        block_steps=block_steps,
        block_norm=block_norm,
        rows=rows,
        x_size=x_size,
    )

    # Convert dBfs to pixel values
    pixels = _render_pixels_tf(all_dBfs, db_range, rows)

    return pixels
335
+
336
+
337
def _make_hann_window_tf(dft_size: int) -> tf.Tensor:
    """Create a Hann window scaled with sox's normalization.

    The raw taper is h[i] = 0.5 - 0.5*cos(2*pi*i/(n-1)); sox then rescales
    it by 2/sum(h) * ((n-1)/dft_size)^2 so spectral magnitudes match its
    reference implementation.
    """
    idx = tf.range(dft_size, dtype=tf.float64)
    span = tf.cast(dft_size - 1, tf.float64)
    hann = 0.5 - 0.5 * tf.cos(2.0 * np.pi * idx / span)

    # Sox normalization: window *= 2/sum * ((n-1)/dft_size)^2
    scale = 2.0 / tf.reduce_sum(hann) * tf.square(span / tf.cast(dft_size, tf.float64))
    return hann * scale
351
+
352
+
353
+ def _make_window_vectorized(dft_size: int, end_val: int) -> np.ndarray:
354
+ """
355
+ Create Hann window with sox-specific edge handling.
356
+
357
+ Matches sox's make_window() exactly, supporting partial windows
358
+ for edge frames at the start and end of the signal.
359
+
360
+ Args:
361
+ dft_size: FFT size (e.g., 512)
362
+ end_val: Edge parameter. Positive = start edge, negative = end edge, 0 = full window
363
+
364
+ Returns:
365
+ Window array of shape (dft_size,)
366
+ """
367
+ w = np.zeros(dft_size + 1, dtype=np.float32)
368
+
369
+ w_start = 0 if end_val < 0 else end_val
370
+ n = 1 + dft_size - abs(end_val)
371
+
372
+ if n <= 0:
373
+ return w[:dft_size]
374
+
375
+ # Initialize window region to 1.0
376
+ for i in range(n):
377
+ if w_start + i < len(w):
378
+ w[w_start + i] = 1.0
379
+
380
+ # Apply Hann window: h[i] = 0.5 - 0.5 * cos(2*pi*i/(n-1))
381
+ m = n - 1
382
+ if m > 0:
383
+ for i in range(n):
384
+ if w_start + i < len(w):
385
+ x = 2.0 * np.pi * i / m
386
+ w[w_start + i] *= 0.5 - 0.5 * np.cos(x)
387
+
388
+ # Calculate sum for normalization
389
+ window_sum = np.sum(w[:dft_size])
390
+
391
+ # Sox normalization: window *= 2/sum * ((n-1)/dft_size)^2
392
+ n -= 1
393
+ if window_sum > 0:
394
+ norm_factor = 2.0 / window_sum * (n / dft_size) ** 2
395
+ w[:dft_size] *= norm_factor
396
+
397
+ return w[:dft_size]
398
+
399
+
400
+ def _compute_end_values(x_size: int, dft_size: int, step_size: int) -> np.ndarray:
401
+ """
402
+ Compute edge parameter (end value) for each frame.
403
+
404
+ Sox uses partial Hann windows at the start and end of the signal.
405
+ This function computes the end values for all frames, including
406
+ the main phase and drain phase.
407
+
408
+ Args:
409
+ x_size: Number of output columns (frames)
410
+ dft_size: FFT size
411
+ step_size: Step size between frames
412
+
413
+ Returns:
414
+ Array of end values for each frame
415
+ """
416
+ # initial_read starts negative: (step_size - dft_size) // 2 = -208
417
+ # Before first FFT, we consume: step_size - initial_read = 96 - (-208) = 304 samples
418
+ initial_read = (step_size - dft_size) // 2
419
+ initial_samples = step_size - initial_read # 304 samples before first FFT
420
+
421
+ end_values = []
422
+
423
+ # Main phase: frames consuming actual samples
424
+ main_frames = x_size - 3 # Reserve 3 for drain phase
425
+
426
+ for i in range(main_frames):
427
+ # After frame i, total samples consumed = initial_samples + i * step_size
428
+ samples_consumed = initial_samples + i * step_size
429
+ end = max(dft_size - samples_consumed, 0)
430
+ end_values.append(end)
431
+
432
+ # Drain phase: 3 frames with decreasing window coverage
433
+ # These frames process zero-padded tail of the signal
434
+ # end values: -16, -112, -208
435
+ end_values.extend([-16, -112, -208])
436
+
437
+ return np.array(end_values, dtype=np.int32)
438
+
439
+
440
def _create_windows_optimized(x_size: int, dft_size: int, step_size: int) -> np.ndarray:
    """
    Build the per-frame analysis windows.

    Every interior frame shares the same full Hann window; only the first
    few frames (start edge) and the final three (drain phase) need partial
    windows. Exploiting that avoids building x_size windows in a loop.

    Args:
        x_size: Number of frames
        dft_size: FFT size
        step_size: Step size between frames

    Returns:
        Window array of shape (x_size, dft_size)
    """
    initial_read = (step_size - dft_size) // 2
    consumed_before_first = step_size - initial_read

    # Start from the full (end=0) window replicated across all frames.
    windows = np.tile(_make_window_vectorized(dft_size, 0), (x_size, 1))

    # Start edge: frames whose window is not yet fully covered by samples.
    for frame in range(x_size - 3):
        end = dft_size - (consumed_before_first + frame * step_size)
        if end <= 0:
            break  # every later main-phase frame keeps the full window
        windows[frame] = _make_window_vectorized(dft_size, end)

    # Drain phase: the last three frames use fixed negative end values.
    for offset, end in zip((-3, -2, -1), (-16, -112, -208)):
        windows[offset] = _make_window_vectorized(dft_size, end)

    return windows
482
+
483
+
484
def _fft_loop_tf(
    samples: tf.Tensor,
    window: tf.Tensor,
    dft_size: int,
    step_size: tf.Tensor,
    block_steps: tf.Tensor,
    block_norm: tf.Tensor,
    rows: int,
    x_size: int,
) -> tf.Tensor:
    """
    Vectorized FFT processing for GPU acceleration.

    Extracts all frames at once, applies per-frame windows, and computes the
    FFT in one batch for efficient GPU execution.

    The edge handling matches sox's spectrogram algorithm exactly:
    - First ~3 frames: partial windows (start edge)
    - Middle frames: full Hann window
    - Last 3 frames: partial windows (drain phase)

    Args:
        samples: float64 audio tensor
        window: precomputed full Hann window — NOTE(review): not referenced in
            this body (per-frame windows are rebuilt below); confirm before removing
        dft_size: FFT size
        step_size: scalar int32 tensor, samples between frame starts
        block_steps: scalar int32 tensor — NOTE(review): converted but unused here
        block_norm: scalar float64 tensor, magnitude normalization factor
        rows: number of FFT bins — NOTE(review): only used in the shape comment
        x_size: number of frames (output columns)

    Returns:
        float32 dBfs tensor of shape (x_size, rows), floored at -200 dB
    """
    # Pull scalar parameters out of their tensors for numpy-side work.
    step_size_val = int(step_size.numpy())
    block_steps_val = int(block_steps.numpy())
    block_norm_val = float(block_norm.numpy())

    # Convert to numpy for frame extraction
    samples_np = samples.numpy().astype(np.float32)

    # Create optimized windows (only creates partial windows for edge frames)
    windows = _create_windows_optimized(x_size, dft_size, step_size_val)

    # Calculate padding for frame extraction; initial_read is negative, so
    # pad_left shifts the first frame to start before the signal.
    initial_read = (step_size_val - dft_size) // 2
    pad_left = -initial_read  # 208

    # Pad right for drain phase
    pad_right = dft_size + step_size_val

    # Pad audio with zeros on both sides so every frame index is valid.
    audio_padded = np.concatenate([
        np.zeros(pad_left, dtype=np.float32),
        samples_np,
        np.zeros(pad_right, dtype=np.float32)
    ])

    # Extract all frames at once using advanced indexing
    frame_starts = np.arange(x_size) * step_size_val
    indices = frame_starts[:, np.newaxis] + np.arange(dft_size)
    frames = audio_padded[indices]  # Shape: (x_size, dft_size)

    # Apply per-frame windows
    windowed = frames * windows

    # Convert to TensorFlow and compute FFT (batched over frames)
    windowed_tf = tf.constant(windowed, dtype=tf.float32)
    fft_out = tf.signal.rfft(windowed_tf)

    # Compute magnitude squared
    magnitudes = tf.abs(fft_out) ** 2  # Shape: (x_size, rows)

    # Apply block normalization
    magnitudes = magnitudes * block_norm_val

    # Convert to dB: 10 * log10(mag); epsilon guards log(0)
    epsilon = 1e-20
    dBfs = 10.0 * tf.math.log(magnitudes + epsilon) / tf.math.log(10.0)

    # Clip minimum to -200 dB
    dBfs = tf.maximum(dBfs, -200.0)

    return tf.cast(dBfs, tf.float32)
555
+
556
+
557
def _render_pixels_tf(all_dBfs: tf.Tensor, db_range: int, rows: int) -> tf.Tensor:
    """Map dBfs values onto sox's indexed-palette pixel values.

    The palette has 4 fixed entries (background/text/labels/grid) followed
    by 251 spectrum shades. A dB value maps linearly from [-db_range, 0]
    into the spectrum range, then the fixed-palette offset is added.
    The result is transposed and flipped so row 0 is the highest frequency.

    Note: `rows` is accepted for interface compatibility; the output shape
    comes from `all_dBfs` itself.
    """
    spectrum_points = 251
    fixed_palette = 4

    # Linear map: dB/db_range in [-1, 0] -> index in [1, spectrum_points - 1]
    color = 1.0 + (1.0 + all_dBfs / float(db_range)) * (spectrum_points - 2)
    color = tf.clip_by_value(color, 0, spectrum_points - 1)

    # Boundary conditions: below range -> background (0); >= 0 dB -> top shade
    color = tf.where(all_dBfs < -db_range, tf.zeros_like(color), color)
    color = tf.where(
        all_dBfs >= 0,
        tf.fill(tf.shape(color), float(spectrum_points - 1)),
        color,
    )

    # Offset past the fixed palette entries and narrow to uint8 indices.
    pixel_values = tf.cast(tf.cast(color, tf.int32) + fixed_palette, tf.uint8)

    # Our rows run DC -> Nyquist; sox draws the highest frequency on top.
    return tf.reverse(tf.transpose(pixel_values), axis=[0])
587
+
588
+
589
+ # ==================================================================================================
590
+ # INTERNAL: PNG Output
591
+ # ==================================================================================================
592
+
593
+
594
+ def _create_palette_flat(spectrum_points: int = 251) -> List[int]:
595
+ """Create grayscale palette matching sox as flat RGB list."""
596
+ palette = []
597
+
598
+ # Fixed palette entries
599
+ palette.extend([0, 0, 0]) # Background
600
+ palette.extend([255, 255, 255]) # Text
601
+ palette.extend([191, 191, 191]) # Labels
602
+ palette.extend([127, 127, 127]) # Grid
603
+
604
+ # Spectrum palette (grayscale)
605
+ for i in range(spectrum_points):
606
+ x = i / (spectrum_points - 1)
607
+ gray = int(0.5 + 255 * x)
608
+ palette.extend([gray, gray, gray])
609
+
610
+ return palette
611
+
612
+
613
def _write_png_tf(pixels: tf.Tensor, y_size: int, output_path: str) -> None:
    """Save the spectrogram tensor as an indexed (palette-mode) PNG.

    Note: `y_size` is accepted for interface compatibility; the image
    dimensions come from the tensor itself.
    """
    from PIL import Image

    image = Image.fromarray(pixels.numpy(), mode='P')
    image.putpalette(_create_palette_flat(spectrum_points=251))
    image.save(output_path)
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: sox_tensorflow
3
+ Version: 0.0.1
4
+ Summary: DESCRIPTION
5
+ Author-email: Brookie Guzder-Williams <bguzder-williams@berkeley.edu>
6
+ License: CC-BY-4.0
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.11
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE.md
16
+ Dynamic: license-file
17
+
18
+ # SOX GPU
19
+
20
+ Generating SOX-style spectrograms on a gpu
21
+
22
+ - sox: https://github.com/chirlu/sox
23
+ - pysox: https://github.com/marl/pysox
24
+ - audio samples:
25
+ * https://storage.googleapis.com/dse-soundhub-public/data/sample_audio/20230522_000000.flac
26
+ * https://storage.googleapis.com/dse-soundhub-public/data/sample_audio/20230526_000000.flac
27
+ - pnw-cnet-model: https://storage.googleapis.com/dse-soundhub-public/models/pnw-cnet/PNW-Cnet_v4_TF.h5
28
+
29
+
30
+ ### PNW MODEL
31
+
32
+ **For PNW we need**
33
+
34
+ ```bash
35
+ export TF_USE_LEGACY_KERAS=1
36
+ ```
37
+
38
+ This environment variable forces TensorFlow 2.16+ to use the legacy Keras implementation instead of the new Keras 3, which maintains compatibility with H5 models saved in older TensorFlow versions. The newer Keras 3 has stricter input shape validation that can break when loading older model files.
39
+
40
+ ---
41
+
42
+ ## QUICK START
43
+
44
+ Usage example
45
+
46
+ ---
47
+
48
+ ## DOCUMENTATION
49
+
50
+ API Docs
51
+
52
+ ---
53
+
54
+ ## STYLE-GUIDE
55
+
56
+ Following PEP8. See [setup.cfg](./setup.cfg) for exceptions. Keeping honest with `pycodestyle .`
@@ -0,0 +1,10 @@
1
+ LICENSE.md
2
+ README.md
3
+ pyproject.toml
4
+ setup.cfg
5
+ sox_tensorflow/__init__.py
6
+ sox_tensorflow/processor.py
7
+ sox_tensorflow.egg-info/PKG-INFO
8
+ sox_tensorflow.egg-info/SOURCES.txt
9
+ sox_tensorflow.egg-info/dependency_links.txt
10
+ sox_tensorflow.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ sox_tensorflow