birdnet-analyzer 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birdnet_analyzer/__init__.py +9 -8
- birdnet_analyzer/analyze/__init__.py +19 -5
- birdnet_analyzer/analyze/__main__.py +3 -4
- birdnet_analyzer/analyze/cli.py +30 -25
- birdnet_analyzer/analyze/core.py +246 -245
- birdnet_analyzer/analyze/utils.py +694 -701
- birdnet_analyzer/audio.py +368 -372
- birdnet_analyzer/cli.py +732 -707
- birdnet_analyzer/config.py +243 -242
- birdnet_analyzer/eBird_taxonomy_codes_2024E.json +13046 -0
- birdnet_analyzer/embeddings/__init__.py +3 -4
- birdnet_analyzer/embeddings/__main__.py +3 -3
- birdnet_analyzer/embeddings/cli.py +12 -13
- birdnet_analyzer/embeddings/core.py +70 -70
- birdnet_analyzer/embeddings/utils.py +220 -193
- birdnet_analyzer/evaluation/__init__.py +189 -195
- birdnet_analyzer/evaluation/__main__.py +3 -3
- birdnet_analyzer/evaluation/assessment/__init__.py +0 -0
- birdnet_analyzer/evaluation/assessment/metrics.py +388 -0
- birdnet_analyzer/evaluation/assessment/performance_assessor.py +364 -0
- birdnet_analyzer/evaluation/assessment/plotting.py +378 -0
- birdnet_analyzer/evaluation/preprocessing/__init__.py +0 -0
- birdnet_analyzer/evaluation/preprocessing/data_processor.py +631 -0
- birdnet_analyzer/evaluation/preprocessing/utils.py +98 -0
- birdnet_analyzer/gui/__init__.py +19 -23
- birdnet_analyzer/gui/__main__.py +3 -3
- birdnet_analyzer/gui/analysis.py +179 -174
- birdnet_analyzer/gui/assets/arrow_down.svg +4 -4
- birdnet_analyzer/gui/assets/arrow_left.svg +4 -4
- birdnet_analyzer/gui/assets/arrow_right.svg +4 -4
- birdnet_analyzer/gui/assets/arrow_up.svg +4 -4
- birdnet_analyzer/gui/assets/gui.css +36 -28
- birdnet_analyzer/gui/assets/gui.js +93 -93
- birdnet_analyzer/gui/embeddings.py +638 -620
- birdnet_analyzer/gui/evaluation.py +801 -813
- birdnet_analyzer/gui/localization.py +75 -68
- birdnet_analyzer/gui/multi_file.py +265 -246
- birdnet_analyzer/gui/review.py +472 -527
- birdnet_analyzer/gui/segments.py +191 -191
- birdnet_analyzer/gui/settings.py +149 -129
- birdnet_analyzer/gui/single_file.py +264 -269
- birdnet_analyzer/gui/species.py +95 -95
- birdnet_analyzer/gui/train.py +687 -698
- birdnet_analyzer/gui/utils.py +797 -808
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_af.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ar.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_bg.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ca.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_cs.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_da.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_de.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_el.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_en_uk.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_es.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_fi.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_fr.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_he.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_hr.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_hu.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_in.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_is.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_it.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ja.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ko.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_lt.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ml.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_nl.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_no.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_pl.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_pt_BR.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_pt_PT.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ro.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ru.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sk.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sl.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sr.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sv.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_th.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_tr.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_uk.txt +6522 -6522
- birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_zh.txt +6522 -6522
- birdnet_analyzer/lang/de.json +341 -334
- birdnet_analyzer/lang/en.json +341 -334
- birdnet_analyzer/lang/fi.json +341 -334
- birdnet_analyzer/lang/fr.json +341 -334
- birdnet_analyzer/lang/id.json +341 -334
- birdnet_analyzer/lang/pt-br.json +341 -334
- birdnet_analyzer/lang/ru.json +341 -334
- birdnet_analyzer/lang/se.json +341 -334
- birdnet_analyzer/lang/tlh.json +341 -334
- birdnet_analyzer/lang/zh_TW.json +341 -334
- birdnet_analyzer/model.py +1212 -1243
- birdnet_analyzer/playground.py +5 -0
- birdnet_analyzer/search/__init__.py +3 -3
- birdnet_analyzer/search/__main__.py +3 -3
- birdnet_analyzer/search/cli.py +11 -12
- birdnet_analyzer/search/core.py +78 -78
- birdnet_analyzer/search/utils.py +107 -111
- birdnet_analyzer/segments/__init__.py +3 -3
- birdnet_analyzer/segments/__main__.py +3 -3
- birdnet_analyzer/segments/cli.py +13 -14
- birdnet_analyzer/segments/core.py +81 -78
- birdnet_analyzer/segments/utils.py +383 -394
- birdnet_analyzer/species/__init__.py +3 -3
- birdnet_analyzer/species/__main__.py +3 -3
- birdnet_analyzer/species/cli.py +13 -14
- birdnet_analyzer/species/core.py +35 -35
- birdnet_analyzer/species/utils.py +74 -75
- birdnet_analyzer/train/__init__.py +3 -3
- birdnet_analyzer/train/__main__.py +3 -3
- birdnet_analyzer/train/cli.py +13 -14
- birdnet_analyzer/train/core.py +113 -113
- birdnet_analyzer/train/utils.py +877 -847
- birdnet_analyzer/translate.py +133 -104
- birdnet_analyzer/utils.py +425 -419
- {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.1.0.dist-info}/METADATA +146 -129
- birdnet_analyzer-2.1.0.dist-info/RECORD +125 -0
- {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.1.0.dist-info}/WHEEL +1 -1
- {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.1.0.dist-info}/licenses/LICENSE +18 -18
- birdnet_analyzer/eBird_taxonomy_codes_2021E.json +0 -25280
- birdnet_analyzer-2.0.0.dist-info/RECORD +0 -117
- {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.1.0.dist-info}/entry_points.txt +0 -0
- {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.1.0.dist-info}/top_level.txt +0 -0
birdnet_analyzer/audio.py
CHANGED
@@ -1,372 +1,368 @@
|
|
1
|
-
"""Module containing audio helper functions."""
|
2
|
-
|
3
|
-
import librosa
|
4
|
-
import numpy as np
|
5
|
-
import soundfile as sf
|
6
|
-
from scipy.signal import firwin, kaiserord, lfilter
|
7
|
-
|
8
|
-
import birdnet_analyzer.config as cfg
|
9
|
-
|
10
|
-
RANDOM = np.random.RandomState(cfg.RANDOM_SEED)
|
11
|
-
|
12
|
-
|
13
|
-
def open_audio_file(path: str, sample_rate=48000, offset=0.0, duration=None, fmin=None, fmax=None, speed=1.0):
|
14
|
-
"""Open an audio file.
|
15
|
-
|
16
|
-
Opens an audio file with librosa and the given settings.
|
17
|
-
|
18
|
-
Args:
|
19
|
-
path: Path to the audio file.
|
20
|
-
sample_rate: The sample rate at which the file should be processed.
|
21
|
-
offset: The starting offset.
|
22
|
-
duration: Maximum duration of the loaded content.
|
23
|
-
fmin: Minimum frequency for bandpass filter.
|
24
|
-
fmax: Maximum frequency for bandpass filter.
|
25
|
-
speed: Speed factor for audio playback.
|
26
|
-
|
27
|
-
Returns:
|
28
|
-
Returns the audio time series and the sampling rate.
|
29
|
-
"""
|
30
|
-
# Open file with librosa (uses ffmpeg or libav)
|
31
|
-
if speed == 1.0:
|
32
|
-
sig, rate = librosa.load(
|
33
|
-
path, sr=sample_rate, offset=offset, duration=duration, mono=True, res_type="kaiser_fast"
|
34
|
-
)
|
35
|
-
|
36
|
-
else:
|
37
|
-
# Load audio with original sample rate
|
38
|
-
sig, rate = librosa.load(path, sr=None, offset=offset, duration=duration, mono=True)
|
39
|
-
|
40
|
-
# Resample with "fake" sample rate
|
41
|
-
sig = librosa.resample(sig, orig_sr=int(rate * speed), target_sr=sample_rate, res_type="kaiser_fast")
|
42
|
-
rate = sample_rate
|
43
|
-
|
44
|
-
# Bandpass filter
|
45
|
-
if fmin is not None and fmax is not None:
|
46
|
-
sig = bandpass(sig, rate, fmin, fmax)
|
47
|
-
# sig = bandpassKaiserFIR(sig, rate, fmin, fmax)
|
48
|
-
|
49
|
-
return sig, rate
|
50
|
-
|
51
|
-
|
52
|
-
def get_audio_file_length(path):
|
53
|
-
"""
|
54
|
-
Get the length of an audio file in seconds.
|
55
|
-
|
56
|
-
Args:
|
57
|
-
path (str): The file path to the audio file.
|
58
|
-
|
59
|
-
Returns:
|
60
|
-
float: The duration of the audio file in seconds.
|
61
|
-
"""
|
62
|
-
# Open file with librosa (uses ffmpeg or libav)
|
63
|
-
|
64
|
-
return librosa.get_duration(
|
65
|
-
|
66
|
-
|
67
|
-
def get_sample_rate(path: str):
|
68
|
-
"""
|
69
|
-
Get the sample rate of an audio file.
|
70
|
-
|
71
|
-
Args:
|
72
|
-
path (str): The file path to the audio file.
|
73
|
-
|
74
|
-
Returns:
|
75
|
-
int: The sample rate of the audio file.
|
76
|
-
"""
|
77
|
-
return librosa.get_samplerate(path)
|
78
|
-
|
79
|
-
|
80
|
-
def save_signal(sig, fname: str, rate=48000):
|
81
|
-
"""Saves a signal to file.
|
82
|
-
|
83
|
-
Args:
|
84
|
-
sig: The signal to be saved.
|
85
|
-
fname: The file path.
|
86
|
-
|
87
|
-
Returns:
|
88
|
-
None
|
89
|
-
"""
|
90
|
-
|
91
|
-
sf.write(fname, sig, rate, "PCM_16")
|
92
|
-
|
93
|
-
|
94
|
-
def pad(sig, seconds, srate, amount=None):
|
95
|
-
"""Creates a noise vector with the given shape.
|
96
|
-
|
97
|
-
Args:
|
98
|
-
sig: The original audio signal.
|
99
|
-
shape: Shape of the noise.
|
100
|
-
amount: The noise intensity.
|
101
|
-
|
102
|
-
Returns:
|
103
|
-
An numpy array of noise with the given shape.
|
104
|
-
"""
|
105
|
-
|
106
|
-
target_len = int(srate * seconds)
|
107
|
-
|
108
|
-
if len(sig) < target_len:
|
109
|
-
noise_shape = target_len - len(sig)
|
110
|
-
|
111
|
-
if not cfg.USE_NOISE:
|
112
|
-
noise = np.zeros(noise_shape, dtype=sig.dtype)
|
113
|
-
else:
|
114
|
-
# Random noise intensity
|
115
|
-
if amount is None:
|
116
|
-
amount = RANDOM.uniform(0.1, 0.5)
|
117
|
-
|
118
|
-
# Create Gaussian noise
|
119
|
-
try:
|
120
|
-
noise = RANDOM.normal(min(sig) * amount, max(sig) * amount, noise_shape).astype(sig.dtype)
|
121
|
-
except:
|
122
|
-
noise = np.zeros(noise_shape, dtype=sig.dtype)
|
123
|
-
|
124
|
-
return np.concatenate((sig, noise))
|
125
|
-
|
126
|
-
return sig
|
127
|
-
|
128
|
-
|
129
|
-
def split_signal(sig, rate, seconds, overlap, minlen, amount=None):
|
130
|
-
"""Split signal with overlap.
|
131
|
-
|
132
|
-
Args:
|
133
|
-
sig: The original signal to be split.
|
134
|
-
rate: The sampling rate.
|
135
|
-
seconds: The duration of a segment.
|
136
|
-
overlap: The overlapping seconds of segments.
|
137
|
-
minlen: Minimum length of a split.
|
138
|
-
|
139
|
-
Returns:
|
140
|
-
A list of splits.
|
141
|
-
"""
|
142
|
-
|
143
|
-
# Split signal to chunks of duration with overlap, whereas each chunk still has minimum duration of signal
|
144
|
-
if rate is None or rate <= 0:
|
145
|
-
rate = cfg.SAMPLE_RATE
|
146
|
-
if seconds is None or seconds <= 0:
|
147
|
-
seconds = cfg.SIG_LENGTH
|
148
|
-
if overlap is None or overlap < 0:
|
149
|
-
overlap = cfg.SIG_OVERLAP
|
150
|
-
if minlen is None or minlen <= 0 or minlen > seconds:
|
151
|
-
minlen = cfg.SIG_MINLEN
|
152
|
-
|
153
|
-
# Make sure overlap is smaller then signal duration
|
154
|
-
if overlap >= seconds:
|
155
|
-
overlap = seconds - 0.01
|
156
|
-
|
157
|
-
# Number of frames per chunk, per step and per minimum signal
|
158
|
-
chunksize = int(rate * seconds)
|
159
|
-
stepsize = int(rate * (seconds - overlap))
|
160
|
-
minsize = int(rate * minlen)
|
161
|
-
|
162
|
-
# Start of last chunk
|
163
|
-
lastchunkpos = int((sig.size - chunksize + stepsize - 1) / stepsize) * stepsize
|
164
|
-
# Make sure at least one chunk is returned
|
165
|
-
if lastchunkpos < 0:
|
166
|
-
lastchunkpos = 0
|
167
|
-
# Omit last chunk if minimum signal duration is underrun
|
168
|
-
elif sig.size - lastchunkpos < minsize:
|
169
|
-
lastchunkpos = lastchunkpos - stepsize
|
170
|
-
|
171
|
-
# Append noise or empty signal of chunk duration, so all splits have desired length
|
172
|
-
if not cfg.USE_NOISE:
|
173
|
-
noise = np.zeros(shape=chunksize, dtype=sig.dtype)
|
174
|
-
else:
|
175
|
-
# Random noise intensity
|
176
|
-
if amount is None:
|
177
|
-
amount = RANDOM.uniform(0.1, 0.5)
|
178
|
-
# Create Gaussian noise
|
179
|
-
try:
|
180
|
-
noise = RANDOM.normal(loc=min(sig) * amount, scale=max(sig) * amount, size=chunksize).astype(sig.dtype)
|
181
|
-
except:
|
182
|
-
noise = np.zeros(shape=chunksize, dtype=sig.dtype)
|
183
|
-
data = np.concatenate((sig, noise))
|
184
|
-
|
185
|
-
# Split signal with overlap
|
186
|
-
sig_splits = []
|
187
|
-
for i in range(0,
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
def smart_crop_signal(sig, rate, sig_length, sig_overlap, sig_minlen):
|
216
|
-
"""Smart crop audio signal based on peak detection.
|
217
|
-
|
218
|
-
This function analyzes the audio signal to find peaks in energy/amplitude,
|
219
|
-
which are more likely to contain relevant target signals (e.g., bird calls).
|
220
|
-
Only the audio segments with the highest energy peaks are returned.
|
221
|
-
|
222
|
-
Args:
|
223
|
-
sig: The audio signal.
|
224
|
-
rate: The sample rate of the audio signal.
|
225
|
-
sig_length: The desired length of each snippet in seconds.
|
226
|
-
sig_overlap: The overlap between snippets in seconds.
|
227
|
-
sig_minlen: The minimum length of a snippet in seconds.
|
228
|
-
|
229
|
-
Returns:
|
230
|
-
A list of audio snippets with the highest energy/peaks.
|
231
|
-
"""
|
232
|
-
|
233
|
-
# If signal is too short, just return it
|
234
|
-
if len(sig) / rate <= sig_length:
|
235
|
-
return [sig]
|
236
|
-
|
237
|
-
#
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
#
|
251
|
-
energy
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
#
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
#
|
326
|
-
#
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
# Apply the filter to the signal.
|
370
|
-
sig = lfilter(taps, 1.0, sig)
|
371
|
-
|
372
|
-
return sig.astype("float32")
|
1
|
+
"""Module containing audio helper functions."""
|
2
|
+
|
3
|
+
import librosa
|
4
|
+
import numpy as np
|
5
|
+
import soundfile as sf
|
6
|
+
from scipy.signal import find_peaks, firwin, kaiserord, lfilter
|
7
|
+
|
8
|
+
import birdnet_analyzer.config as cfg
|
9
|
+
|
10
|
+
RANDOM = np.random.RandomState(cfg.RANDOM_SEED)
|
11
|
+
|
12
|
+
|
13
|
+
def open_audio_file(path: str, sample_rate=48000, offset=0.0, duration=None, fmin=None, fmax=None, speed=1.0):
    """Load an audio file as a mono signal, optionally speed-adjusted and band-limited.

    The file is decoded with librosa. With ``speed == 1.0`` librosa resamples
    directly to ``sample_rate``. For any other speed the file is decoded at its
    native rate and then resampled as if it had been recorded at
    ``native_rate * speed``.

    Args:
        path: Path to the audio file.
        sample_rate: The sample rate at which the file should be processed.
        offset: The starting offset handed to ``librosa.load``.
        duration: Maximum duration of the loaded content (``None`` is passed through to librosa).
        fmin: Minimum frequency for the bandpass filter.
        fmax: Maximum frequency for the bandpass filter.
        speed: Speed factor for audio playback.

    Returns:
        A tuple ``(sig, rate)`` with the audio time series and its sampling rate.
    """
    shared_load_args = {"offset": offset, "duration": duration, "mono": True}

    if speed != 1.0:
        # Decode without resampling first ...
        raw_sig, native_rate = librosa.load(path, sr=None, **shared_load_args)

        # ... then resample from a "fake" source rate to apply the speed change.
        sig = librosa.resample(
            raw_sig, orig_sr=int(native_rate * speed), target_sr=sample_rate, res_type="kaiser_fast"
        )
        rate = sample_rate
    else:
        sig, rate = librosa.load(path, sr=sample_rate, res_type="kaiser_fast", **shared_load_args)

    # The filter is only attempted when both band limits were supplied.
    if fmin is None or fmax is None:
        return sig, rate

    return bandpass(sig, rate, fmin, fmax), rate
|
50
|
+
|
51
|
+
|
52
|
+
def get_audio_file_length(path):
    """Return the duration of an audio file as reported by librosa.

    Args:
        path (str): The file path to the audio file.

    Returns:
        float: The duration of the audio file in seconds.
    """
    # The path is handed to librosa directly; sr=None is passed through unchanged.
    duration = librosa.get_duration(path=path, sr=None)

    return duration
|
65
|
+
|
66
|
+
|
67
|
+
def get_sample_rate(path: str):
    """Look up the sample rate of an audio file via librosa.

    Args:
        path (str): The file path to the audio file.

    Returns:
        int: The sample rate of the audio file.
    """
    sample_rate = librosa.get_samplerate(path)

    return sample_rate
|
78
|
+
|
79
|
+
|
80
|
+
def save_signal(sig, fname: str, rate=48000):
    """Write a signal to an audio file using the "PCM_16" subtype.

    Args:
        sig: The signal to be saved.
        fname: The file path.
        rate: The sample rate written to the file. Defaults to 48000.

    Returns:
        None
    """
    sf.write(fname, sig, samplerate=rate, subtype="PCM_16")
|
92
|
+
|
93
|
+
|
94
|
+
def pad(sig, seconds, srate, amount=None):
    """Extend a signal at its end so that it spans at least ``seconds``.

    A signal that already holds ``int(srate * seconds)`` samples or more is
    returned unchanged. Otherwise the missing samples are appended: zeros when
    ``cfg.USE_NOISE`` is disabled, Gaussian noise when it is enabled.

    Args:
        sig: The original audio signal (a numpy array; padding uses its dtype).
        seconds: The target duration in seconds.
        srate: The sample rate used to convert ``seconds`` into samples.
        amount: The noise intensity. When ``None`` and noise is enabled, a value
            is drawn uniformly between 0.1 and 0.5 from the module's ``RANDOM``.

    Returns:
        The padded signal, or ``sig`` itself when no padding was required.
    """
    target_len = int(srate * seconds)
    missing = target_len - len(sig)

    # Nothing to do for signals that are already long enough
    if missing <= 0:
        return sig

    if not cfg.USE_NOISE:
        noise = np.zeros(missing, dtype=sig.dtype)
    else:
        # Random noise intensity
        if amount is None:
            amount = RANDOM.uniform(0.1, 0.5)

        # Gaussian noise with mean min(sig) * amount and standard deviation
        # max(sig) * amount. This raises ValueError for an empty signal (no
        # minimum) or a negative scale (all samples below zero); fall back to
        # silence in those cases instead of swallowing every exception.
        try:
            noise = RANDOM.normal(np.min(sig) * amount, np.max(sig) * amount, missing).astype(sig.dtype)
        except (ValueError, TypeError):
            noise = np.zeros(missing, dtype=sig.dtype)

    return np.concatenate((sig, noise))
|
127
|
+
|
128
|
+
|
129
|
+
def split_signal(sig, rate, seconds, overlap, minlen, amount=None):
    """Split a signal into chunks of equal length with optional overlap.

    Missing or invalid settings fall back to the values in ``cfg``. The signal
    is extended by one chunk of zeros or Gaussian noise (depending on
    ``cfg.USE_NOISE``) so that the final chunk has the full length as well.

    Args:
        sig: The original signal to be split (a numpy array).
        rate: The sampling rate.
        seconds: The duration of a segment.
        overlap: The overlapping seconds of segments.
        minlen: Minimum length of a split.
        amount: The noise intensity used for the padding. When ``None`` and
            noise is enabled, a value is drawn uniformly between 0.1 and 0.5
            from the module's ``RANDOM``.

    Returns:
        A list of splits, each ``int(rate * seconds)`` samples long. The list
        may be empty when the signal is shorter than ``minlen``.
    """
    # Fall back to the configured defaults for missing or invalid settings
    if rate is None or rate <= 0:
        rate = cfg.SAMPLE_RATE
    if seconds is None or seconds <= 0:
        seconds = cfg.SIG_LENGTH
    if overlap is None or overlap < 0:
        overlap = cfg.SIG_OVERLAP
    if minlen is None or minlen <= 0 or minlen > seconds:
        minlen = cfg.SIG_MINLEN

    # Make sure overlap is smaller than the chunk duration
    if overlap >= seconds:
        overlap = seconds - 0.01

    # Number of frames per chunk, per step and per minimum signal.
    # The step is clamped to one frame: a step of zero (very low rates combined
    # with a near-total overlap) would divide by zero below.
    chunksize = int(rate * seconds)
    stepsize = max(1, int(rate * (seconds - overlap)))
    minsize = int(rate * minlen)

    # Start of last chunk
    lastchunkpos = int((sig.size - chunksize + stepsize - 1) / stepsize) * stepsize
    if lastchunkpos < 0:
        # Signal shorter than one chunk: still emit a single chunk
        lastchunkpos = 0
    elif sig.size - lastchunkpos < minsize:
        # Omit last chunk if minimum signal duration is underrun
        lastchunkpos = lastchunkpos - stepsize

    # Append noise or empty signal of chunk duration, so all splits have desired length
    if not cfg.USE_NOISE:
        noise = np.zeros(shape=chunksize, dtype=sig.dtype)
    else:
        # Random noise intensity
        if amount is None:
            amount = RANDOM.uniform(0.1, 0.5)

        # Gaussian noise; ValueError is raised for an empty signal or a negative
        # scale (all samples below zero), in which case silence is appended.
        try:
            noise = RANDOM.normal(
                loc=np.min(sig) * amount, scale=np.max(sig) * amount, size=chunksize
            ).astype(sig.dtype)
        except (ValueError, TypeError):
            noise = np.zeros(shape=chunksize, dtype=sig.dtype)

    data = np.concatenate((sig, noise))

    # Split signal with overlap
    return [data[start : start + chunksize] for start in range(0, lastchunkpos + 1, stepsize)]
|
190
|
+
|
191
|
+
|
192
|
+
def crop_center(sig, rate, seconds):
    """Cut the middle section out of a signal, or pad it if it is too short.

    Args:
        sig: The original signal.
        rate: The sampling rate.
        seconds: The length of the returned signal in seconds.

    Returns:
        The centered excerpt of ``int(seconds * rate)`` samples when the signal
        is longer than that, otherwise the result of ``pad`` with a noise
        amount of 0.5.
    """
    target_len = int(seconds * rate)
    surplus = len(sig) - target_len

    # Signals that are not longer than the target are handed to pad()
    if surplus <= 0:
        return pad(sig, seconds, rate, 0.5)

    # Drop half of the surplus in front; the slice length removes the rest
    first = int(surplus / 2)
    return sig[first : first + target_len]
|
213
|
+
|
214
|
+
|
215
|
+
def smart_crop_signal(sig, rate, sig_length, sig_overlap, sig_minlen):
    """Select the most energetic snippets of an audio signal.

    The signal is cut into overlapping windows, each window gets a score made
    of its RMS energy and its peak amplitude, and the windows sitting on peaks
    of the smoothed score curve are returned. These are the parts most likely
    to contain a relevant target signal (e.g., bird calls).

    Args:
        sig: The audio signal.
        rate: The sample rate of the audio signal.
        sig_length: The desired length of each snippet in seconds.
        sig_overlap: The overlap between snippets in seconds.
        sig_minlen: The minimum length of a snippet in seconds.

    Returns:
        A list of audio snippets with the highest energy/peaks. A signal no
        longer than ``sig_length`` is returned as a single-element list.
    """
    # A signal that fits into one snippet is passed through as is
    if len(sig) / rate <= sig_length:
        return [sig]

    windows = split_signal(sig, rate, sig_length, sig_overlap, sig_minlen)

    if len(windows) <= 1:
        return windows

    # Score per window: 70 % RMS energy plus 30 % absolute peak value
    scores = [np.sqrt(np.mean(window**2)) * 0.7 + np.max(np.abs(window)) * 0.3 for window in windows]

    # A 3-point moving average suppresses small fluctuations before peak picking
    kernel = np.ones(3) / 3
    smoothed = np.convolve(scores, kernel, mode="same")
    peak_idx, _ = find_peaks(smoothed, height=np.mean(smoothed), distance=2)

    # Without at least two clear peaks, keep the highest scoring windows
    # (3 or a third of all windows, whichever is more) in their original order
    if len(peak_idx) < 2:
        keep = max(3, len(windows) // 3)
        strongest = np.argsort(scores)[-keep:]
        return [windows[i] for i in sorted(strongest)]

    # Up to five peaks: return their windows in order of appearance
    if len(peak_idx) <= 5:
        return [windows[i] for i in peak_idx]

    # More than five peaks: keep the five with the highest score, strongest first
    ranking = np.argsort([scores[i] for i in peak_idx])[::-1]
    return [windows[peak_idx[pos]] for pos in ranking[:5]]
|
275
|
+
|
276
|
+
|
277
|
+
def bandpass(sig, rate, fmin, fmax, order=5):
    """Restrict a signal to a frequency band with a Butterworth filter.

    Depending on which limits differ from ``cfg.SIG_FMIN`` / ``cfg.SIG_FMAX``
    a highpass, lowpass or bandpass filter is designed and applied.

    Args:
        sig (numpy.ndarray): The input signal to be filtered.
        rate (int): The sampling rate of the input signal.
        fmin (float): The minimum frequency for the bandpass filter.
        fmax (float): The maximum frequency for the bandpass filter.
        order (int, optional): The order of the filter. Default is 5.

    Returns:
        numpy.ndarray: The signal as a float32 array. When both limits equal
        the configured defaults, or ``fmin > fmax``, the input is returned
        unchanged without conversion.
    """
    # Nothing to filter for the default band or an inverted range
    limits_are_default = fmin == cfg.SIG_FMIN and fmax == cfg.SIG_FMAX
    if limits_are_default or fmin > fmax:
        return sig

    from scipy.signal import butter, lfilter

    nyquist = 0.5 * rate

    # Pick the filter design: (normalized cutoff(s), filter type)
    design = None
    if fmin > cfg.SIG_FMIN and fmax == cfg.SIG_FMAX:
        # Only the lower limit was raised -> highpass
        design = (fmin / nyquist, "high")
    elif fmin == cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        # Only the upper limit was lowered -> lowpass
        design = (fmax / nyquist, "low")
    elif fmin > cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        # Both limits moved inwards -> bandpass
        design = ([fmin / nyquist, fmax / nyquist], "band")

    # Limits outside the configured range match no design; the signal is then only converted
    if design is not None:
        cutoff, filter_type = design
        b, a = butter(order, cutoff, btype=filter_type)
        sig = lfilter(b, a, sig)

    return sig.astype("float32")
|
319
|
+
|
320
|
+
|
321
|
+
# Raven is using Kaiser window FIR filter, so we try to emulate it.
|
322
|
+
# Raven uses the Window method for FIR filter design.
|
323
|
+
# A Kaiser window is used with a default transition bandwidth of 0.02 times
|
324
|
+
# the Nyquist frequency and a default stop band attenuation of 100 dB.
|
325
|
+
# For a complete description of this method, see Discrete-Time Signal Processing
|
326
|
+
# (Second Edition), by Alan Oppenheim, Ronald Schafer, and John Buck, Prentice Hall 1998, pp. 474-476.
|
327
|
+
def bandpass_kaiser_fir(sig, rate, fmin, fmax, width=0.02, stopband_attenuation_db=100):
    """
    Applies a bandpass filter to the given signal using a Kaiser window FIR filter.

    Depending on which limits differ from ``cfg.SIG_FMIN`` / ``cfg.SIG_FMAX``
    a highpass, lowpass or bandpass filter is designed. Limits that match none
    of these cases leave the signal unfiltered, as in ``bandpass``.

    Args:
        sig (numpy.ndarray): The input signal to be filtered.
        rate (int): The sample rate of the input signal.
        fmin (float): The minimum frequency of the bandpass filter.
        fmax (float): The maximum frequency of the bandpass filter.
        width (float, optional): The transition width of the filter as a fraction of the
            Nyquist frequency. Default is 0.02.
        stopband_attenuation_db (float, optional): The desired attenuation in the stopband, in decibels.
            Default is 100.

    Returns:
        numpy.ndarray: The signal as a float32 numpy array. When both limits equal the
        configured defaults, or ``fmin > fmax``, the input is returned unchanged.
    """
    # Check if we have to bandpass at all
    if (fmin == cfg.SIG_FMIN and fmax == cfg.SIG_FMAX) or fmin > fmax:
        return sig

    nyquist = 0.5 * rate

    # Calculate the order and Kaiser parameter for the desired specifications.
    N, beta = kaiserord(stopband_attenuation_db, width)
    window = ("kaiser", beta)

    # Stays None when no filter type applies (e.g. a limit outside the configured range)
    taps = None

    # Highpass?
    if fmin > cfg.SIG_FMIN and fmax == cfg.SIG_FMAX:
        low = fmin / nyquist
        # The passband reaches the Nyquist frequency, for which firwin requires an
        # odd number of taps; round an even count up by one.
        taps = firwin(N | 1, low, window=window, pass_zero=False)

    # Lowpass?
    elif fmin == cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        high = fmax / nyquist
        taps = firwin(N, high, window=window, pass_zero=True)

    # Bandpass?
    elif fmin > cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        low = fmin / nyquist
        high = fmax / nyquist
        taps = firwin(N, [low, high], window=window, pass_zero=False)

    # Apply the filter to the signal, if one was designed.
    if taps is not None:
        sig = lfilter(taps, 1.0, sig)

    return sig.astype("float32")
|