birdnet-analyzer 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. birdnet_analyzer/__init__.py +9 -8
  2. birdnet_analyzer/analyze/__init__.py +5 -5
  3. birdnet_analyzer/analyze/__main__.py +3 -4
  4. birdnet_analyzer/analyze/cli.py +25 -25
  5. birdnet_analyzer/analyze/core.py +241 -245
  6. birdnet_analyzer/analyze/utils.py +692 -701
  7. birdnet_analyzer/audio.py +368 -372
  8. birdnet_analyzer/cli.py +709 -707
  9. birdnet_analyzer/config.py +242 -242
  10. birdnet_analyzer/eBird_taxonomy_codes_2021E.json +25279 -25279
  11. birdnet_analyzer/embeddings/__init__.py +3 -4
  12. birdnet_analyzer/embeddings/__main__.py +3 -3
  13. birdnet_analyzer/embeddings/cli.py +12 -13
  14. birdnet_analyzer/embeddings/core.py +69 -70
  15. birdnet_analyzer/embeddings/utils.py +179 -193
  16. birdnet_analyzer/evaluation/__init__.py +196 -195
  17. birdnet_analyzer/evaluation/__main__.py +3 -3
  18. birdnet_analyzer/evaluation/assessment/__init__.py +0 -0
  19. birdnet_analyzer/evaluation/assessment/metrics.py +388 -0
  20. birdnet_analyzer/evaluation/assessment/performance_assessor.py +409 -0
  21. birdnet_analyzer/evaluation/assessment/plotting.py +379 -0
  22. birdnet_analyzer/evaluation/preprocessing/__init__.py +0 -0
  23. birdnet_analyzer/evaluation/preprocessing/data_processor.py +631 -0
  24. birdnet_analyzer/evaluation/preprocessing/utils.py +98 -0
  25. birdnet_analyzer/gui/__init__.py +19 -23
  26. birdnet_analyzer/gui/__main__.py +3 -3
  27. birdnet_analyzer/gui/analysis.py +175 -174
  28. birdnet_analyzer/gui/assets/arrow_down.svg +4 -4
  29. birdnet_analyzer/gui/assets/arrow_left.svg +4 -4
  30. birdnet_analyzer/gui/assets/arrow_right.svg +4 -4
  31. birdnet_analyzer/gui/assets/arrow_up.svg +4 -4
  32. birdnet_analyzer/gui/assets/gui.css +28 -28
  33. birdnet_analyzer/gui/assets/gui.js +93 -93
  34. birdnet_analyzer/gui/embeddings.py +619 -620
  35. birdnet_analyzer/gui/evaluation.py +795 -813
  36. birdnet_analyzer/gui/localization.py +75 -68
  37. birdnet_analyzer/gui/multi_file.py +245 -246
  38. birdnet_analyzer/gui/review.py +519 -527
  39. birdnet_analyzer/gui/segments.py +191 -191
  40. birdnet_analyzer/gui/settings.py +128 -129
  41. birdnet_analyzer/gui/single_file.py +267 -269
  42. birdnet_analyzer/gui/species.py +95 -95
  43. birdnet_analyzer/gui/train.py +696 -698
  44. birdnet_analyzer/gui/utils.py +810 -808
  45. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_af.txt +6522 -6522
  46. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ar.txt +6522 -6522
  47. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_bg.txt +6522 -6522
  48. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ca.txt +6522 -6522
  49. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_cs.txt +6522 -6522
  50. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_da.txt +6522 -6522
  51. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_de.txt +6522 -6522
  52. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_el.txt +6522 -6522
  53. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_en_uk.txt +6522 -6522
  54. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_es.txt +6522 -6522
  55. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_fi.txt +6522 -6522
  56. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_fr.txt +6522 -6522
  57. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_he.txt +6522 -6522
  58. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_hr.txt +6522 -6522
  59. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_hu.txt +6522 -6522
  60. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_in.txt +6522 -6522
  61. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_is.txt +6522 -6522
  62. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_it.txt +6522 -6522
  63. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ja.txt +6522 -6522
  64. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ko.txt +6522 -6522
  65. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_lt.txt +6522 -6522
  66. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ml.txt +6522 -6522
  67. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_nl.txt +6522 -6522
  68. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_no.txt +6522 -6522
  69. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_pl.txt +6522 -6522
  70. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_pt_BR.txt +6522 -6522
  71. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_pt_PT.txt +6522 -6522
  72. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ro.txt +6522 -6522
  73. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_ru.txt +6522 -6522
  74. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sk.txt +6522 -6522
  75. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sl.txt +6522 -6522
  76. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sr.txt +6522 -6522
  77. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_sv.txt +6522 -6522
  78. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_th.txt +6522 -6522
  79. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_tr.txt +6522 -6522
  80. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_uk.txt +6522 -6522
  81. birdnet_analyzer/labels/V2.4/BirdNET_GLOBAL_6K_V2.4_Labels_zh.txt +6522 -6522
  82. birdnet_analyzer/lang/de.json +334 -334
  83. birdnet_analyzer/lang/en.json +334 -334
  84. birdnet_analyzer/lang/fi.json +334 -334
  85. birdnet_analyzer/lang/fr.json +334 -334
  86. birdnet_analyzer/lang/id.json +334 -334
  87. birdnet_analyzer/lang/pt-br.json +334 -334
  88. birdnet_analyzer/lang/ru.json +334 -334
  89. birdnet_analyzer/lang/se.json +334 -334
  90. birdnet_analyzer/lang/tlh.json +334 -334
  91. birdnet_analyzer/lang/zh_TW.json +334 -334
  92. birdnet_analyzer/model.py +1212 -1243
  93. birdnet_analyzer/playground.py +5 -0
  94. birdnet_analyzer/search/__init__.py +3 -3
  95. birdnet_analyzer/search/__main__.py +3 -3
  96. birdnet_analyzer/search/cli.py +11 -12
  97. birdnet_analyzer/search/core.py +78 -78
  98. birdnet_analyzer/search/utils.py +107 -111
  99. birdnet_analyzer/segments/__init__.py +3 -3
  100. birdnet_analyzer/segments/__main__.py +3 -3
  101. birdnet_analyzer/segments/cli.py +13 -14
  102. birdnet_analyzer/segments/core.py +81 -78
  103. birdnet_analyzer/segments/utils.py +383 -394
  104. birdnet_analyzer/species/__init__.py +3 -3
  105. birdnet_analyzer/species/__main__.py +3 -3
  106. birdnet_analyzer/species/cli.py +13 -14
  107. birdnet_analyzer/species/core.py +35 -35
  108. birdnet_analyzer/species/utils.py +74 -75
  109. birdnet_analyzer/train/__init__.py +3 -3
  110. birdnet_analyzer/train/__main__.py +3 -3
  111. birdnet_analyzer/train/cli.py +13 -14
  112. birdnet_analyzer/train/core.py +113 -113
  113. birdnet_analyzer/train/utils.py +877 -847
  114. birdnet_analyzer/translate.py +133 -104
  115. birdnet_analyzer/utils.py +426 -419
  116. {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.0.1.dist-info}/METADATA +137 -129
  117. birdnet_analyzer-2.0.1.dist-info/RECORD +125 -0
  118. {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.0.1.dist-info}/WHEEL +1 -1
  119. {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.0.1.dist-info}/licenses/LICENSE +18 -18
  120. birdnet_analyzer-2.0.0.dist-info/RECORD +0 -117
  121. {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.0.1.dist-info}/entry_points.txt +0 -0
  122. {birdnet_analyzer-2.0.0.dist-info → birdnet_analyzer-2.0.1.dist-info}/top_level.txt +0 -0
birdnet_analyzer/audio.py CHANGED
@@ -1,372 +1,368 @@
1
- """Module containing audio helper functions."""
2
-
3
- import librosa
4
- import numpy as np
5
- import soundfile as sf
6
- from scipy.signal import firwin, kaiserord, lfilter, find_peaks
7
-
8
- import birdnet_analyzer.config as cfg
9
-
10
- RANDOM = np.random.RandomState(cfg.RANDOM_SEED)
11
-
12
-
13
- def open_audio_file(path: str, sample_rate=48000, offset=0.0, duration=None, fmin=None, fmax=None, speed=1.0):
14
- """Open an audio file.
15
-
16
- Opens an audio file with librosa and the given settings.
17
-
18
- Args:
19
- path: Path to the audio file.
20
- sample_rate: The sample rate at which the file should be processed.
21
- offset: The starting offset.
22
- duration: Maximum duration of the loaded content.
23
- fmin: Minimum frequency for bandpass filter.
24
- fmax: Maximum frequency for bandpass filter.
25
- speed: Speed factor for audio playback.
26
-
27
- Returns:
28
- Returns the audio time series and the sampling rate.
29
- """
30
- # Open file with librosa (uses ffmpeg or libav)
31
- if speed == 1.0:
32
- sig, rate = librosa.load(
33
- path, sr=sample_rate, offset=offset, duration=duration, mono=True, res_type="kaiser_fast"
34
- )
35
-
36
- else:
37
- # Load audio with original sample rate
38
- sig, rate = librosa.load(path, sr=None, offset=offset, duration=duration, mono=True)
39
-
40
- # Resample with "fake" sample rate
41
- sig = librosa.resample(sig, orig_sr=int(rate * speed), target_sr=sample_rate, res_type="kaiser_fast")
42
- rate = sample_rate
43
-
44
- # Bandpass filter
45
- if fmin is not None and fmax is not None:
46
- sig = bandpass(sig, rate, fmin, fmax)
47
- # sig = bandpassKaiserFIR(sig, rate, fmin, fmax)
48
-
49
- return sig, rate
50
-
51
-
52
- def get_audio_file_length(path):
53
- """
54
- Get the length of an audio file in seconds.
55
-
56
- Args:
57
- path (str): The file path to the audio file.
58
-
59
- Returns:
60
- float: The duration of the audio file in seconds.
61
- """
62
- # Open file with librosa (uses ffmpeg or libav)
63
-
64
- return librosa.get_duration(filename=path, sr=None)
65
-
66
-
67
- def get_sample_rate(path: str):
68
- """
69
- Get the sample rate of an audio file.
70
-
71
- Args:
72
- path (str): The file path to the audio file.
73
-
74
- Returns:
75
- int: The sample rate of the audio file.
76
- """
77
- return librosa.get_samplerate(path)
78
-
79
-
80
- def save_signal(sig, fname: str, rate=48000):
81
- """Saves a signal to file.
82
-
83
- Args:
84
- sig: The signal to be saved.
85
- fname: The file path.
86
-
87
- Returns:
88
- None
89
- """
90
-
91
- sf.write(fname, sig, rate, "PCM_16")
92
-
93
-
94
- def pad(sig, seconds, srate, amount=None):
95
- """Creates a noise vector with the given shape.
96
-
97
- Args:
98
- sig: The original audio signal.
99
- shape: Shape of the noise.
100
- amount: The noise intensity.
101
-
102
- Returns:
103
- An numpy array of noise with the given shape.
104
- """
105
-
106
- target_len = int(srate * seconds)
107
-
108
- if len(sig) < target_len:
109
- noise_shape = target_len - len(sig)
110
-
111
- if not cfg.USE_NOISE:
112
- noise = np.zeros(noise_shape, dtype=sig.dtype)
113
- else:
114
- # Random noise intensity
115
- if amount is None:
116
- amount = RANDOM.uniform(0.1, 0.5)
117
-
118
- # Create Gaussian noise
119
- try:
120
- noise = RANDOM.normal(min(sig) * amount, max(sig) * amount, noise_shape).astype(sig.dtype)
121
- except:
122
- noise = np.zeros(noise_shape, dtype=sig.dtype)
123
-
124
- return np.concatenate((sig, noise))
125
-
126
- return sig
127
-
128
-
129
- def split_signal(sig, rate, seconds, overlap, minlen, amount=None):
130
- """Split signal with overlap.
131
-
132
- Args:
133
- sig: The original signal to be split.
134
- rate: The sampling rate.
135
- seconds: The duration of a segment.
136
- overlap: The overlapping seconds of segments.
137
- minlen: Minimum length of a split.
138
-
139
- Returns:
140
- A list of splits.
141
- """
142
-
143
- # Split signal to chunks of duration with overlap, whereas each chunk still has minimum duration of signal
144
- if rate is None or rate <= 0:
145
- rate = cfg.SAMPLE_RATE
146
- if seconds is None or seconds <= 0:
147
- seconds = cfg.SIG_LENGTH
148
- if overlap is None or overlap < 0:
149
- overlap = cfg.SIG_OVERLAP
150
- if minlen is None or minlen <= 0 or minlen > seconds:
151
- minlen = cfg.SIG_MINLEN
152
-
153
- # Make sure overlap is smaller then signal duration
154
- if overlap >= seconds:
155
- overlap = seconds - 0.01
156
-
157
- # Number of frames per chunk, per step and per minimum signal
158
- chunksize = int(rate * seconds)
159
- stepsize = int(rate * (seconds - overlap))
160
- minsize = int(rate * minlen)
161
-
162
- # Start of last chunk
163
- lastchunkpos = int((sig.size - chunksize + stepsize - 1) / stepsize) * stepsize
164
- # Make sure at least one chunk is returned
165
- if lastchunkpos < 0:
166
- lastchunkpos = 0
167
- # Omit last chunk if minimum signal duration is underrun
168
- elif sig.size - lastchunkpos < minsize:
169
- lastchunkpos = lastchunkpos - stepsize
170
-
171
- # Append noise or empty signal of chunk duration, so all splits have desired length
172
- if not cfg.USE_NOISE:
173
- noise = np.zeros(shape=chunksize, dtype=sig.dtype)
174
- else:
175
- # Random noise intensity
176
- if amount is None:
177
- amount = RANDOM.uniform(0.1, 0.5)
178
- # Create Gaussian noise
179
- try:
180
- noise = RANDOM.normal(loc=min(sig) * amount, scale=max(sig) * amount, size=chunksize).astype(sig.dtype)
181
- except:
182
- noise = np.zeros(shape=chunksize, dtype=sig.dtype)
183
- data = np.concatenate((sig, noise))
184
-
185
- # Split signal with overlap
186
- sig_splits = []
187
- for i in range(0, 1 + lastchunkpos, stepsize):
188
- sig_splits.append(data[i : i + chunksize])
189
-
190
- return sig_splits
191
-
192
-
193
- def crop_center(sig, rate, seconds):
194
- """Crop signal to center.
195
-
196
- Args:
197
- sig: The original signal.
198
- rate: The sampling rate.
199
- seconds: The length of the signal.
200
-
201
- Returns:
202
- The cropped signal.
203
- """
204
- if len(sig) > int(seconds * rate):
205
- start = int((len(sig) - int(seconds * rate)) / 2)
206
- end = start + int(seconds * rate)
207
- sig = sig[start:end]
208
-
209
- # Pad with noise
210
- else:
211
- sig = pad(sig, seconds, rate, 0.5)
212
-
213
- return sig
214
-
215
- def smart_crop_signal(sig, rate, sig_length, sig_overlap, sig_minlen):
216
- """Smart crop audio signal based on peak detection.
217
-
218
- This function analyzes the audio signal to find peaks in energy/amplitude,
219
- which are more likely to contain relevant target signals (e.g., bird calls).
220
- Only the audio segments with the highest energy peaks are returned.
221
-
222
- Args:
223
- sig: The audio signal.
224
- rate: The sample rate of the audio signal.
225
- sig_length: The desired length of each snippet in seconds.
226
- sig_overlap: The overlap between snippets in seconds.
227
- sig_minlen: The minimum length of a snippet in seconds.
228
-
229
- Returns:
230
- A list of audio snippets with the highest energy/peaks.
231
- """
232
-
233
- # If signal is too short, just return it
234
- if len(sig) / rate <= sig_length:
235
- return [sig]
236
-
237
- # Calculate the window size in samples
238
- window_size = int(sig_length * rate)
239
- hop_size = int((sig_length - sig_overlap) * rate)
240
-
241
- # Split the signal into overlapping windows
242
- splits = split_signal(sig, rate, sig_length, sig_overlap, sig_minlen)
243
-
244
- if len(splits) <= 1:
245
- return splits
246
-
247
- # Calculate energy for each window
248
- energies = []
249
- for split in splits:
250
- # Calculate RMS energy
251
- energy = np.sqrt(np.mean(split**2))
252
- # Also consider peak values
253
- peak = np.max(np.abs(split))
254
- # Combine both metrics
255
- energies.append(energy * 0.7 + peak * 0.3) # Weighted combination
256
-
257
- # Find peaks in the energy curve
258
- # Smooth energies first to avoid small fluctuations
259
- smoothed_energies = np.convolve(energies, np.ones(3)/3, mode='same')
260
- peaks, _ = find_peaks(smoothed_energies, height=np.mean(smoothed_energies), distance=2)
261
-
262
- # If no clear peaks found, fall back to selecting top energy segments
263
- if len(peaks) < 2:
264
- # Sort segments by energy and take top segments (up to 3 or 1/3 of total, whichever is more)
265
- num_segments = max(3, len(splits) // 3)
266
- indices = np.argsort(energies)[-num_segments:]
267
- return [splits[i] for i in sorted(indices)]
268
-
269
- # Return the audio segments corresponding to the peaks
270
- peak_splits = [splits[i] for i in peaks]
271
-
272
- # If we have too many peaks, select the strongest ones
273
- if len(peak_splits) > 5:
274
- peak_energies = [energies[i] for i in peaks]
275
- sorted_indices = np.argsort(peak_energies)[::-1] # Sort in descending order
276
- peak_splits = [peak_splits[i] for i in sorted_indices[:5]] # Take top 5
277
-
278
- return peak_splits
279
-
280
-
281
- def bandpass(sig, rate, fmin, fmax, order=5):
282
- """
283
- Apply a bandpass filter to the input signal.
284
-
285
- Args:
286
- sig (numpy.ndarray): The input signal to be filtered.
287
- rate (int): The sampling rate of the input signal.
288
- fmin (float): The minimum frequency for the bandpass filter.
289
- fmax (float): The maximum frequency for the bandpass filter.
290
- order (int, optional): The order of the filter. Default is 5.
291
-
292
- Returns:
293
- numpy.ndarray: The filtered signal as a float32 array.
294
- """
295
- # Check if we have to bandpass at all
296
- if fmin == cfg.SIG_FMIN and fmax == cfg.SIG_FMAX or fmin > fmax:
297
- return sig
298
-
299
- from scipy.signal import butter, lfilter
300
-
301
- nyquist = 0.5 * rate
302
-
303
- # Highpass?
304
- if fmin > cfg.SIG_FMIN and fmax == cfg.SIG_FMAX:
305
- low = fmin / nyquist
306
- b, a = butter(order, low, btype="high")
307
- sig = lfilter(b, a, sig)
308
-
309
- # Lowpass?
310
- elif fmin == cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
311
- high = fmax / nyquist
312
- b, a = butter(order, high, btype="low")
313
- sig = lfilter(b, a, sig)
314
-
315
- # Bandpass?
316
- elif fmin > cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
317
- low = fmin / nyquist
318
- high = fmax / nyquist
319
- b, a = butter(order, [low, high], btype="band")
320
- sig = lfilter(b, a, sig)
321
-
322
- return sig.astype("float32")
323
-
324
-
325
- # Raven is using Kaiser window FIR filter, so we try to emulate it.
326
- # Raven uses the Window method for FIR filter design.
327
- # A Kaiser window is used with a default transition bandwidth of 0.02 times
328
- # the Nyquist frequency and a default stop band attenuation of 100 dB.
329
- # For a complete description of this method, see Discrete-Time Signal Processing
330
- # (Second Edition), by Alan Oppenheim, Ronald Schafer, and John Buck, Prentice Hall 1998, pp. 474-476.
331
- def bandpass_kaiser_fir(sig, rate, fmin, fmax, width=0.02, stopband_attenuation_db=100):
332
- """
333
- Applies a bandpass filter to the given signal using a Kaiser window FIR filter.
334
- Args:
335
- sig (numpy.ndarray): The input signal to be filtered.
336
- rate (int): The sample rate of the input signal.
337
- fmin (float): The minimum frequency of the bandpass filter.
338
- fmax (float): The maximum frequency of the bandpass filter.
339
- width (float, optional): The transition width of the filter. Default is 0.02.
340
- stopband_attenuation_db (float, optional): The desired attenuation in the stopband, in decibels. Default is 100.
341
- Returns:
342
- numpy.ndarray: The filtered signal as a float32 numpy array.
343
- """
344
- # Check if we have to bandpass at all
345
- if fmin == cfg.SIG_FMIN and fmax == cfg.SIG_FMAX or fmin > fmax:
346
- return sig
347
-
348
- nyquist = 0.5 * rate
349
-
350
- # Calculate the order and Kaiser parameter for the desired specifications.
351
- N, beta = kaiserord(stopband_attenuation_db, width)
352
-
353
- # Highpass?
354
- if fmin > cfg.SIG_FMIN and fmax == cfg.SIG_FMAX:
355
- low = fmin / nyquist
356
- taps = firwin(N, low, window=("kaiser", beta), pass_zero=False)
357
-
358
- # Lowpass?
359
- elif fmin == cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
360
- high = fmax / nyquist
361
- taps = firwin(N, high, window=("kaiser", beta), pass_zero=True)
362
-
363
- # Bandpass?
364
- elif fmin > cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
365
- low = fmin / nyquist
366
- high = fmax / nyquist
367
- taps = firwin(N, [low, high], window=("kaiser", beta), pass_zero=False)
368
-
369
- # Apply the filter to the signal.
370
- sig = lfilter(taps, 1.0, sig)
371
-
372
- return sig.astype("float32")
1
+ """Module containing audio helper functions."""
2
+
3
+ import librosa
4
+ import numpy as np
5
+ import soundfile as sf
6
+ from scipy.signal import find_peaks, firwin, kaiserord, lfilter
7
+
8
+ import birdnet_analyzer.config as cfg
9
+
10
+ RANDOM = np.random.RandomState(cfg.RANDOM_SEED)
11
+
12
+
13
def open_audio_file(path: str, sample_rate=48000, offset=0.0, duration=None, fmin=None, fmax=None, speed=1.0):
    """Load an audio file as a mono signal, optionally speed-changed and band-limited.

    The file is decoded with librosa. When ``speed`` differs from 1.0 the file
    is first loaded at its native sample rate and then resampled while
    pretending its rate was ``rate * speed``; this changes the playback speed
    of the resulting signal.

    Args:
        path: Path to the audio file.
        sample_rate: The sample rate at which the file should be processed.
        offset: The starting offset.
        duration: Maximum duration of the loaded content.
        fmin: Minimum frequency for the bandpass filter.
        fmax: Maximum frequency for the bandpass filter.
        speed: Speed factor for audio playback.

    Returns:
        A ``(signal, rate)`` tuple with the audio time series and its sampling rate.
    """
    if speed != 1.0:
        # Decode at the file's own rate first ...
        raw, native_rate = librosa.load(path, sr=None, offset=offset, duration=duration, mono=True)

        # ... then resample from a "fake" source rate to apply the speed factor.
        sig = librosa.resample(raw, orig_sr=int(native_rate * speed), target_sr=sample_rate, res_type="kaiser_fast")
        rate = sample_rate
    else:
        sig, rate = librosa.load(
            path, sr=sample_rate, offset=offset, duration=duration, mono=True, res_type="kaiser_fast"
        )

    # The filter is only applied when both band edges were supplied.
    if fmin is None or fmax is None:
        return sig, rate

    return bandpass(sig, rate, fmin, fmax), rate
50
+
51
+
52
def get_audio_file_length(path):
    """Get the length of an audio file in seconds.

    Args:
        path (str): The file path to the audio file.

    Returns:
        float: The duration of the audio file in seconds.
    """
    try:
        # librosa 0.10 renamed the ``filename`` keyword to ``path`` and
        # deprecated the old spelling.
        return librosa.get_duration(path=path, sr=None)
    except TypeError:
        # Older librosa releases only accept the keyword under its old name.
        return librosa.get_duration(filename=path, sr=None)
65
+
66
+
67
def get_sample_rate(path: str):
    """Look up the sample rate of an audio file.

    Args:
        path (str): The file path to the audio file.

    Returns:
        int: The sample rate reported by ``librosa.get_samplerate`` for the file.
    """
    native_rate = librosa.get_samplerate(path)
    return native_rate
78
+
79
+
80
def save_signal(sig, fname: str, rate=48000):
    """Write a signal to an audio file using the ``PCM_16`` subtype.

    Args:
        sig: The signal to be saved.
        fname: The file path.
        rate: The sample rate written to the file.

    Returns:
        None
    """
    sf.write(fname, sig, samplerate=rate, subtype="PCM_16")
92
+
93
+
94
def pad(sig, seconds, srate, amount=None):
    """Pad a signal at its end so that it is ``seconds`` long.

    A signal that already has at least ``int(srate * seconds)`` samples is
    returned unchanged. A shorter signal gets a tail appended: zeros when
    ``cfg.USE_NOISE`` is false, otherwise Gaussian noise drawn from the
    module-level ``RANDOM`` generator.

    Args:
        sig: The original audio signal (numpy array; its dtype is reused for the padding).
        seconds: The target duration in seconds.
        srate: The sampling rate used to convert ``seconds`` into a sample count.
        amount: The noise intensity. If None, a value between 0.1 and 0.5 is
            drawn at random. Only used when ``cfg.USE_NOISE`` is true.

    Returns:
        The signal itself if it is already long enough, otherwise a new array
        consisting of the signal followed by the padding.
    """
    target_len = int(srate * seconds)
    missing = target_len - len(sig)

    # Nothing to do for signals that are already long enough.
    if missing <= 0:
        return sig

    if not cfg.USE_NOISE:
        noise = np.zeros(missing, dtype=sig.dtype)
    else:
        # Random noise intensity
        if amount is None:
            amount = RANDOM.uniform(0.1, 0.5)

        # Gaussian noise with mean ``min(sig) * amount`` and standard deviation
        # ``max(sig) * amount``. Falls back to silence when that cannot be
        # generated (e.g. empty signal, or a negative standard deviation).
        # ``Exception`` instead of a bare ``except`` keeps the best-effort
        # fallback without swallowing KeyboardInterrupt/SystemExit.
        try:
            noise = RANDOM.normal(np.min(sig) * amount, np.max(sig) * amount, missing).astype(sig.dtype)
        except Exception:
            noise = np.zeros(missing, dtype=sig.dtype)

    return np.concatenate((sig, noise))
127
+
128
+
129
def split_signal(sig, rate, seconds, overlap, minlen, amount=None):
    """Split a signal into equally long, overlapping chunks.

    Invalid or missing parameters fall back to the defaults in ``cfg``. The
    signal is extended by one chunk of zeros (or Gaussian noise when
    ``cfg.USE_NOISE`` is true) so that the final chunk has full length too.

    Args:
        sig: The original signal to be split (numpy array).
        rate: The sampling rate. Falls back to ``cfg.SAMPLE_RATE`` if None or <= 0.
        seconds: The duration of a segment. Falls back to ``cfg.SIG_LENGTH`` if None or <= 0.
        overlap: The overlapping seconds of segments. Falls back to
            ``cfg.SIG_OVERLAP`` if None or < 0; reduced to ``seconds - 0.01``
            if it is not smaller than ``seconds``.
        minlen: Minimum length of a split. Falls back to ``cfg.SIG_MINLEN`` if
            None, <= 0 or larger than ``seconds``.
        amount: The noise intensity. If None, a value between 0.1 and 0.5 is
            drawn at random. Only used when ``cfg.USE_NOISE`` is true.

    Returns:
        A list of splits.
    """
    # Split signal to chunks of duration with overlap, whereas each chunk still has minimum duration of signal
    if rate is None or rate <= 0:
        rate = cfg.SAMPLE_RATE
    if seconds is None or seconds <= 0:
        seconds = cfg.SIG_LENGTH
    if overlap is None or overlap < 0:
        overlap = cfg.SIG_OVERLAP
    if minlen is None or minlen <= 0 or minlen > seconds:
        minlen = cfg.SIG_MINLEN

    # Make sure overlap is smaller than signal duration
    if overlap >= seconds:
        overlap = seconds - 0.01

    # Number of frames per chunk, per step and per minimum signal
    chunksize = int(rate * seconds)
    stepsize = int(rate * (seconds - overlap))
    minsize = int(rate * minlen)

    # Start of last chunk
    lastchunkpos = int((sig.size - chunksize + stepsize - 1) / stepsize) * stepsize
    # Make sure at least one chunk is returned
    if lastchunkpos < 0:
        lastchunkpos = 0
    # Omit last chunk if minimum signal duration is underrun
    elif sig.size - lastchunkpos < minsize:
        lastchunkpos = lastchunkpos - stepsize

    # Append noise or empty signal of chunk duration, so all splits have desired length
    if not cfg.USE_NOISE:
        noise = np.zeros(shape=chunksize, dtype=sig.dtype)
    else:
        # Random noise intensity
        if amount is None:
            amount = RANDOM.uniform(0.1, 0.5)
        # Create Gaussian noise; fall back to silence if it cannot be generated.
        # ``Exception`` instead of a bare ``except`` keeps the best-effort
        # fallback without swallowing KeyboardInterrupt/SystemExit.
        try:
            noise = RANDOM.normal(
                loc=np.min(sig) * amount, scale=np.max(sig) * amount, size=chunksize
            ).astype(sig.dtype)
        except Exception:
            noise = np.zeros(shape=chunksize, dtype=sig.dtype)
    data = np.concatenate((sig, noise))

    # Split signal with overlap. ``lastchunkpos`` is itself a valid chunk start,
    # so the (exclusive) range stop must be ``lastchunkpos + 1``; stopping at
    # ``lastchunkpos`` drops the final chunk and yields nothing at all for
    # signals that fit into a single chunk.
    return [data[i : i + chunksize] for i in range(0, lastchunkpos + 1, stepsize)]
190
+
191
+
192
def crop_center(sig, rate, seconds):
    """Cut the middle ``seconds`` out of a signal, padding it if it is too short.

    Args:
        sig: The original signal.
        rate: The sampling rate.
        seconds: The length of the returned signal in seconds.

    Returns:
        The centered crop for signals longer than the target length, otherwise
        the result of ``pad`` called with a noise amount of 0.5.
    """
    target = int(seconds * rate)

    # Too short (or exactly long enough): hand over to pad()
    if len(sig) <= target:
        return pad(sig, seconds, rate, 0.5)

    start = int((len(sig) - target) / 2)
    return sig[start : start + target]
213
+
214
+
215
def smart_crop_signal(sig, rate, sig_length, sig_overlap, sig_minlen):
    """Pick the most energetic snippets of a signal via peak detection.

    The signal is split into overlapping windows, every window is scored by a
    weighted mix of its RMS energy and its absolute peak value, and the windows
    sitting on peaks of the smoothed score curve are returned. Such windows are
    more likely to contain relevant target signals (e.g., bird calls).

    Args:
        sig: The audio signal.
        rate: The sample rate of the audio signal.
        sig_length: The desired length of each snippet in seconds.
        sig_overlap: The overlap between snippets in seconds.
        sig_minlen: The minimum length of a snippet in seconds.

    Returns:
        A list of audio snippets. A signal not longer than ``sig_length`` is
        returned as a single-element list without further processing.
    """
    # Short signals are passed through untouched
    if len(sig) / rate <= sig_length:
        return [sig]

    splits = split_signal(sig, rate, sig_length, sig_overlap, sig_minlen)
    if len(splits) <= 1:
        return splits

    # Score per window: 70% RMS energy, 30% absolute peak
    energies = [np.sqrt(np.mean(chunk**2)) * 0.7 + np.max(np.abs(chunk)) * 0.3 for chunk in splits]

    # Smooth with a 3-point moving average before looking for peaks,
    # so small fluctuations do not count
    kernel = np.ones(3) / 3
    smoothed = np.convolve(energies, kernel, mode="same")
    peaks, _ = find_peaks(smoothed, height=np.mean(smoothed), distance=2)

    # Fewer than two peaks: fall back to the highest-scoring windows
    # (3 or a third of all windows, whichever is more), in their original order
    if len(peaks) < 2:
        num_segments = max(3, len(splits) // 3)
        top = np.argsort(energies)[-num_segments:]
        return [splits[i] for i in sorted(top)]

    # Up to five peaks: return all of them in order of appearance
    if len(peaks) <= 5:
        return [splits[i] for i in peaks]

    # More than five peaks: keep the five strongest, strongest first
    strongest = np.argsort([energies[i] for i in peaks])[::-1][:5]
    return [splits[peaks[j]] for j in strongest]
275
+
276
+
277
def bandpass(sig, rate, fmin, fmax, order=5):
    """Filter a signal with a Butterworth high-, low- or bandpass.

    Which filter is used depends on how ``fmin``/``fmax`` relate to
    ``cfg.SIG_FMIN``/``cfg.SIG_FMAX``: only ``fmin`` raised gives a highpass,
    only ``fmax`` lowered gives a lowpass, both changed gives a bandpass.

    Args:
        sig (numpy.ndarray): The input signal to be filtered.
        rate (int): The sampling rate of the input signal.
        fmin (float): The minimum frequency for the bandpass filter.
        fmax (float): The maximum frequency for the bandpass filter.
        order (int, optional): The order of the filter. Default is 5.

    Returns:
        numpy.ndarray: The signal as a float32 array. If both limits equal the
        configured defaults, or ``fmin > fmax``, the input is returned as-is
        (no dtype conversion).
    """
    # Check if we have to filter at all
    limits_are_defaults = fmin == cfg.SIG_FMIN and fmax == cfg.SIG_FMAX
    if limits_are_defaults or fmin > fmax:
        return sig

    from scipy.signal import butter, lfilter

    nyquist = 0.5 * rate

    # Select cutoff(s) and filter type; None means "no matching case"
    if fmin > cfg.SIG_FMIN and fmax == cfg.SIG_FMAX:
        design = (fmin / nyquist, "high")
    elif fmin == cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        design = (fmax / nyquist, "low")
    elif fmin > cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        design = ([fmin / nyquist, fmax / nyquist], "band")
    else:
        design = None

    if design is not None:
        cutoff, btype = design
        b, a = butter(order, cutoff, btype=btype)
        sig = lfilter(b, a, sig)

    return sig.astype("float32")
319
+
320
+
321
+ # Raven is using Kaiser window FIR filter, so we try to emulate it.
322
+ # Raven uses the Window method for FIR filter design.
323
+ # A Kaiser window is used with a default transition bandwidth of 0.02 times
324
+ # the Nyquist frequency and a default stop band attenuation of 100 dB.
325
+ # For a complete description of this method, see Discrete-Time Signal Processing
326
+ # (Second Edition), by Alan Oppenheim, Ronald Schafer, and John Buck, Prentice Hall 1998, pp. 474-476.
327
def bandpass_kaiser_fir(sig, rate, fmin, fmax, width=0.02, stopband_attenuation_db=100):
    """Filter a signal with a Kaiser-window FIR high-, low- or bandpass.

    Which filter is used depends on how ``fmin``/``fmax`` relate to
    ``cfg.SIG_FMIN``/``cfg.SIG_FMAX``: only ``fmin`` raised gives a highpass,
    only ``fmax`` lowered gives a lowpass, both changed gives a bandpass. Any
    other combination leaves the samples unfiltered, as ``bandpass`` does.

    Args:
        sig (numpy.ndarray): The input signal to be filtered.
        rate (int): The sample rate of the input signal.
        fmin (float): The minimum frequency of the bandpass filter.
        fmax (float): The maximum frequency of the bandpass filter.
        width (float, optional): The transition width of the filter. Default is 0.02.
        stopband_attenuation_db (float, optional): The desired attenuation in the stopband, in decibels. Default is 100.

    Returns:
        numpy.ndarray: The signal as a float32 numpy array. If both limits
        equal the configured defaults, or ``fmin > fmax``, the input is
        returned as-is (no dtype conversion).
    """
    # Check if we have to bandpass at all
    if (fmin == cfg.SIG_FMIN and fmax == cfg.SIG_FMAX) or fmin > fmax:
        return sig

    nyquist = 0.5 * rate

    # Calculate the order and Kaiser parameter for the desired specifications.
    N, beta = kaiserord(stopband_attenuation_db, width)
    window = ("kaiser", beta)

    # Highpass?
    if fmin > cfg.SIG_FMIN and fmax == cfg.SIG_FMAX:
        # A highpass passes the Nyquist frequency, which firwin only allows
        # for an odd number of taps, so round an even count up by one.
        taps = firwin(N | 1, fmin / nyquist, window=window, pass_zero=False)

    # Lowpass?
    elif fmin == cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        taps = firwin(N, fmax / nyquist, window=window, pass_zero=True)

    # Bandpass?
    elif fmin > cfg.SIG_FMIN and fmax < cfg.SIG_FMAX:
        taps = firwin(N, [fmin / nyquist, fmax / nyquist], window=window, pass_zero=False)

    # No case matched (limits outside the configured range): there is no
    # filter to apply, so only convert the dtype instead of failing on an
    # unbound ``taps``.
    else:
        return sig.astype("float32")

    # Apply the filter to the signal.
    return lfilter(taps, 1.0, sig).astype("float32")