audio2midi 0.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio2midi/__init__.py +0 -0
- audio2midi/basic_pitch_pitch_detector.py +783 -0
- audio2midi/crepe_pitch_detector.py +130 -0
- audio2midi/librosa_pitch_detector.py +153 -0
- audio2midi/melodia_pitch_detector.py +58 -0
- audio2midi/pop2piano.py +2604 -0
- audio2midi/py.typed +0 -0
- audio2midi/violin_pitch_detector.py +1281 -0
- audio2midi-0.1.0.dist-info/METADATA +100 -0
- audio2midi-0.1.0.dist-info/RECORD +11 -0
- audio2midi-0.1.0.dist-info/WHEEL +5 -0
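The three hunks reproduced below are crepe_pitch_detector.py (+130), librosa_pitch_detector.py (+153) and melodia_pitch_detector.py (+58); the basic_pitch, pop2piano and violin modules are listed above but their contents are not shown in this excerpt. A minimal import sketch, assuming the package installs under the name in the wheel filename:

# Hypothetical sketch; assumes `pip install audio2midi` and that the classes shown
# in the hunks below live in the modules whose added line counts match the listing.
from audio2midi.crepe_pitch_detector import Crepe
from audio2midi.librosa_pitch_detector import Normal_Pitch_Det, Guitar_Pitch_Det
from audio2midi.melodia_pitch_detector import Melodia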
audio2midi/crepe_pitch_detector.py
@@ -0,0 +1,130 @@
from math import ceil as math_ceil
from typing import Callable
from numpy.lib.stride_tricks import as_strided
from keras.layers import Input, Reshape, Conv2D, BatchNormalization
from keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
from keras.models import Model
from keras.callbacks import Callback
from hmmlearn.hmm import CategoricalHMM
from librosa import load as librosa_load
from pretty_midi_fix import PrettyMIDI, PitchBend, Note, Instrument
import numpy as np
from huggingface_hub import hf_hub_download

class PredictProgressCallback(Callback):
    def __init__(self, total_batches, progress_callback: Callable[[int, int], None] = None):
        super().__init__()
        self.total_batches = total_batches
        self.progress_callback = progress_callback
    def on_predict_begin(self, logs=None):
        if self.progress_callback:  # callback is optional
            self.progress_callback(0, self.total_batches)
    def on_predict_batch_end(self, batch, logs=None):
        if self.progress_callback:
            self.progress_callback(batch, self.total_batches)
    def on_predict_end(self, logs=None):
        if self.progress_callback:
            self.progress_callback(self.total_batches, self.total_batches)

class Crepe():
    def __init__(self, model_type="full", model_path=None):
        if not model_path:
            model_path = hf_hub_download("shethjenil/Audio2Midi_Models", f"crepe_{model_type}.h5")
        model_type_importance = {'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32}[model_type]
        filters = [n * model_type_importance for n in [32, 4, 4, 4, 8, 16]]
        widths = [512, 64, 64, 64, 64, 64]
        strides = [(4, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]
        x = Input(shape=(1024,), name='input', dtype='float32')
        y = Reshape(target_shape=(1024, 1, 1), name='input-reshape')(x)
        layers = [1, 2, 3, 4, 5, 6]
        for l, f, w, s in zip(layers, filters, widths, strides):
            y = Conv2D(f, (w, 1), strides=s, padding='same', activation='relu', name="conv%d" % l)(y)
            y = BatchNormalization(name="conv%d-BN" % l)(y)
            y = MaxPool2D(pool_size=(2, 1), strides=None, padding='valid', name="conv%d-maxpool" % l)(y)
            y = Dropout(0.25, name="conv%d-dropout" % l)(y)
        y = Permute((2, 1, 3), name="transpose")(y)
        y = Flatten(name="flatten")(y)
        y = Dense(360, activation='sigmoid', name="classifier")(y)
        self.model = Model(inputs=x, outputs=y)
        self.model.load_weights(model_path)
        self.model.compile('adam', 'binary_crossentropy')
        self.cents_mapping = (np.linspace(0, 7180, 360) + 1997.3794084376191)

    def to_local_average_cents(self, salience, center=None):
        if salience.ndim == 1:
            if center is None:
                center = int(np.argmax(salience))
            start = max(0, center - 4)
            end = min(len(salience), center + 5)
            salience = salience[start:end]
            product_sum = np.sum(salience * self.cents_mapping[start:end])
            weight_sum = np.sum(salience)
            return product_sum / weight_sum
        if salience.ndim == 2:
            return np.array([self.to_local_average_cents(salience[i, :]) for i in range(salience.shape[0])])
        raise Exception("label should be either 1d or 2d ndarray")

    def to_viterbi_cents(self, salience):
        starting = np.ones(360) / 360
        xx, yy = np.meshgrid(range(360), range(360))
        transition = np.maximum(12 - abs(xx - yy), 0)
        transition = transition / np.sum(transition, axis=1)[:, None]
        self_emission = 0.1
        emission = (np.eye(360) * self_emission + np.ones(shape=(360, 360)) * ((1 - self_emission) / 360))
        model = CategoricalHMM(360, starting, transition)
        model.startprob_, model.transmat_, model.emissionprob_ = starting, transition, emission
        observations = np.argmax(salience, axis=1)
        path = model.predict(observations.reshape(-1, 1), [len(observations)])
        return np.array([self.to_local_average_cents(salience[i, :], path[i]) for i in range(len(observations))])

    def get_activation(self, audio: np.ndarray, center, step_size, progress_callback, batch_size):
        if center:
            audio = np.pad(audio, 512, mode='constant', constant_values=0)
        hop_length = int(16000 * step_size / 1000)
        n_frames = 1 + int((len(audio) - 1024) / hop_length)
        frames = as_strided(audio, shape=(1024, n_frames), strides=(audio.itemsize, hop_length * audio.itemsize))
        frames = frames.transpose().copy()
        frames -= np.mean(frames, axis=1)[:, np.newaxis]
        frames /= np.clip(np.std(frames, axis=1)[:, np.newaxis], 1e-8, None)
        return self.model.predict(frames, batch_size, 0, callbacks=[PredictProgressCallback(math_ceil(len(frames) / batch_size), progress_callback)])

    def model_predict(self, audio: np.ndarray, viterbi, center, step_size, progress_callback, batch_size):
        activation = self.get_activation(audio.astype(np.float32), center, step_size, progress_callback, batch_size)
        confidence = activation.max(axis=1)
        cents = self.to_viterbi_cents(activation) if viterbi else self.to_local_average_cents(activation)
        frequency = 10 * 2 ** (cents / 1200)
        frequency[np.isnan(frequency)] = 0
        time = np.arange(confidence.shape[0]) * step_size / 1000.0
        return time, frequency, confidence

    def predict(self, audio_path, viterbi=False, center=True, step_size=10, min_confidence=0.8, batch_size=32, progress_callback: Callable[[int, int], None] = None, output_file="output.mid"):
        time, frequency, confidence = self.model_predict(librosa_load(audio_path, sr=16000, mono=True)[0], viterbi, center, step_size, progress_callback, batch_size)
        mask = confidence > min_confidence
        times = time[mask]
        frequencies = frequency[mask]
        midi_floats = 69 + 12 * np.log2(frequencies / 440.0)
        midi_notes = np.round(midi_floats).astype(int)
        pitch_offsets = midi_floats - midi_notes  # in semitones
        midi = PrettyMIDI()
        instrument = Instrument(program=40)  # e.g., Violin for pitch bend demo
        if len(times) > 0:
            current_note = midi_notes[0]
            note_start = times[0]
            for i in range(1, len(times)):
                if midi_notes[i] != current_note or i == len(times) - 1:
                    note_end = times[i]
                    if 0 <= current_note <= 127:
                        note = Note(velocity=100, pitch=int(current_note), start=note_start, end=note_end)
                        instrument.notes.append(note)
                        seg_mask = (times >= note_start) & (times <= note_end)
                        seg_times = times[seg_mask]
                        seg_offsets = pitch_offsets[seg_mask]
                        for t, offset in zip(seg_times, seg_offsets):
                            # Assuming pitch bend range is +/- 2 semitones
                            bend_value = int(offset / 2.0 * 8192)  # Scale to -8192 to +8191
                            bend_value = np.clip(bend_value, -8192, 8191)
                            pb = PitchBend(pitch=bend_value, time=t)
                            instrument.pitch_bends.append(pb)
                    current_note = midi_notes[i]
                    note_start = times[i]
        midi.instruments.append(instrument)
        midi.write(output_file)
        return output_file
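A minimal usage sketch for the CREPE-based transcriber above. The audio path, output path, and callback are illustrative; the pretrained weights are fetched from the shethjenil/Audio2Midi_Models Hugging Face repo as in __init__, and the audio is resampled internally to 16 kHz mono.

# Illustrative only: "song.wav" / "song.mid" are placeholder paths.
from audio2midi.crepe_pitch_detector import Crepe

def on_progress(done, total):  # hypothetical callback matching Callable[[int, int], None]
    print(f"{done}/{total} batches")

crepe = Crepe(model_type="full")          # downloads crepe_full.h5 on first use
midi_path = crepe.predict(
    "song.wav",
    viterbi=False,                        # True enables HMM-smoothed decoding of the pitch track
    min_confidence=0.8,                   # frames below this confidence are dropped
    progress_callback=on_progress,
    output_file="song.mid",
)

Per the code above, frequencies are mapped to MIDI via 69 + 12*log2(f/440); the fractional remainder of each frame is written as pitch-bend events scaled to a +/- 2 semitone bend range, so an offset of +0.5 semitone becomes a bend value of about +2048.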
audio2midi/librosa_pitch_detector.py
@@ -0,0 +1,153 @@
from typing import Callable
import librosa
from pretty_midi_fix import Instrument, PrettyMIDI, Note
import numpy as np


class Normal_Pitch_Det:
    def smooth_pitch_sequence(self, pitches, magnitudes, threshold):
        midi_sequence = []
        for i in range(pitches.shape[1]):
            index = np.argmax(magnitudes[:, i])
            pitch_mag = magnitudes[index, i]
            pitch = pitches[index, i]
            if pitch_mag < threshold or np.isnan(pitch) or pitch <= 0:
                midi_sequence.append(None)
            else:
                midi_note = int(round(librosa.hz_to_midi(pitch)))
                midi_sequence.append(midi_note)
        return midi_sequence

    def clean_midi_sequence(self, sequence, min_note_length):
        cleaned = []
        current_note = None
        count = 0
        for note in sequence + [None]:
            if note == current_note:
                count += 1
            else:
                if current_note is not None and count >= min_note_length:
                    cleaned.extend([current_note] * count)
                else:
                    cleaned.extend([None] * count)
                current_note = note
                count = 1
        return cleaned

    def predict(self, input_file, tempo_bpm=120, hop_length=512, min_note_length=2, threshold=0.1, output_file="output.mid"):
        wav, sr = librosa.load(input_file)
        audio_duration = len(wav) / sr
        pitches, magnitudes = librosa.piptrack(y=wav, sr=sr, hop_length=hop_length)
        midi_sequence = self.clean_midi_sequence(self.smooth_pitch_sequence(pitches, magnitudes, threshold), min_note_length)
        time_per_frame = audio_duration / len(midi_sequence)
        pm = PrettyMIDI(initial_tempo=tempo_bpm)
        instrument = Instrument(program=0)  # Acoustic Grand Piano
        last_note = None
        start_time = 0
        for i, note in enumerate(midi_sequence):
            current_time = i * time_per_frame
            if note != last_note:
                if last_note is not None:
                    end_time = current_time
                    instrument.notes.append(Note(velocity=100, pitch=last_note, start=start_time, end=end_time))
                if note is not None:
                    start_time = current_time
                last_note = note

        if last_note is not None:
            end_time = len(midi_sequence) * time_per_frame
            instrument.notes.append(Note(velocity=100, pitch=last_note, start=start_time, end=end_time))

        pm.instruments.append(instrument)
        pm.write(output_file)
        return output_file

class Guitar_Pitch_Det:
    def __init__(self):
        # nfft=2048
        # overlap=0.5
        # self.HOP_LENGTH = int(nfft * (1 - overlap))
        self.FMIN = librosa.note_to_hz('C1')

    def calc_cqt(self, audio, sr, mag_exp):
        """Compute CQT and convert to dB."""
        return librosa.amplitude_to_db(np.abs(librosa.cqt(audio, sr=sr, hop_length=self.HOP_LENGTH, fmin=self.FMIN, n_bins=self.N_BINS, bins_per_octave=self.BINS_PER_OCTAVE)) ** mag_exp, ref=np.max)

    def cqt_thresholded(self, cqt_db, threshold_db):
        """Threshold CQT in dB."""
        cqt_copy = np.copy(cqt_db)
        cqt_copy[cqt_copy < threshold_db] = -120
        return cqt_copy

    def calc_onset(self, cqt_db, sr, pre_post_max, backtrack):
        """Detect onsets using the onset envelope from the thresholded CQT."""
        onset_env = librosa.onset.onset_strength(S=cqt_db, sr=sr, hop_length=self.HOP_LENGTH, aggregate=np.mean)
        onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, hop_length=self.HOP_LENGTH, units='frames', backtrack=backtrack, pre_max=pre_post_max, post_max=pre_post_max)
        boundaries = np.concatenate([[0], onset_frames, [cqt_db.shape[1]]])
        return boundaries, onset_env

    def estimate_segment_note(self, cqt_db, boundaries, i, sr, tempo_bpm, threshold_db, round_to_sixteenth):
        """Estimate pitch for one onset segment and generate note data."""
        n0, n1 = int(boundaries[i]), int(boundaries[i + 1])
        # Pass the averaged CQT segment directly to estimate_pitch
        segment_cqt = np.mean(cqt_db[:, n0:n1], axis=1)
        f0_info = self.estimate_pitch(segment_cqt, sr, threshold_db)
        return self.generate_note(cqt_db, tempo_bpm, f0_info, sr, n1 - n0, round_to_sixteenth)

    def generate_note(self, cqt_db, tempo_bpm, f0_info, sr, n_duration, round_to_sixteenth):
        """Generate MIDI note data (pitch, duration in beats, velocity) for a note or rest."""
        f0, amplitude = f0_info
        duration_beats = librosa.frames_to_time(n_duration, sr=sr, hop_length=self.HOP_LENGTH) * (tempo_bpm / 60)
        if round_to_sixteenth:
            duration_beats = round(duration_beats * 16) / 16
        # Remap amplitude based on the range of cqt_db values for MIDI velocity
        return None if f0 is None else int(np.round(librosa.hz_to_midi(f0))), duration_beats, int(np.clip(self.remap(amplitude, cqt_db.min(), cqt_db.max(), 0, 127), 0, 127))

    def estimate_pitch(self, segment_cqt, sr, threshold_db):
        """Estimate pitch from a CQT segment."""
        # Find the frequency bin with the maximum energy in the segment
        max_bin = np.argmax(segment_cqt)
        # Convert the bin index to frequency (Hz)
        pitch_hz = librosa.cqt_frequencies(n_bins=self.N_BINS, fmin=self.FMIN, bins_per_octave=self.BINS_PER_OCTAVE)[max_bin]
        amplitude = segment_cqt[max_bin]  # Use the amplitude from the CQT bin

        if pitch_hz is not None and amplitude > threshold_db:
            return pitch_hz, amplitude
        else:
            return None, 0

    def remap(self, x, in_min, in_max, out_min, out_max):
        """Remap a value or array from one range to another."""
        return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min

    def predict(self, audio_path, mag_exp=4, threshold_db=-61, pre_post_max=6, backtrack=False, round_to_sixteenth=False, hop_length=1024, n_bins=72, bins_per_octave=12, output_file="output.mid"):
        self.BINS_PER_OCTAVE = bins_per_octave
        self.HOP_LENGTH = hop_length
        self.N_BINS = n_bins
        audio, sr = librosa.load(audio_path, sr=None)
        cqt_db = self.calc_cqt(audio, sr, mag_exp)
        cqt_thresh = self.cqt_thresholded(cqt_db, threshold_db)
        boundaries, onset_env = self.calc_onset(cqt_thresh, sr, pre_post_max, backtrack)
        tempo_bpm, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr, hop_length=self.HOP_LENGTH)
        tempo_bpm = round(tempo_bpm[0])
        # Process all segments
        notes_data = [self.estimate_segment_note(cqt_db, boundaries, i, sr, tempo_bpm, threshold_db, round_to_sixteenth) for i in range(len(boundaries) - 1)]
        pm = PrettyMIDI(initial_tempo=tempo_bpm)
        piano = Instrument(program=40)
        note_time = 0.0
        for (pitch, duration, velocity) in notes_data:
            if pitch is not None:
                # Convert duration in beats to duration in seconds for PrettyMIDI
                duration_sec = duration * (60 / tempo_bpm)
                piano.notes.append(Note(velocity, pitch, note_time, note_time + duration_sec))
                note_time += duration_sec  # Increment note_time by duration in seconds
            else:
                # If it's a rest, just advance the time
                duration_sec = duration * (60 / tempo_bpm)
                note_time += duration_sec
        pm.instruments.append(piano)
        pm.write(output_file)
        return output_file
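Both librosa-based detectors expose the same predict(path) entry point. A short sketch with placeholder file names, using the default parameters from the signatures above; note that Guitar_Pitch_Det keeps only the single dominant CQT bin per onset segment, so its output is monophonic.

from audio2midi.librosa_pitch_detector import Normal_Pitch_Det, Guitar_Pitch_Det

# Frame-wise piptrack peak picking, cleaned into note segments
Normal_Pitch_Det().predict("melody.wav", tempo_bpm=120, threshold=0.1, output_file="melody.mid")

# Onset-segmented CQT transcription (guitar-oriented defaults)
Guitar_Pitch_Det().predict("guitar.wav", threshold_db=-61, hop_length=1024, output_file="guitar.mid")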
audio2midi/melodia_pitch_detector.py
@@ -0,0 +1,58 @@
import numpy as np
from scipy.signal import medfilt
from pretty_midi_fix import PrettyMIDI, Instrument, Note
from librosa import load as librosa_load
from vamp import collect as vamp_collect

class Melodia():
    def midi_to_notes(self, midi, fs, smooth, minduration, hop=128):
        if smooth > 0:
            filter_duration = smooth
            filter_size = int(filter_duration * fs / float(hop))
            if filter_size % 2 == 0:
                filter_size += 1
            midi_filt = medfilt(midi, filter_size)
        else:
            midi_filt = midi
        notes = []
        p_prev = 0
        duration = 0
        onset = 0
        for n, p in enumerate(midi_filt):
            if p == p_prev:
                duration += 1
            else:
                if p_prev > 0:
                    duration_sec = duration * hop / float(fs)
                    if duration_sec >= minduration:
                        onset_sec = onset * hop / float(fs)
                        notes.append((onset_sec, duration_sec, p_prev))
                onset = n
                duration = 1
                p_prev = p
        if p_prev > 0:
            duration_sec = duration * hop / float(fs)
            onset_sec = onset * hop / float(fs)
            notes.append((onset_sec, duration_sec, p_prev))
        return notes

    def hz2midi(self, hz: np.ndarray):
        hz_nonneg = hz.copy()
        idx = hz_nonneg <= 0
        hz_nonneg[idx] = 1
        midi = 69 + 12 * np.log2(hz_nonneg / 440.)
        midi[idx] = 0
        midi = np.round(midi)
        return midi

    def predict(self, audio, tempo=120, smooth=0.25, minduration=0.1, hop=128, output_file="output.mid"):
        data, sr = librosa_load(audio, sr=44100, mono=True)
        pm = PrettyMIDI(initial_tempo=tempo)
        instrument = Instrument(program=40)
        seconds_per_beat = 60.0 / tempo
        for onset_beats, duration_beats, pitch in self.midi_to_notes(self.hz2midi(np.insert(vamp_collect(data, sr, "mtg-melodia:melodia", parameters={"voicing": 0.2})['vector'][1], 0, [0] * 8)), 44100, smooth, minduration, hop):
            start = onset_beats * seconds_per_beat
            instrument.notes.append(Note(100, int(pitch), start, start + (duration_beats * seconds_per_beat)))
        pm.instruments.append(instrument)
        pm.write(output_file)
        return output_file
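The Melodia wrapper calls the mtg-melodia:melodia Vamp plugin through the vamp Python module, so the plugin must be installed and discoverable on the host (e.g., via a Vamp plugin search path) before predict() will run; audio is loaded at 44.1 kHz mono as shown above. A sketch with a placeholder input:

from audio2midi.melodia_pitch_detector import Melodia

# Requires the MTG Melodia Vamp plugin to be installed on the system.
Melodia().predict("vocals.wav", tempo=120, smooth=0.25, minduration=0.1, output_file="vocals.mid")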