audio2midi-0.1.0-py2.py3-none-any.whl

@@ -0,0 +1,130 @@
+ from math import ceil as math_ceil
+ from typing import Callable
+ from numpy.lib.stride_tricks import as_strided
+ from keras.layers import Input, Reshape, Conv2D, BatchNormalization
+ from keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
+ from keras.models import Model
+ from keras.callbacks import Callback
+ from hmmlearn.hmm import CategoricalHMM
+ from librosa import load as librosa_load
+ from pretty_midi_fix import PrettyMIDI, PitchBend, Note, Instrument
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+
+ class PredictProgressCallback(Callback):
+     def __init__(self, total_batches, progress_callback: Callable[[int, int], None] = None):
+         super().__init__()
+         self.total_batches = total_batches
+         # Fall back to a no-op so prediction also works when no progress callback is supplied
+         self.progress_callback = progress_callback or (lambda done, total: None)
+     def on_predict_begin(self, logs=None):
+         self.progress_callback(0, self.total_batches)
+     def on_predict_batch_end(self, batch, logs=None):
+         self.progress_callback(batch, self.total_batches)
+     def on_predict_end(self, logs=None):
+         self.progress_callback(self.total_batches, self.total_batches)
+
+
+ class Crepe:
+     def __init__(self, model_type="full", model_path=None):
+         if not model_path:
+             model_path = hf_hub_download("shethjenil/Audio2Midi_Models", f"crepe_{model_type}.h5")
+         model_type_importance = {'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32}[model_type]
+         filters = [n * model_type_importance for n in [32, 4, 4, 4, 8, 16]]
+         widths = [512, 64, 64, 64, 64, 64]
+         strides = [(4, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]
+         x = Input(shape=(1024,), name='input', dtype='float32')
+         y = Reshape(target_shape=(1024, 1, 1), name='input-reshape')(x)
+         layers = [1, 2, 3, 4, 5, 6]
+         for l, f, w, s in zip(layers, filters, widths, strides):
+             y = Conv2D(f, (w, 1), strides=s, padding='same', activation='relu', name="conv%d" % l)(y)
+             y = BatchNormalization(name="conv%d-BN" % l)(y)
+             y = MaxPool2D(pool_size=(2, 1), strides=None, padding='valid', name="conv%d-maxpool" % l)(y)
+             y = Dropout(0.25, name="conv%d-dropout" % l)(y)
+         y = Permute((2, 1, 3), name="transpose")(y)
+         y = Flatten(name="flatten")(y)
+         y = Dense(360, activation='sigmoid', name="classifier")(y)
+         self.model = Model(inputs=x, outputs=y)
+         self.model.load_weights(model_path)
+         self.model.compile('adam', 'binary_crossentropy')
+         self.cents_mapping = np.linspace(0, 7180, 360) + 1997.3794084376191
+
+     def to_local_average_cents(self, salience, center=None):
+         if salience.ndim == 1:
+             if center is None:
+                 center = int(np.argmax(salience))
+             start = max(0, center - 4)
+             end = min(len(salience), center + 5)
+             salience = salience[start:end]
+             product_sum = np.sum(salience * self.cents_mapping[start:end])
+             weight_sum = np.sum(salience)
+             return product_sum / weight_sum
+         if salience.ndim == 2:
+             return np.array([self.to_local_average_cents(salience[i, :]) for i in range(salience.shape[0])])
+         raise ValueError("salience should be either a 1d or 2d ndarray")
+
+     def to_viterbi_cents(self, salience):
+         starting = np.ones(360) / 360
+         xx, yy = np.meshgrid(range(360), range(360))
+         transition = np.maximum(12 - abs(xx - yy), 0)
+         transition = transition / np.sum(transition, axis=1)[:, None]
+         self_emission = 0.1
+         emission = (np.eye(360) * self_emission + np.ones(shape=(360, 360)) * ((1 - self_emission) / 360))
+         model = CategoricalHMM(n_components=360)
+         model.startprob_, model.transmat_, model.emissionprob_ = starting, transition, emission
+         observations = np.argmax(salience, axis=1)
+         path = model.predict(observations.reshape(-1, 1), [len(observations)])
+         return np.array([self.to_local_average_cents(salience[i, :], path[i]) for i in range(len(observations))])
+
+     def get_activation(self, audio: np.ndarray, center, step_size, progress_callback, batch_size):
+         if center:
+             audio = np.pad(audio, 512, mode='constant', constant_values=0)
+         hop_length = int(16000 * step_size / 1000)
+         n_frames = 1 + int((len(audio) - 1024) / hop_length)
+         frames = as_strided(audio, shape=(1024, n_frames), strides=(audio.itemsize, hop_length * audio.itemsize))
+         frames = frames.transpose().copy()
+         frames -= np.mean(frames, axis=1)[:, np.newaxis]
+         frames /= np.clip(np.std(frames, axis=1)[:, np.newaxis], 1e-8, None)
+         return self.model.predict(frames, batch_size=batch_size, verbose=0, callbacks=[PredictProgressCallback(math_ceil(len(frames) / batch_size), progress_callback)])
+
+     def model_predict(self, audio: np.ndarray, viterbi, center, step_size, progress_callback, batch_size):
+         activation = self.get_activation(audio.astype(np.float32), center, step_size, progress_callback, batch_size)
+         confidence = activation.max(axis=1)
+         cents = self.to_viterbi_cents(activation) if viterbi else self.to_local_average_cents(activation)
+         frequency = 10 * 2 ** (cents / 1200)
+         frequency[np.isnan(frequency)] = 0
+         time = np.arange(confidence.shape[0]) * step_size / 1000.0
+         return time, frequency, confidence
+
+     def predict(self, audio_path, viterbi=False, center=True, step_size=10, min_confidence=0.8, batch_size=32, progress_callback: Callable[[int, int], None] = None, output_file="output.mid"):
+         time, frequency, confidence = self.model_predict(librosa_load(audio_path, sr=16000, mono=True)[0], viterbi, center, step_size, progress_callback, batch_size)
+         mask = confidence > min_confidence
+         times = time[mask]
+         frequencies = frequency[mask]
+         midi_floats = 69 + 12 * np.log2(frequencies / 440.0)
+         midi_notes = np.round(midi_floats).astype(int)
+         pitch_offsets = midi_floats - midi_notes  # in semitones
+         midi = PrettyMIDI()
+         instrument = Instrument(program=40)  # e.g., Violin for pitch bend demo
+         if len(times) > 0:
+             current_note = midi_notes[0]
+             note_start = times[0]
+             for i in range(1, len(times)):
+                 if midi_notes[i] != current_note or i == len(times) - 1:
+                     note_end = times[i]
+                     if 0 <= current_note <= 127:
+                         note = Note(velocity=100, pitch=int(current_note), start=note_start, end=note_end)
+                         instrument.notes.append(note)
+                         seg_mask = (times >= note_start) & (times <= note_end)
+                         seg_times = times[seg_mask]
+                         seg_offsets = pitch_offsets[seg_mask]
+                         for t, offset in zip(seg_times, seg_offsets):
+                             # Assuming pitch bend range is +/- 2 semitones
+                             bend_value = int(offset / 2.0 * 8192)  # Scale to -8192 to +8191
+                             bend_value = int(np.clip(bend_value, -8192, 8191))
+                             pb = PitchBend(pitch=bend_value, time=t)
+                             instrument.pitch_bends.append(pb)
+                     current_note = midi_notes[i]
+                     note_start = times[i]
+         midi.instruments.append(instrument)
+         midi.write(output_file)
+         return output_file
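
A minimal usage sketch for the Crepe transcriber above. The diff does not show the module paths inside the wheel, so the import below is illustrative; it assumes the class is exposed at the package top level and that a local audio file exists at the given path (the Keras weights are fetched from the Hugging Face Hub on first use):

from audio2midi import Crepe  # hypothetical import path; module layout is not shown in this diff

def on_progress(done, total):
    print(f"predicted {done}/{total} batches")

crepe = Crepe(model_type="full")  # downloads crepe_full.h5 via hf_hub_download
midi_path = crepe.predict("vocals.wav", viterbi=True, min_confidence=0.8, progress_callback=on_progress)
print(midi_path)  # output.mid, with per-frame pitch bends around each note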
@@ -0,0 +1,153 @@
+ from typing import Callable
+ import librosa
+ from pretty_midi_fix import Instrument, PrettyMIDI, Note
+ import numpy as np
+
+
+ class Normal_Pitch_Det:
+     def smooth_pitch_sequence(self, pitches, magnitudes, threshold):
+         midi_sequence = []
+         for i in range(pitches.shape[1]):
+             index = np.argmax(magnitudes[:, i])
+             pitch_mag = magnitudes[index, i]
+             pitch = pitches[index, i]
+             if pitch_mag < threshold or np.isnan(pitch) or pitch <= 0:
+                 midi_sequence.append(None)
+             else:
+                 midi_note = int(round(librosa.hz_to_midi(pitch)))
+                 midi_sequence.append(midi_note)
+         return midi_sequence
+
+     def clean_midi_sequence(self, sequence, min_note_length):
+         cleaned = []
+         current_note = None
+         count = 0
+         for note in sequence + [None]:
+             if note == current_note:
+                 count += 1
+             else:
+                 if current_note is not None and count >= min_note_length:
+                     cleaned.extend([current_note] * count)
+                 else:
+                     cleaned.extend([None] * count)
+                 current_note = note
+                 count = 1
+         return cleaned
+
+     def predict(self, input_file, tempo_bpm=120, hop_length=512, min_note_length=2, threshold=0.1, output_file="output.mid"):
+         wav, sr = librosa.load(input_file)
+         audio_duration = len(wav) / sr
+         pitches, magnitudes = librosa.piptrack(y=wav, sr=sr, hop_length=hop_length)
+         midi_sequence = self.clean_midi_sequence(self.smooth_pitch_sequence(pitches, magnitudes, threshold), min_note_length)
+         time_per_frame = audio_duration / len(midi_sequence)
+         pm = PrettyMIDI(initial_tempo=tempo_bpm)
+         instrument = Instrument(program=0)  # Acoustic Grand Piano
+         last_note = None
+         start_time = 0
+         for i, note in enumerate(midi_sequence):
+             current_time = i * time_per_frame
+             if note != last_note:
+                 if last_note is not None:
+                     end_time = current_time
+                     instrument.notes.append(Note(velocity=100, pitch=last_note, start=start_time, end=end_time))
+                 if note is not None:
+                     start_time = current_time
+                 last_note = note
+
+         if last_note is not None:
+             end_time = len(midi_sequence) * time_per_frame
+             instrument.notes.append(Note(velocity=100, pitch=last_note, start=start_time, end=end_time))
+
+         pm.instruments.append(instrument)
+         pm.write(output_file)
+         return output_file
+
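A minimal usage sketch for Normal_Pitch_Det, again assuming a hypothetical top-level import; it transcribes a monophonic recording by picking the strongest piptrack bin per frame:

from audio2midi import Normal_Pitch_Det  # hypothetical import path

detector = Normal_Pitch_Det()
# raise threshold / min_note_length to reject weak or very short detections
detector.predict("melody.wav", tempo_bpm=100, threshold=0.15, min_note_length=3, output_file="melody.mid")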
+
+
+ class Guitar_Pitch_Det:
+     def __init__(self):
+         # hop length, bin count and bins-per-octave are configured per call in predict()
+         self.FMIN = librosa.note_to_hz('C1')
+
+     def calc_cqt(self, audio, sr, mag_exp):
+         """Compute CQT and convert to dB."""
+         return librosa.amplitude_to_db(np.abs(librosa.cqt(audio, sr=sr, hop_length=self.HOP_LENGTH, fmin=self.FMIN, n_bins=self.N_BINS, bins_per_octave=self.BINS_PER_OCTAVE)) ** mag_exp, ref=np.max)
+
+     def cqt_thresholded(self, cqt_db, threshold_db):
+         """Threshold CQT in dB."""
+         cqt_copy = np.copy(cqt_db)
+         cqt_copy[cqt_copy < threshold_db] = -120
+         return cqt_copy
+
+     def calc_onset(self, cqt_db, sr, pre_post_max, backtrack):
+         """Detect onsets using the onset envelope from thresholded CQT."""
+         onset_env = librosa.onset.onset_strength(S=cqt_db, sr=sr, hop_length=self.HOP_LENGTH, aggregate=np.mean)
+         onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, hop_length=self.HOP_LENGTH, units='frames', backtrack=backtrack, pre_max=pre_post_max, post_max=pre_post_max)
+         boundaries = np.concatenate([[0], onset_frames, [cqt_db.shape[1]]])
+         return boundaries, onset_env
+
+     def estimate_segment_note(self, cqt_db, boundaries, i, sr, tempo_bpm, threshold_db, round_to_sixteenth):
+         """Estimate pitch for one onset segment and generate note."""
+         n0, n1 = int(boundaries[i]), int(boundaries[i + 1])
+         # Pass the CQT segment directly to estimate_pitch
+         segment_cqt = np.mean(cqt_db[:, n0:n1], axis=1)
+         f0_info = self.estimate_pitch(segment_cqt, sr, threshold_db)
+         return self.generate_note(cqt_db, tempo_bpm, f0_info, sr, n1 - n0, round_to_sixteenth)
+
+     def generate_note(self, cqt_db, tempo_bpm, f0_info, sr, n_duration, round_to_sixteenth):
+         """Generate MIDI pitch, duration in beats and velocity for a segment (pitch is None for a rest)."""
+         f0, amplitude = f0_info
+         duration_beats = librosa.frames_to_time(n_duration, sr=sr, hop_length=self.HOP_LENGTH) * (tempo_bpm / 60)
+         if round_to_sixteenth:
+             duration_beats = round(duration_beats * 16) / 16
+         # Remap amplitude based on the range of cqt_db values for MIDI velocity
+         return (None if f0 is None else int(np.round(librosa.hz_to_midi(f0)))), duration_beats, int(np.clip(self.remap(amplitude, cqt_db.min(), cqt_db.max(), 0, 127), 0, 127))
+
+     def estimate_pitch(self, segment_cqt, sr, threshold_db):
+         """Estimate pitch from CQT segment."""
+         # Find the frequency bin with the maximum energy in the segment
+         max_bin = np.argmax(segment_cqt)
+         # Convert the bin index to frequency (Hz)
+         pitch_hz = librosa.cqt_frequencies(n_bins=self.N_BINS, fmin=self.FMIN, bins_per_octave=self.BINS_PER_OCTAVE)[max_bin]
+         amplitude = segment_cqt[max_bin]  # Use the amplitude from the CQT bin
+
+         if pitch_hz is not None and amplitude > threshold_db:
+             return pitch_hz, amplitude
+         else:
+             return None, 0
+
+     def remap(self, x, in_min, in_max, out_min, out_max):
+         """Remap a value or array from one range to another."""
+         return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min
+
+     def predict(self, audio_path, mag_exp=4, threshold_db=-61, pre_post_max=6, backtrack=False, round_to_sixteenth=False, hop_length=1024, n_bins=72, bins_per_octave=12, output_file="output.mid"):
+         self.BINS_PER_OCTAVE = bins_per_octave
+         self.HOP_LENGTH = hop_length
+         self.N_BINS = n_bins
+         audio, sr = librosa.load(audio_path, sr=None)
+         cqt_db = self.calc_cqt(audio, sr, mag_exp)
+         cqt_thresh = self.cqt_thresholded(cqt_db, threshold_db)
+         boundaries, onset_env = self.calc_onset(cqt_thresh, sr, pre_post_max, backtrack)
+         tempo_bpm, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr, hop_length=self.HOP_LENGTH)
+         tempo_bpm = round(float(np.atleast_1d(tempo_bpm)[0]))  # beat_track may return the tempo as a scalar or a 1-element array
+         # Process all segments
+         notes_data = [self.estimate_segment_note(cqt_db, boundaries, i, sr, tempo_bpm, threshold_db, round_to_sixteenth) for i in range(len(boundaries) - 1)]
+         pm = PrettyMIDI(initial_tempo=tempo_bpm)
+         instrument = Instrument(program=40)  # program 40 = Violin
+         note_time = 0.0
+         for (pitch, duration, velocity) in notes_data:
+             # Convert duration in beats to duration in seconds for PrettyMIDI
+             duration_sec = duration * (60 / tempo_bpm)
+             if pitch is not None:
+                 instrument.notes.append(Note(velocity, pitch, note_time, note_time + duration_sec))
+             # Advance time whether the segment is a note or a rest
+             note_time += duration_sec
+         pm.instruments.append(instrument)
+         pm.write(output_file)
+         return output_file
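
A minimal usage sketch for Guitar_Pitch_Det, same hypothetical import; it segments the signal at detected onsets and assigns one pitch per segment from the dominant CQT bin:

from audio2midi import Guitar_Pitch_Det  # hypothetical import path

detector = Guitar_Pitch_Det()
# threshold_db prunes quiet CQT bins; round_to_sixteenth quantizes note lengths to sixteenth-note beats
detector.predict("guitar.wav", threshold_db=-61, backtrack=False, round_to_sixteenth=True, output_file="guitar.mid")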
@@ -0,0 +1,58 @@
+ import numpy as np
+ from scipy.signal import medfilt
+ from pretty_midi_fix import PrettyMIDI, Instrument, Note
+ from librosa import load as librosa_load
+ from vamp import collect as vamp_collect
+
+ class Melodia:
+     def midi_to_notes(self, midi, fs, smooth, minduration, hop=128):
+         if smooth > 0:
+             filter_duration = smooth
+             filter_size = int(filter_duration * fs / float(hop))
+             if filter_size % 2 == 0:
+                 filter_size += 1
+             midi_filt = medfilt(midi, filter_size)
+         else:
+             midi_filt = midi
+         notes = []
+         p_prev = 0
+         duration = 0
+         onset = 0
+         for n, p in enumerate(midi_filt):
+             if p == p_prev:
+                 duration += 1
+             else:
+                 if p_prev > 0:
+                     duration_sec = duration * hop / float(fs)
+                     if duration_sec >= minduration:
+                         onset_sec = onset * hop / float(fs)
+                         notes.append((onset_sec, duration_sec, p_prev))
+                 onset = n
+                 duration = 1
+                 p_prev = p
+         if p_prev > 0:
+             duration_sec = duration * hop / float(fs)
+             onset_sec = onset * hop / float(fs)
+             notes.append((onset_sec, duration_sec, p_prev))
+         return notes
+
+     def hz2midi(self, hz: np.ndarray):
+         hz_nonneg = hz.copy()
+         idx = hz_nonneg <= 0
+         hz_nonneg[idx] = 1
+         midi = 69 + 12 * np.log2(hz_nonneg / 440.)
+         midi[idx] = 0
+         midi = np.round(midi)
+         return midi
+
+     def predict(self, audio, tempo=120, smooth=0.25, minduration=0.1, hop=128, output_file="output.mid"):
+         data, sr = librosa_load(audio, sr=44100, mono=True)
+         pm = PrettyMIDI(initial_tempo=tempo)
+         instrument = Instrument(program=40)
+         # midi_to_notes returns onsets/durations in seconds, which is what PrettyMIDI expects
+         for onset_sec, duration_sec, pitch in self.midi_to_notes(self.hz2midi(np.insert(vamp_collect(data, sr, "mtg-melodia:melodia", parameters={"voicing": 0.2})['vector'][1], 0, [0] * 8)), sr, smooth, minduration, hop):
+             instrument.notes.append(Note(100, int(pitch), onset_sec, onset_sec + duration_sec))
+         pm.instruments.append(instrument)
+         pm.write(output_file)
+         return output_file
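
A minimal usage sketch for the Melodia wrapper, same hypothetical import; it additionally assumes the MTG Melodia Vamp plugin is installed where the vamp module can find it (e.g. on VAMP_PATH):

from audio2midi import Melodia  # hypothetical import path

melodia = Melodia()
# smooth is the median-filter window in seconds; minduration drops very short notes
melodia.predict("song.wav", tempo=120, smooth=0.25, minduration=0.1, output_file="song.mid")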