mortm 4.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mortm-4.5/PKG-INFO +254 -0
- mortm-4.5/README.md +241 -0
- mortm-4.5/mortm/__init__.py +0 -0
- mortm-4.5/mortm/constants.py +31 -0
- mortm-4.5/mortm/models/__init__.py +0 -0
- mortm-4.5/mortm/models/bertm.py +294 -0
- mortm-4.5/mortm/models/modules/PositionalEncoding.py +27 -0
- mortm-4.5/mortm/models/modules/__init__.py +0 -0
- mortm-4.5/mortm/models/modules/attention.py +300 -0
- mortm-4.5/mortm/models/modules/audio_patch.py +44 -0
- mortm-4.5/mortm/models/modules/config.py +77 -0
- mortm-4.5/mortm/models/modules/layers.py +471 -0
- mortm-4.5/mortm/models/modules/progress.py +52 -0
- mortm-4.5/mortm/models/mortm.py +338 -0
- mortm-4.5/mortm/models/mortm_live.py +26 -0
- mortm-4.5/mortm/models/v_mortm.py +65 -0
- mortm-4.5/mortm/train/__init__.py +0 -0
- mortm-4.5/mortm/train/config.py +55 -0
- mortm-4.5/mortm/train/custom_token.py +603 -0
- mortm-4.5/mortm/train/datasets.py +321 -0
- mortm-4.5/mortm/train/epoch.py +20 -0
- mortm-4.5/mortm/train/noam.py +7 -0
- mortm-4.5/mortm/train/rl/__init__.py +0 -0
- mortm-4.5/mortm/train/rl/reinforcement.py +207 -0
- mortm-4.5/mortm/train/tokenizer.py +204 -0
- mortm-4.5/mortm/train/train.py +686 -0
- mortm-4.5/mortm/train/utils/__init__.py +0 -0
- mortm-4.5/mortm/train/utils/chord_midi.py +47 -0
- mortm-4.5/mortm/train/utils/loss.py +135 -0
- mortm-4.5/mortm/utils/__init__.py +0 -0
- mortm-4.5/mortm/utils/convert.py +1220 -0
- mortm-4.5/mortm/utils/de_convert.py +40 -0
- mortm-4.5/mortm/utils/eval.py +155 -0
- mortm-4.5/mortm/utils/generate.py +149 -0
- mortm-4.5/mortm/utils/gmail_messanger.py +66 -0
- mortm-4.5/mortm/utils/key.py +354 -0
- mortm-4.5/mortm/utils/messager.py +21 -0
- mortm-4.5/mortm/utils/pianoroll_convert.py +182 -0
- mortm-4.5/mortm/utils/tag.py +97 -0
- mortm-4.5/mortm.egg-info/PKG-INFO +254 -0
- mortm-4.5/mortm.egg-info/SOURCES.txt +43 -0
- mortm-4.5/mortm.egg-info/dependency_links.txt +1 -0
- mortm-4.5/mortm.egg-info/top_level.txt +1 -0
- mortm-4.5/setup.cfg +4 -0
- mortm-4.5/setup.py +21 -0
mortm-4.5/PKG-INFO
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mortm
|
|
3
|
+
Version: 4.5
|
|
4
|
+
Summary: 旋律生成、コード推定、マルチタスクな音楽生成を行うライブラリ
|
|
5
|
+
Home-page: https://github.com/Ayato964
|
|
6
|
+
Author: Nagoshi Takaaki
|
|
7
|
+
Author-email: nagoshi@kthrlab.jp
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
Welcome to the MORTM Library!
|
|
15
|
+
|
|
16
|
+
This guide introduces MORTM (Metric-Oriented Rhythmic Transformer for Music Generation), a Transformer-based melody generation model that focuses on the metric structure of music. It provides a beginner-friendly overview of MORTM's main features, installation instructions, and basic usage.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
### 🎵 MORTM: Metric-Oriented Rhythmic Transformer for Music Generation
|
|
21
|
+
|
|
22
|
+
MORTM (Metric-Oriented Rhythmic Transformer for Music Generation) is a Transformer-based melody generation model that focuses on the **metric structure** of music. It generates musical sequences autoregressively, one bar at a time, while preserving rhythmic consistency. MORTM also includes V_MORTM for audio-based generation and BERTM for music classification tasks.
|
|
23
|
+
|
|
24
|
+
#### ✨ Key Features
|
|
25
|
+
|
|
26
|
+
* **Bar-level Autoregressive Generation**: Each bar is normalized to 96 ticks (or 64 ticks in some contexts) and generated one bar at a time. It sequentially predicts one bar and uses it as the next input.
|
|
27
|
+
* **High-Quality Music Generation**: Utilizes a custom tokenizer to capture musical structure, including pitch, duration, relative timing, and bars, leading to coherent outputs.
|
|
28
|
+
* **Efficient Transformer Architecture**:
|
|
29
|
+
* **Decoder-Only (GPT-style)**: Optimized for autoregressive generation.
|
|
30
|
+
* **FlashAttention2 & ALiBi**: Offers memory-efficient, high-speed attention with excellent long-sequence generalization. FlashAttention2 resolves computational bottlenecks, allowing deeper models to be trained. ALiBi (Attention with Linear Biases) adds linear biases for relative positions to handle long-range dependencies and is compatible with FlashAttention2 as an alternative to Relative Positional Encoding (RPE).
|
|
31
|
+
* **Mixture of Experts (MoE)**: Employs sparsely activated Feed-Forward Network (FFN) layers, typically with Top-2 routing, to significantly increase model capacity while maintaining computational efficiency.
|
|
32
|
+
* **Structured Tokenization**: Uses tokens for Pitch, Duration, and Position, along with structural tokens like `<SME>` (End of Bar), `<TS>` (Track Start), and `<TE>` (Track End). Position tokens represent the start position within a bar (0-95 ticks).
|
|
33
|
+
* **Multimodal Support (V_MORTM)**: Can directly process audio features such as Mel spectrograms.
|
|
34
|
+
* **Classification (BERTM)**: Features a BERT-like encoder for music classification tasks.
|
|
35
|
+
* **Versatile Applications**: Applicable for melody generation, improvisation assistance, education, human-AI co-creation, and audio style transfer.
|
|
36
|
+
|
|
37
|
+
#### 🚀 Why MORTM?
|
|
38
|
+
|
|
39
|
+
* **State-of-the-Art**: Combines advanced techniques such as FlashAttention2, MoE, and ALiBi.
|
|
40
|
+
* **Musical Understanding**: Its custom tokenizer effectively captures core musical elements.
|
|
41
|
+
* **Scalability**: Supports diverse styles and long musical sequences.
|
|
42
|
+
* **Audio Domain**: V_MORTM enables richer audio-based generation.
|
|
43
|
+
* **Modular**: Facilitates easy prototyping and comparative experiments.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
### 🛠️ Installation
|
|
48
|
+
|
|
49
|
+
To set up your environment for MORTM, follow these steps:
|
|
50
|
+
|
|
51
|
+
#### Prerequisites
|
|
52
|
+
|
|
53
|
+
* Python 3.8+
|
|
54
|
+
* NVIDIA GPU (for FlashAttention2)
|
|
55
|
+
* CUDA Toolkit (compatible with PyTorch)
|
|
56
|
+
|
|
57
|
+
#### 1. Install PyTorch
|
|
58
|
+
|
|
59
|
+
Follow the instructions at [pytorch.org](https://pytorch.org). For example:
|
|
60
|
+
```bash
|
|
61
|
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
#### 2. Install FlashAttention2
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install flash-attn --no-build-isolation
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
#### 3. Install Other Dependencies
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install numpy einops pretty_midi midi2audio soundfile torchaudio PyYAML
|
|
74
|
+
```
|
|
75
|
+
**Note**: `midi2audio` requires FluidSynth and a soundfont (e.g., `.sf2` file).
|
|
76
|
+
|
|
77
|
+
#### 4. Optional: Gmail Notifications
|
|
78
|
+
|
|
79
|
+
If you wish to receive training progress updates via Gmail:
|
|
80
|
+
```bash
|
|
81
|
+
pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib
|
|
82
|
+
```
|
|
83
|
+
This requires OAuth2 setup (`client_secret.json`).
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
### ⚡ Quick Start
|
|
88
|
+
|
|
89
|
+
#### Data Preparation
|
|
90
|
+
|
|
91
|
+
Convert MIDI files into tokenized `.npz` format:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_TOKEN
|
|
95
|
+
from mortm.utils.convert import MIDI2Seq
|
|
96
|
+
|
|
97
|
+
# Initialize tokenizer
|
|
98
|
+
tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_TOKEN)) #
|
|
99
|
+
# Convert MIDI to sequence
|
|
100
|
+
converter = MIDI2Seq(tokenizer, "midi_dir", "your_midi.mid", program_list=[0], split_measure=12)  # [0] is a placeholder; pass the MIDI program numbers you need
|
|
101
|
+
converter.convert() #
|
|
102
|
+
# Save converted data
|
|
103
|
+
converter.save("output_npz_dir") #
|
|
104
|
+
# Save tokenizer vocabulary
|
|
105
|
+
tokenizer.save("vocab_output_dir") #
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
#### Inference
|
|
109
|
+
|
|
110
|
+
##### MORTM: Melody Generation
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
import torch
|
|
114
|
+
import numpy as np
|
|
115
|
+
from mortm.models.mortm import MORTM, MORTMArgs
|
|
116
|
+
from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
|
|
117
|
+
from mortm.utils.de_convert import ct_token_to_midi
|
|
118
|
+
from mortm.models.modules.progress import _DefaultLearningProgress
|
|
119
|
+
|
|
120
|
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
|
|
121
|
+
tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
|
|
122
|
+
args = MORTMArgs("configs/models/mortm/A.json") #
|
|
123
|
+
model = MORTM(progress=_DefaultLearningProgress(), args=args) #
|
|
124
|
+
model.load_state_dict(torch.load("trained_mortm.pth", map_location=DEVICE)) #
|
|
125
|
+
model.to(DEVICE).eval() #
|
|
126
|
+
|
|
127
|
+
seed_ids = torch.tensor([tokenizer.get("<MGEN>"), tokenizer.get("<TS>")], device=DEVICE) #
|
|
128
|
+
with torch.no_grad(): #
|
|
129
|
+
_, full_seq = model.top_p_sampling_measure_kv_cache(seed_ids, p=0.95, max_measure=8, temperature=0.7) #
|
|
130
|
+
ct_token_to_midi(tokenizer, full_seq, "generated_melody.mid", program=0, tempo=120) #
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
##### BERTM: Music Classification
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import torch
|
|
137
|
+
import numpy as np
|
|
138
|
+
import torch.nn.functional as F
|
|
139
|
+
from mortm.models.bertm import BERTM, MORTMArgs as BERTMArgs
|
|
140
|
+
from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
|
|
141
|
+
from mortm.models.modules.progress import _DefaultLearningProgress
|
|
142
|
+
|
|
143
|
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
|
|
144
|
+
tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
|
|
145
|
+
args = BERTMArgs("configs/models/bertm/class_file.json") #
|
|
146
|
+
model = BERTM(progress=_DefaultLearningProgress(), args=args) #
|
|
147
|
+
model.load_state_dict(torch.load("trained_bertm.pth", map_location=DEVICE)) #
|
|
148
|
+
model.to(DEVICE).eval() #
|
|
149
|
+
|
|
150
|
+
input_npz = np.load("input_music.npz")['array1'] #
|
|
151
|
+
input_ids = torch.tensor(input_npz, dtype=torch.long, device=DEVICE).unsqueeze(0) #
|
|
152
|
+
|
|
153
|
+
with torch.no_grad(): #
|
|
154
|
+
logits = model(input_ids) #
|
|
155
|
+
probs = F.softmax(logits, dim=-1) #
|
|
156
|
+
pred = "Human" if probs.argmax() == 0 else "AI" #
|
|
157
|
+
print(f"Prediction: {pred}, Probabilities: {probs.squeeze().tolist()}") #
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
#### Training
|
|
161
|
+
|
|
162
|
+
##### Train MORTM
|
|
163
|
+
```bash
|
|
164
|
+
python run_train.py --model_config configs/models/mortm/A.json \
|
|
165
|
+
--train_config configs/train/pre_training.json \
|
|
166
|
+
--root_directory path/to/npz_dataset \
|
|
167
|
+
--save_directory out/models_mortm \
|
|
168
|
+
--version MyMORTM_v1
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
##### Train V_MORTM
|
|
172
|
+
```bash
|
|
173
|
+
python run_v_train.py --model_config configs/models/v_mortm/A.json \
|
|
174
|
+
--train_config configs/train/pre_training.json \
|
|
175
|
+
--root_directory path/to/wav_dataset \
|
|
176
|
+
--save_directory out/models_v_mortm \
|
|
177
|
+
--version MyV_MORTM_v1
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
##### Train BERTM
|
|
181
|
+
```bash
|
|
182
|
+
python class_train.py --model_config configs/models/bertm/class_file.json \
|
|
183
|
+
--train_config configs/train/pre_training.json \
|
|
184
|
+
--human_dir path/to/human_npz \
|
|
185
|
+
--ai_dir path/to/ai_npz \
|
|
186
|
+
--save_directory out/models_bertm \
|
|
187
|
+
--version MyBERTM_v1
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
### Token Format (Example)
|
|
193
|
+
|
|
194
|
+
MORTM represents musical events as structured tokens. For example:
|
|
195
|
+
`<MGEN> <TS> Pitch=64 Duration=8 Position=0 Pitch=66 Duration=8 Position=8 ... <TE> <SME>`
|
|
196
|
+
|
|
197
|
+
* `Pitch`: MIDI note number (e.g., 64 = E4)
|
|
198
|
+
* `Duration`: Length in ticks (8 ticks = eighth note)
|
|
199
|
+
* `Position`: Start position within the bar (0–95 ticks)
|
|
200
|
+
* `<SME>`: Special token indicating the end of a bar
|
|
201
|
+
* `<TS>` / `<TE>`: Track start/end tokens
|
|
202
|
+
* `<MGEN>`: Generation start token
|
|
203
|
+
* `<ESEQ>`: Sequence end token
|
|
204
|
+
* `<BLANK>`: Blank token
|
|
205
|
+
* `<CLS>`: Classification token
|
|
206
|
+
* `<Query_M>` / `</Query_M>`: Query melody start/end tokens
|
|
207
|
+
* `<Query_C>` / `</Query_C>`: Query chord start/end tokens
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
### Troubleshooting
|
|
212
|
+
|
|
213
|
+
* **`load_state_dict` errors**: Check configuration and `map_location`.
|
|
214
|
+
* **Inference errors**: Ensure correct tensor shapes and vocabulary are used.
|
|
215
|
+
* **CUDA OOM (Out of Memory)**: Reduce batch size or use a smaller model.
|
|
216
|
+
* **FlashAttention2 issues**: Verify CUDA and compiler compatibility.
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
### Model Variants
|
|
221
|
+
|
|
222
|
+
Model parameters are defined in JSON configuration files (e.g., `configs/models/...`). Key parameters and model types include:
|
|
223
|
+
|
|
224
|
+
| Parameter | Value | Description |
|
|
225
|
+
| :---------------- | :---- | :-------------------- |
|
|
226
|
+
| `d_model` | 512 | Embedding dimension |
|
|
227
|
+
| `num_heads` | 8 | Number of attention heads |
|
|
228
|
+
| `num_layers` | 12 | Number of decoder layers |
|
|
229
|
+
| `dim_feedforward` | 2048 | FFN dimension |
|
|
230
|
+
| `num_experts` | 16 | Number of MoE experts |
|
|
231
|
+
| `topk_experts` | 2 | Number of active experts per token |
|
|
232
|
+
| `vocab_size` | ... | Obtained from `vocab_list.json` |
|
|
233
|
+
|
|
234
|
+
Since MORTM 3.0, models are provided based on the number of experts.
|
|
235
|
+
|
|
236
|
+
| Model | Layers | Experts | Shared Experts | Embedding Dim | Heads |
|
|
237
|
+
| :------ | :----- | :------ | :------------- | :------------ | :---- |
|
|
238
|
+
| MORTM-C | 12 | 6 | 1 | 512 | 8 |
|
|
239
|
+
| MORTM-B | 12 | 12 | 1 | 512 | 8 |
|
|
240
|
+
| MORTM-A | 12 | 16 | 1 | 512 | 8 |
|
|
241
|
+
| MORTM-S | 12 | 24 | 1 | 512 | 8 |
|
|
242
|
+
| MORTM-SS| 12 | 64 | 1 | 512 | 8 |
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
### License
|
|
247
|
+
|
|
248
|
+
MIT License
|
|
249
|
+
|
|
250
|
+
### Author
|
|
251
|
+
|
|
252
|
+
Takaaki Nagoshi
|
|
253
|
+
Graduate School of Integrated Basic Sciences, Nihon University
|
|
254
|
+
cs23033@g.nihon-u.ac.jp
|
mortm-4.5/README.md
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
Welcome to the MORTM Library!
|
|
2
|
+
|
|
3
|
+
This guide introduces MORTM (Metric-Oriented Rhythmic Transformer for Music Generation), a Transformer-based melody generation model that focuses on the metric structure of music. It provides a beginner-friendly overview of MORTM's main features, installation instructions, and basic usage.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
### 🎵 MORTM: Metric-Oriented Rhythmic Transformer for Music Generation
|
|
8
|
+
|
|
9
|
+
MORTM (Metric-Oriented Rhythmic Transformer for Music Generation) is a Transformer-based melody generation model that focuses on the **metric structure** of music. It generates musical sequences autoregressively, one bar at a time, while preserving rhythmic consistency. MORTM also includes V_MORTM for audio-based generation and BERTM for music classification tasks.
|
|
10
|
+
|
|
11
|
+
#### ✨ Key Features
|
|
12
|
+
|
|
13
|
+
* **Bar-level Autoregressive Generation**: Each bar is normalized to 96 ticks (or 64 ticks in some contexts) and generated one bar at a time. It sequentially predicts one bar and uses it as the next input.
|
|
14
|
+
* **High-Quality Music Generation**: Utilizes a custom tokenizer to capture musical structure, including pitch, duration, relative timing, and bars, leading to coherent outputs.
|
|
15
|
+
* **Efficient Transformer Architecture**:
|
|
16
|
+
* **Decoder-Only (GPT-style)**: Optimized for autoregressive generation.
|
|
17
|
+
* **FlashAttention2 & ALiBi**: Offers memory-efficient, high-speed attention with excellent long-sequence generalization. FlashAttention2 resolves computational bottlenecks, allowing deeper models to be trained. ALiBi (Attention with Linear Biases) adds linear biases for relative positions to handle long-range dependencies and is compatible with FlashAttention2 as an alternative to Relative Positional Encoding (RPE).
|
|
18
|
+
* **Mixture of Experts (MoE)**: Employs sparsely activated Feed-Forward Network (FFN) layers, typically with Top-2 routing, to significantly increase model capacity while maintaining computational efficiency.
|
|
19
|
+
* **Structured Tokenization**: Uses tokens for Pitch, Duration, and Position, along with structural tokens like `<SME>` (End of Bar), `<TS>` (Track Start), and `<TE>` (Track End). Position tokens represent the start position within a bar (0-95 ticks).
|
|
20
|
+
* **Multimodal Support (V_MORTM)**: Can directly process audio features such as Mel spectrograms.
|
|
21
|
+
* **Classification (BERTM)**: Features a BERT-like encoder for music classification tasks.
|
|
22
|
+
* **Versatile Applications**: Applicable for melody generation, improvisation assistance, education, human-AI co-creation, and audio style transfer.
|
|
23
|
+
|
|
24
|
+
#### 🚀 Why MORTM?
|
|
25
|
+
|
|
26
|
+
* **State-of-the-Art**: Combines advanced techniques such as FlashAttention2, MoE, and ALiBi.
|
|
27
|
+
* **Musical Understanding**: Its custom tokenizer effectively captures core musical elements.
|
|
28
|
+
* **Scalability**: Supports diverse styles and long musical sequences.
|
|
29
|
+
* **Audio Domain**: V_MORTM enables richer audio-based generation.
|
|
30
|
+
* **Modular**: Facilitates easy prototyping and comparative experiments.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
### 🛠️ Installation
|
|
35
|
+
|
|
36
|
+
To set up your environment for MORTM, follow these steps:
|
|
37
|
+
|
|
38
|
+
#### Prerequisites
|
|
39
|
+
|
|
40
|
+
* Python 3.8+
|
|
41
|
+
* NVIDIA GPU (for FlashAttention2)
|
|
42
|
+
* CUDA Toolkit (compatible with PyTorch)
|
|
43
|
+
|
|
44
|
+
#### 1. Install PyTorch
|
|
45
|
+
|
|
46
|
+
Follow the instructions at [pytorch.org](https://pytorch.org). For example:
|
|
47
|
+
```bash
|
|
48
|
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
#### 2. Install FlashAttention2
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install flash-attn --no-build-isolation
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
#### 3. Install Other Dependencies
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install numpy einops pretty_midi midi2audio soundfile torchaudio PyYAML
|
|
61
|
+
```
|
|
62
|
+
**Note**: `midi2audio` requires FluidSynth and a soundfont (e.g., `.sf2` file).
|
|
63
|
+
|
|
64
|
+
#### 4. Optional: Gmail Notifications
|
|
65
|
+
|
|
66
|
+
If you wish to receive training progress updates via Gmail:
|
|
67
|
+
```bash
|
|
68
|
+
pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib
|
|
69
|
+
```
|
|
70
|
+
This requires OAuth2 setup (`client_secret.json`).
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
### ⚡ Quick Start
|
|
75
|
+
|
|
76
|
+
#### Data Preparation
|
|
77
|
+
|
|
78
|
+
Convert MIDI files into tokenized `.npz` format:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_TOKEN
|
|
82
|
+
from mortm.utils.convert import MIDI2Seq
|
|
83
|
+
|
|
84
|
+
# Initialize tokenizer
|
|
85
|
+
tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_TOKEN)) #
|
|
86
|
+
# Convert MIDI to sequence
|
|
87
|
+
converter = MIDI2Seq(tokenizer, "midi_dir", "your_midi.mid", program_list=[0], split_measure=12)  # [0] is a placeholder; pass the MIDI program numbers you need
|
|
88
|
+
converter.convert() #
|
|
89
|
+
# Save converted data
|
|
90
|
+
converter.save("output_npz_dir") #
|
|
91
|
+
# Save tokenizer vocabulary
|
|
92
|
+
tokenizer.save("vocab_output_dir") #
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
#### Inference
|
|
96
|
+
|
|
97
|
+
##### MORTM: Melody Generation
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import torch
|
|
101
|
+
import numpy as np
|
|
102
|
+
from mortm.models.mortm import MORTM, MORTMArgs
|
|
103
|
+
from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
|
|
104
|
+
from mortm.utils.de_convert import ct_token_to_midi
|
|
105
|
+
from mortm.models.modules.progress import _DefaultLearningProgress
|
|
106
|
+
|
|
107
|
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
|
|
108
|
+
tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
|
|
109
|
+
args = MORTMArgs("configs/models/mortm/A.json") #
|
|
110
|
+
model = MORTM(progress=_DefaultLearningProgress(), args=args) #
|
|
111
|
+
model.load_state_dict(torch.load("trained_mortm.pth", map_location=DEVICE)) #
|
|
112
|
+
model.to(DEVICE).eval() #
|
|
113
|
+
|
|
114
|
+
seed_ids = torch.tensor([tokenizer.get("<MGEN>"), tokenizer.get("<TS>")], device=DEVICE) #
|
|
115
|
+
with torch.no_grad(): #
|
|
116
|
+
_, full_seq = model.top_p_sampling_measure_kv_cache(seed_ids, p=0.95, max_measure=8, temperature=0.7) #
|
|
117
|
+
ct_token_to_midi(tokenizer, full_seq, "generated_melody.mid", program=0, tempo=120) #
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
##### BERTM: Music Classification
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
import torch
|
|
124
|
+
import numpy as np
|
|
125
|
+
import torch.nn.functional as F
|
|
126
|
+
from mortm.models.bertm import BERTM, MORTMArgs as BERTMArgs
|
|
127
|
+
from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
|
|
128
|
+
from mortm.models.modules.progress import _DefaultLearningProgress
|
|
129
|
+
|
|
130
|
+
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
|
|
131
|
+
tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
|
|
132
|
+
args = BERTMArgs("configs/models/bertm/class_file.json") #
|
|
133
|
+
model = BERTM(progress=_DefaultLearningProgress(), args=args) #
|
|
134
|
+
model.load_state_dict(torch.load("trained_bertm.pth", map_location=DEVICE)) #
|
|
135
|
+
model.to(DEVICE).eval() #
|
|
136
|
+
|
|
137
|
+
input_npz = np.load("input_music.npz")['array1'] #
|
|
138
|
+
input_ids = torch.tensor(input_npz, dtype=torch.long, device=DEVICE).unsqueeze(0) #
|
|
139
|
+
|
|
140
|
+
with torch.no_grad(): #
|
|
141
|
+
logits = model(input_ids) #
|
|
142
|
+
probs = F.softmax(logits, dim=-1) #
|
|
143
|
+
pred = "Human" if probs.argmax() == 0 else "AI" #
|
|
144
|
+
print(f"Prediction: {pred}, Probabilities: {probs.squeeze().tolist()}") #
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
#### Training
|
|
148
|
+
|
|
149
|
+
##### Train MORTM
|
|
150
|
+
```bash
|
|
151
|
+
python run_train.py --model_config configs/models/mortm/A.json \
|
|
152
|
+
--train_config configs/train/pre_training.json \
|
|
153
|
+
--root_directory path/to/npz_dataset \
|
|
154
|
+
--save_directory out/models_mortm \
|
|
155
|
+
--version MyMORTM_v1
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
##### Train V_MORTM
|
|
159
|
+
```bash
|
|
160
|
+
python run_v_train.py --model_config configs/models/v_mortm/A.json \
|
|
161
|
+
--train_config configs/train/pre_training.json \
|
|
162
|
+
--root_directory path/to/wav_dataset \
|
|
163
|
+
--save_directory out/models_v_mortm \
|
|
164
|
+
--version MyV_MORTM_v1
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
##### Train BERTM
|
|
168
|
+
```bash
|
|
169
|
+
python class_train.py --model_config configs/models/bertm/class_file.json \
|
|
170
|
+
--train_config configs/train/pre_training.json \
|
|
171
|
+
--human_dir path/to/human_npz \
|
|
172
|
+
--ai_dir path/to/ai_npz \
|
|
173
|
+
--save_directory out/models_bertm \
|
|
174
|
+
--version MyBERTM_v1
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
### Token Format (Example)
|
|
180
|
+
|
|
181
|
+
MORTM represents musical events as structured tokens. For example:
|
|
182
|
+
`<MGEN> <TS> Pitch=64 Duration=8 Position=0 Pitch=66 Duration=8 Position=8 ... <TE> <SME>`
|
|
183
|
+
|
|
184
|
+
* `Pitch`: MIDI note number (e.g., 64 = E4)
|
|
185
|
+
* `Duration`: Length in ticks (8 ticks = eighth note)
|
|
186
|
+
* `Position`: Start position within the bar (0–95 ticks)
|
|
187
|
+
* `<SME>`: Special token indicating the end of a bar
|
|
188
|
+
* `<TS>` / `<TE>`: Track start/end tokens
|
|
189
|
+
* `<MGEN>`: Generation start token
|
|
190
|
+
* `<ESEQ>`: Sequence end token
|
|
191
|
+
* `<BLANK>`: Blank token
|
|
192
|
+
* `<CLS>`: Classification token
|
|
193
|
+
* `<Query_M>` / `</Query_M>`: Query melody start/end tokens
|
|
194
|
+
* `<Query_C>` / `</Query_C>`: Query chord start/end tokens
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
### Troubleshooting
|
|
199
|
+
|
|
200
|
+
* **`load_state_dict` errors**: Check configuration and `map_location`.
|
|
201
|
+
* **Inference errors**: Ensure correct tensor shapes and vocabulary are used.
|
|
202
|
+
* **CUDA OOM (Out of Memory)**: Reduce batch size or use a smaller model.
|
|
203
|
+
* **FlashAttention2 issues**: Verify CUDA and compiler compatibility.
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
### Model Variants
|
|
208
|
+
|
|
209
|
+
Model parameters are defined in JSON configuration files (e.g., `configs/models/...`). Key parameters and model types include:
|
|
210
|
+
|
|
211
|
+
| Parameter | Value | Description |
|
|
212
|
+
| :---------------- | :---- | :-------------------- |
|
|
213
|
+
| `d_model` | 512 | Embedding dimension |
|
|
214
|
+
| `num_heads` | 8 | Number of attention heads |
|
|
215
|
+
| `num_layers` | 12 | Number of decoder layers |
|
|
216
|
+
| `dim_feedforward` | 2048 | FFN dimension |
|
|
217
|
+
| `num_experts` | 16 | Number of MoE experts |
|
|
218
|
+
| `topk_experts` | 2 | Number of active experts per token |
|
|
219
|
+
| `vocab_size` | ... | Obtained from `vocab_list.json` |
|
|
220
|
+
|
|
221
|
+
Since MORTM 3.0, models are provided based on the number of experts.
|
|
222
|
+
|
|
223
|
+
| Model | Layers | Experts | Shared Experts | Embedding Dim | Heads |
|
|
224
|
+
| :------ | :----- | :------ | :------------- | :------------ | :---- |
|
|
225
|
+
| MORTM-C | 12 | 6 | 1 | 512 | 8 |
|
|
226
|
+
| MORTM-B | 12 | 12 | 1 | 512 | 8 |
|
|
227
|
+
| MORTM-A | 12 | 16 | 1 | 512 | 8 |
|
|
228
|
+
| MORTM-S | 12 | 24 | 1 | 512 | 8 |
|
|
229
|
+
| MORTM-SS| 12 | 64 | 1 | 512 | 8 |
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
### License
|
|
234
|
+
|
|
235
|
+
MIT License
|
|
236
|
+
|
|
237
|
+
### Author
|
|
238
|
+
|
|
239
|
+
Takaaki Nagoshi
|
|
240
|
+
Graduate School of Integrated Basic Sciences, Nihon University
|
|
241
|
+
cs23033@g.nihon-u.ac.jp
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
|
|
3
|
+
# Token-vocabulary layout constants.
#
# NOTE(review): the 999 / 99 values below look like placeholder limits; their
# exact meaning is not visible from this module -- confirm against callers.
PITCH_MAX = 128
VELO = 128
LENGTH = 999
LENGTH_HALF = 999
BEGIN = 999
BEGIN_HALF = 999
ROOT = 99
START_SEQ_TOKEN = "<S_SEQ>"
END_SEQ_TOKEN = "<E_SEQ>"
PADDING_TOKEN = "<PAD>"

MODEL_NAME = "MORTM"

# Each *_BEGIN_ID is the previous category's first ID plus the number of IDs
# that category occupies, so the categories are laid out back to back.
PADDING_BEGIN_ID = 0
SPECIAL_BEGIN_ID = PADDING_BEGIN_ID + 1
PITCH_BEGIN_ID = SPECIAL_BEGIN_ID + 2
VELOCITY_BEGIN_ID = PITCH_BEGIN_ID + 128
DURATION_BEGIN_ID = VELOCITY_BEGIN_ID + 128
START_BEGIN_ID = DURATION_BEGIN_ID + 100
SHIFT_BEGIN_ID = START_BEGIN_ID + 32


# ID range of each category. `range` already excludes its stop value, so each
# group ends exactly where the next category begins. The previous "+ 1" on the
# stop made every group one ID too long, so the first ID of the following
# category belonged to two groups at once.
PITCH_GROUP = range(PITCH_BEGIN_ID, VELOCITY_BEGIN_ID)
VELOCITY_GROUP = range(VELOCITY_BEGIN_ID, DURATION_BEGIN_ID)
DURATION_GROUP = range(DURATION_BEGIN_ID, START_BEGIN_ID)
START_GROUP = range(START_BEGIN_ID, SHIFT_BEGIN_ID)
# No category follows the shift tokens here, so their count (4) is spelled out.
SHIFT_GROUP = range(SHIFT_BEGIN_ID, SHIFT_BEGIN_ID + 4)
|
|
31
|
+
|
|
File without changes
|