mortm 4.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. mortm-4.5/PKG-INFO +254 -0
  2. mortm-4.5/README.md +241 -0
  3. mortm-4.5/mortm/__init__.py +0 -0
  4. mortm-4.5/mortm/constants.py +31 -0
  5. mortm-4.5/mortm/models/__init__.py +0 -0
  6. mortm-4.5/mortm/models/bertm.py +294 -0
  7. mortm-4.5/mortm/models/modules/PositionalEncoding.py +27 -0
  8. mortm-4.5/mortm/models/modules/__init__.py +0 -0
  9. mortm-4.5/mortm/models/modules/attention.py +300 -0
  10. mortm-4.5/mortm/models/modules/audio_patch.py +44 -0
  11. mortm-4.5/mortm/models/modules/config.py +77 -0
  12. mortm-4.5/mortm/models/modules/layers.py +471 -0
  13. mortm-4.5/mortm/models/modules/progress.py +52 -0
  14. mortm-4.5/mortm/models/mortm.py +338 -0
  15. mortm-4.5/mortm/models/mortm_live.py +26 -0
  16. mortm-4.5/mortm/models/v_mortm.py +65 -0
  17. mortm-4.5/mortm/train/__init__.py +0 -0
  18. mortm-4.5/mortm/train/config.py +55 -0
  19. mortm-4.5/mortm/train/custom_token.py +603 -0
  20. mortm-4.5/mortm/train/datasets.py +321 -0
  21. mortm-4.5/mortm/train/epoch.py +20 -0
  22. mortm-4.5/mortm/train/noam.py +7 -0
  23. mortm-4.5/mortm/train/rl/__init__.py +0 -0
  24. mortm-4.5/mortm/train/rl/reinforcement.py +207 -0
  25. mortm-4.5/mortm/train/tokenizer.py +204 -0
  26. mortm-4.5/mortm/train/train.py +686 -0
  27. mortm-4.5/mortm/train/utils/__init__.py +0 -0
  28. mortm-4.5/mortm/train/utils/chord_midi.py +47 -0
  29. mortm-4.5/mortm/train/utils/loss.py +135 -0
  30. mortm-4.5/mortm/utils/__init__.py +0 -0
  31. mortm-4.5/mortm/utils/convert.py +1220 -0
  32. mortm-4.5/mortm/utils/de_convert.py +40 -0
  33. mortm-4.5/mortm/utils/eval.py +155 -0
  34. mortm-4.5/mortm/utils/generate.py +149 -0
  35. mortm-4.5/mortm/utils/gmail_messanger.py +66 -0
  36. mortm-4.5/mortm/utils/key.py +354 -0
  37. mortm-4.5/mortm/utils/messager.py +21 -0
  38. mortm-4.5/mortm/utils/pianoroll_convert.py +182 -0
  39. mortm-4.5/mortm/utils/tag.py +97 -0
  40. mortm-4.5/mortm.egg-info/PKG-INFO +254 -0
  41. mortm-4.5/mortm.egg-info/SOURCES.txt +43 -0
  42. mortm-4.5/mortm.egg-info/dependency_links.txt +1 -0
  43. mortm-4.5/mortm.egg-info/top_level.txt +1 -0
  44. mortm-4.5/setup.cfg +4 -0
  45. mortm-4.5/setup.py +21 -0
mortm-4.5/PKG-INFO ADDED
@@ -0,0 +1,254 @@
1
+ Metadata-Version: 2.1
2
+ Name: mortm
3
+ Version: 4.5
4
+ Summary: 旋律生成、コード推定、マルチタスクな音楽生成を行うライブラリ
5
+ Home-page: https://github.com/Ayato964
6
+ Author: Nagoshi Takaaki
7
+ Author-email: nagoshi@kthrlab.jp
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.0
12
+ Description-Content-Type: text/markdown
13
+
14
+ Welcome to the MORTM Library!
15
+
16
+ This document is a beginner-friendly guide to MORTM (Metric-Oriented Rhythmic Transformer for Music Generation). It gives an overview of MORTM, its main features, installation instructions, and basic usage. MORTM is a Transformer-based melody generation model that focuses on the metric structure of music.
17
+
18
+ ---
19
+
20
+ ### 🎵 MORTM: Metric-Oriented Rhythmic Transformer for Music Generation
21
+
22
+ MORTM (Metric-Oriented Rhythmic Transformer for Music Generation) is a Transformer-based melody generation model that focuses on the **metric structure** of music. It generates musical sequences autoregressively, one bar at a time, while preserving rhythmic consistency. MORTM also includes V_MORTM for audio-based generation and BERTM for music classification tasks.
23
+
24
+ #### ✨ Key Features
25
+
26
+ * **Bar-level Autoregressive Generation**: Each bar is normalized to 96 ticks (or 64 ticks in some contexts) and generated one bar at a time. It sequentially predicts one bar and uses it as the next input.
27
+ * **High-Quality Music Generation**: Utilizes a custom tokenizer to capture musical structure, including pitch, duration, relative timing, and bars, leading to coherent outputs.
28
+ * **Efficient Transformer Architecture**:
29
+ * **Decoder-Only (GPT-style)**: Optimized for autoregressive generation.
30
+ * **FlashAttention2 & ALiBi**: Offers memory-efficient, high-speed attention with excellent long-sequence generalization. FlashAttention2 resolves computational bottlenecks, allowing deeper models to be trained. ALiBi (Attention with Linear Biases) adds linear biases for relative positions to handle long-range dependencies and is compatible with FlashAttention2 as an alternative to Relative Positional Encoding (RPE).
31
+ * **Mixture of Experts (MoE)**: Employs sparsely activated Feed-Forward Network (FFN) layers, typically with Top-2 routing, to significantly increase model capacity while maintaining computational efficiency.
32
+ * **Structured Tokenization**: Uses tokens for Pitch, Duration, and Position, along with structural tokens like `<SME>` (End of Bar), `<TS>` (Track Start), and `<TE>` (Track End). Position tokens represent the start position within a bar (0-95 ticks).
33
+ * **Multimodal Support (V_MORTM)**: Can directly process audio features such as Mel spectrograms.
34
+ * **Classification (BERTM)**: Features a BERT-like encoder for music classification tasks.
35
+ * **Versatile Applications**: Applicable for melody generation, improvisation assistance, education, human-AI co-creation, and audio style transfer.
36
+
37
+ #### 🚀 Why MORTM?
38
+
39
+ * **State-of-the-Art**: Combines advanced techniques such as FlashAttention2, MoE, and ALiBi.
40
+ * **Musical Understanding**: Its custom tokenizer effectively captures core musical elements.
41
+ * **Scalability**: Supports diverse styles and long musical sequences.
42
+ * **Audio Domain**: V_MORTM enables richer audio-based generation.
43
+ * **Modular**: Facilitates easy prototyping and comparative experiments.
44
+
45
+ ---
46
+
47
+ ### 🛠️ Installation
48
+
49
+ To set up your environment for MORTM, follow these steps:
50
+
51
+ #### Prerequisites
52
+
53
+ * Python 3.8+
54
+ * NVIDIA GPU (for FlashAttention2)
55
+ * CUDA Toolkit (compatible with PyTorch)
56
+
57
+ #### 1. Install PyTorch
58
+
59
+ Follow the instructions at [pytorch.org](https://pytorch.org). For example:
60
+ ```bash
61
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
62
+ ```
63
+
64
+ #### 2. Install FlashAttention2
65
+
66
+ ```bash
67
+ pip install flash-attn --no-build-isolation
68
+ ```
69
+
70
+ #### 3. Install Other Dependencies
71
+
72
+ ```bash
73
+ pip install numpy einops pretty_midi midi2audio soundfile torchaudio PyYAML
74
+ ```
75
+ **Note**: `midi2audio` requires FluidSynth and a soundfont (e.g., `.sf2` file).
76
+
77
+ #### 4. Optional: Gmail Notifications
78
+
79
+ If you wish to receive training progress updates via Gmail:
80
+ ```bash
81
+ pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib
82
+ ```
83
+ This requires OAuth2 setup (`client_secret.json`).
84
+
85
+ ---
86
+
87
+ ### ⚡ Quick Start
88
+
89
+ #### Data Preparation
90
+
91
+ Convert MIDI files into tokenized `.npz` format:
92
+
93
+ ```python
94
+ from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_TOKEN
95
+ from mortm.utils.convert import MIDI2Seq
96
+
97
+ # Initialize tokenizer
98
+ tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_TOKEN)) #
99
+ # Convert MIDI to sequence
100
+ converter = MIDI2Seq(tokenizer, "midi_dir", "your_midi.mid", program_list=[0], split_measure=12)  # program_list=[0] is a placeholder; check MIDI2Seq for the expected program numbers
101
+ converter.convert() #
102
+ # Save converted data
103
+ converter.save("output_npz_dir") #
104
+ # Save tokenizer vocabulary
105
+ tokenizer.save("vocab_output_dir") #
106
+ ```
107
+
108
+ #### Inference
109
+
110
+ ##### MORTM: Melody Generation
111
+
112
+ ```python
113
+ import torch
114
+ import numpy as np
115
+ from mortm.models.mortm import MORTM, MORTMArgs
116
+ from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
117
+ from mortm.utils.de_convert import ct_token_to_midi
118
+ from mortm.models.modules.progress import _DefaultLearningProgress
119
+
120
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
121
+ tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
122
+ args = MORTMArgs("configs/models/mortm/A.json") #
123
+ model = MORTM(progress=_DefaultLearningProgress(), args=args) #
124
+ model.load_state_dict(torch.load("trained_mortm.pth", map_location=DEVICE)) #
125
+ model.to(DEVICE).eval() #
126
+
127
+ seed_ids = torch.tensor([tokenizer.get("<MGEN>"), tokenizer.get("<TS>")], device=DEVICE) #
128
+ with torch.no_grad(): #
129
+ _, full_seq = model.top_p_sampling_measure_kv_cache(seed_ids, p=0.95, max_measure=8, temperature=0.7) #
130
+ ct_token_to_midi(tokenizer, full_seq, "generated_melody.mid", program=0, tempo=120) #
131
+ ```
132
+
133
+ ##### BERTM: Music Classification
134
+
135
+ ```python
136
+ import torch
137
+ import numpy as np
138
+ import torch.nn.functional as F
139
+ from mortm.models.bertm import BERTM, MORTMArgs as BERTMArgs
140
+ from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
141
+ from mortm.models.modules.progress import _DefaultLearningProgress
142
+
143
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
144
+ tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
145
+ args = BERTMArgs("configs/models/bertm/class_file.json") #
146
+ model = BERTM(progress=_DefaultLearningProgress(), args=args) #
147
+ model.load_state_dict(torch.load("trained_bertm.pth", map_location=DEVICE)) #
148
+ model.to(DEVICE).eval() #
149
+
150
+ input_npz = np.load("input_music.npz")['array1'] #
151
+ input_ids = torch.tensor(input_npz, dtype=torch.long, device=DEVICE).unsqueeze(0) #
152
+
153
+ with torch.no_grad(): #
154
+ logits = model(input_ids) #
155
+ probs = F.softmax(logits, dim=-1) #
156
+ pred = "Human" if probs.argmax() == 0 else "AI" #
157
+ print(f"Prediction: {pred}, Probabilities: {probs.squeeze().tolist()}") #
158
+ ```
159
+
160
+ #### Training
161
+
162
+ ##### Train MORTM
163
+ ```bash
164
+ python run_train.py --model_config configs/models/mortm/A.json \
165
+ --train_config configs/train/pre_training.json \
166
+ --root_directory path/to/npz_dataset \
167
+ --save_directory out/models_mortm \
168
+ --version MyMORTM_v1
169
+ ```
170
+
171
+ ##### Train V_MORTM
172
+ ```bash
173
+ python run_v_train.py --model_config configs/models/v_mortm/A.json \
174
+ --train_config configs/train/pre_training.json \
175
+ --root_directory path/to/wav_dataset \
176
+ --save_directory out/models_v_mortm \
177
+ --version MyV_MORTM_v1
178
+ ```
179
+
180
+ ##### Train BERTM
181
+ ```bash
182
+ python class_train.py --model_config configs/models/bertm/class_file.json \
183
+ --train_config configs/train/pre_training.json \
184
+ --human_dir path/to/human_npz \
185
+ --ai_dir path/to/ai_npz \
186
+ --save_directory out/models_bertm \
187
+ --version MyBERTM_v1
188
+ ```
189
+
190
+ ---
191
+
192
+ ### Token Format (Example)
193
+
194
+ MORTM represents musical events as structured tokens. For example:
195
+ `<MGEN> <TS> Pitch=64 Duration=8 Position=0 Pitch=66 Duration=8 Position=8 ... <TE> <SME>`
196
+
197
+ * `Pitch`: MIDI note number (e.g., 64 = E4)
198
+ * `Duration`: Length in ticks (8 ticks = eighth note)
199
+ * `Position`: Start position within the bar (0–95 ticks)
200
+ * `<SME>`: Special token indicating the end of a bar
201
+ * `<TS>` / `<TE>`: Track start/end tokens
202
+ * `<MGEN>`: Generation start token
203
+ * `<ESEQ>`: Sequence end token
204
+ * `<BLANK>`: Blank token
205
+ * `<CLS>`: Classification token
206
+ * `<Query_M>` / `</Query_M>`: Query melody start/end tokens
207
+ * `<Query_C>` / `</Query_C>`: Query chord start/end tokens
208
+
209
+ ---
210
+
211
+ ### Troubleshooting
212
+
213
+ * **`load_state_dict` errors**: Check configuration and `map_location`.
214
+ * **Inference errors**: Ensure correct tensor shapes and vocabulary are used.
215
+ * **CUDA OOM (Out of Memory)**: Reduce batch size or use a smaller model.
216
+ * **FlashAttention2 issues**: Verify CUDA and compiler compatibility.
217
+
218
+ ---
219
+
220
+ ### Model Variants
221
+
222
+ Model parameters are defined in JSON configuration files (e.g., `configs/models/...`). Key parameters and model types include:
223
+
224
+ | Parameter | Value | Description |
225
+ | :---------------- | :---- | :-------------------- |
226
+ | `d_model` | 512 | Embedding dimension |
227
+ | `num_heads` | 8 | Number of attention heads |
228
+ | `num_layers` | 12 | Number of decoder layers |
229
+ | `dim_feedforward` | 2048 | FFN dimension |
230
+ | `num_experts` | 16 | Number of MoE experts |
231
+ | `topk_experts` | 2 | Number of active experts per token |
232
+ | `vocab_size` | ... | Obtained from `vocab_list.json` |
233
+
234
+ Since MORTM 3.0, models are provided based on the number of experts.
235
+
236
+ | Model | Layers | Experts | Shared Experts | Embedding Dim | Heads |
237
+ | :------ | :----- | :------ | :------------- | :------------ | :---- |
238
+ | MORTM-C | 12 | 6 | 1 | 512 | 8 |
239
+ | MORTM-B | 12 | 12 | 1 | 512 | 8 |
240
+ | MORTM-A | 12 | 16 | 1 | 512 | 8 |
241
+ | MORTM-S | 12 | 24 | 1 | 512 | 8 |
242
+ | MORTM-SS| 12 | 64 | 1 | 512 | 8 |
243
+
244
+ ---
245
+
246
+ ### License
247
+
248
+ MIT License
249
+
250
+ ### Author
251
+
252
+ Takaaki Nagoshi
253
+ Graduate School of Integrated Basic Sciences, Nihon University
254
+ cs23033@g.nihon-u.ac.jp
mortm-4.5/README.md ADDED
@@ -0,0 +1,241 @@
1
+ Welcome to the MORTM Library!
2
+
3
+ This document is a beginner-friendly guide to MORTM (Metric-Oriented Rhythmic Transformer for Music Generation). It gives an overview of MORTM, its main features, installation instructions, and basic usage. MORTM is a Transformer-based melody generation model that focuses on the metric structure of music.
4
+
5
+ ---
6
+
7
+ ### 🎵 MORTM: Metric-Oriented Rhythmic Transformer for Music Generation
8
+
9
+ MORTM (Metric-Oriented Rhythmic Transformer for Music Generation) is a Transformer-based melody generation model that focuses on the **metric structure** of music. It generates musical sequences autoregressively, one bar at a time, while preserving rhythmic consistency. MORTM also includes V_MORTM for audio-based generation and BERTM for music classification tasks.
10
+
11
+ #### ✨ Key Features
12
+
13
+ * **Bar-level Autoregressive Generation**: Each bar is normalized to 96 ticks (or 64 ticks in some contexts) and generated one bar at a time. It sequentially predicts one bar and uses it as the next input.
14
+ * **High-Quality Music Generation**: Utilizes a custom tokenizer to capture musical structure, including pitch, duration, relative timing, and bars, leading to coherent outputs.
15
+ * **Efficient Transformer Architecture**:
16
+ * **Decoder-Only (GPT-style)**: Optimized for autoregressive generation.
17
+ * **FlashAttention2 & ALiBi**: Offers memory-efficient, high-speed attention with excellent long-sequence generalization. FlashAttention2 resolves computational bottlenecks, allowing deeper models to be trained. ALiBi (Attention with Linear Biases) adds linear biases for relative positions to handle long-range dependencies and is compatible with FlashAttention2 as an alternative to Relative Positional Encoding (RPE).
18
+ * **Mixture of Experts (MoE)**: Employs sparsely activated Feed-Forward Network (FFN) layers, typically with Top-2 routing, to significantly increase model capacity while maintaining computational efficiency.
19
+ * **Structured Tokenization**: Uses tokens for Pitch, Duration, and Position, along with structural tokens like `<SME>` (End of Bar), `<TS>` (Track Start), and `<TE>` (Track End). Position tokens represent the start position within a bar (0-95 ticks).
20
+ * **Multimodal Support (V_MORTM)**: Can directly process audio features such as Mel spectrograms.
21
+ * **Classification (BERTM)**: Features a BERT-like encoder for music classification tasks.
22
+ * **Versatile Applications**: Applicable for melody generation, improvisation assistance, education, human-AI co-creation, and audio style transfer.
23
+
24
+ #### 🚀 Why MORTM?
25
+
26
+ * **State-of-the-Art**: Combines advanced techniques such as FlashAttention2, MoE, and ALiBi.
27
+ * **Musical Understanding**: Its custom tokenizer effectively captures core musical elements.
28
+ * **Scalability**: Supports diverse styles and long musical sequences.
29
+ * **Audio Domain**: V_MORTM enables richer audio-based generation.
30
+ * **Modular**: Facilitates easy prototyping and comparative experiments.
31
+
32
+ ---
33
+
34
+ ### 🛠️ Installation
35
+
36
+ To set up your environment for MORTM, follow these steps:
37
+
38
+ #### Prerequisites
39
+
40
+ * Python 3.8+
41
+ * NVIDIA GPU (for FlashAttention2)
42
+ * CUDA Toolkit (compatible with PyTorch)
43
+
44
+ #### 1. Install PyTorch
45
+
46
+ Follow the instructions at [pytorch.org](https://pytorch.org). For example:
47
+ ```bash
48
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
49
+ ```
50
+
51
+ #### 2. Install FlashAttention2
52
+
53
+ ```bash
54
+ pip install flash-attn --no-build-isolation
55
+ ```
56
+
57
+ #### 3. Install Other Dependencies
58
+
59
+ ```bash
60
+ pip install numpy einops pretty_midi midi2audio soundfile torchaudio PyYAML
61
+ ```
62
+ **Note**: `midi2audio` requires FluidSynth and a soundfont (e.g., `.sf2` file).
63
+
64
+ #### 4. Optional: Gmail Notifications
65
+
66
+ If you wish to receive training progress updates via Gmail:
67
+ ```bash
68
+ pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib
69
+ ```
70
+ This requires OAuth2 setup (`client_secret.json`).
71
+
72
+ ---
73
+
74
+ ### ⚡ Quick Start
75
+
76
+ #### Data Preparation
77
+
78
+ Convert MIDI files into tokenized `.npz` format:
79
+
80
+ ```python
81
+ from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_TOKEN
82
+ from mortm.utils.convert import MIDI2Seq
83
+
84
+ # Initialize tokenizer
85
+ tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_TOKEN)) #
86
+ # Convert MIDI to sequence
87
+ converter = MIDI2Seq(tokenizer, "midi_dir", "your_midi.mid", program_list=[0], split_measure=12)  # program_list=[0] is a placeholder; check MIDI2Seq for the expected program numbers
88
+ converter.convert() #
89
+ # Save converted data
90
+ converter.save("output_npz_dir") #
91
+ # Save tokenizer vocabulary
92
+ tokenizer.save("vocab_output_dir") #
93
+ ```
94
+
95
+ #### Inference
96
+
97
+ ##### MORTM: Melody Generation
98
+
99
+ ```python
100
+ import torch
101
+ import numpy as np
102
+ from mortm.models.mortm import MORTM, MORTMArgs
103
+ from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
104
+ from mortm.utils.de_convert import ct_token_to_midi
105
+ from mortm.models.modules.progress import _DefaultLearningProgress
106
+
107
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
108
+ tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
109
+ args = MORTMArgs("configs/models/mortm/A.json") #
110
+ model = MORTM(progress=_DefaultLearningProgress(), args=args) #
111
+ model.load_state_dict(torch.load("trained_mortm.pth", map_location=DEVICE)) #
112
+ model.to(DEVICE).eval() #
113
+
114
+ seed_ids = torch.tensor([tokenizer.get("<MGEN>"), tokenizer.get("<TS>")], device=DEVICE) #
115
+ with torch.no_grad(): #
116
+ _, full_seq = model.top_p_sampling_measure_kv_cache(seed_ids, p=0.95, max_measure=8, temperature=0.7) #
117
+ ct_token_to_midi(tokenizer, full_seq, "generated_melody.mid", program=0, tempo=120) #
118
+ ```
119
+
120
+ ##### BERTM: Music Classification
121
+
122
+ ```python
123
+ import torch
124
+ import numpy as np
125
+ import torch.nn.functional as F
126
+ from mortm.models.bertm import BERTM, MORTMArgs as BERTMArgs
127
+ from mortm.train.tokenizer import Tokenizer, get_token_converter_pro, TO_MUSIC
128
+ from mortm.models.modules.progress import _DefaultLearningProgress
129
+
130
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #
131
+ tokenizer = Tokenizer(music_token=get_token_converter_pro(TO_MUSIC), load_data="vocab_list.json") #
132
+ args = BERTMArgs("configs/models/bertm/class_file.json") #
133
+ model = BERTM(progress=_DefaultLearningProgress(), args=args) #
134
+ model.load_state_dict(torch.load("trained_bertm.pth", map_location=DEVICE)) #
135
+ model.to(DEVICE).eval() #
136
+
137
+ input_npz = np.load("input_music.npz")['array1'] #
138
+ input_ids = torch.tensor(input_npz, dtype=torch.long, device=DEVICE).unsqueeze(0) #
139
+
140
+ with torch.no_grad(): #
141
+ logits = model(input_ids) #
142
+ probs = F.softmax(logits, dim=-1) #
143
+ pred = "Human" if probs.argmax() == 0 else "AI" #
144
+ print(f"Prediction: {pred}, Probabilities: {probs.squeeze().tolist()}") #
145
+ ```
146
+
147
+ #### Training
148
+
149
+ ##### Train MORTM
150
+ ```bash
151
+ python run_train.py --model_config configs/models/mortm/A.json \
152
+ --train_config configs/train/pre_training.json \
153
+ --root_directory path/to/npz_dataset \
154
+ --save_directory out/models_mortm \
155
+ --version MyMORTM_v1
156
+ ```
157
+
158
+ ##### Train V_MORTM
159
+ ```bash
160
+ python run_v_train.py --model_config configs/models/v_mortm/A.json \
161
+ --train_config configs/train/pre_training.json \
162
+ --root_directory path/to/wav_dataset \
163
+ --save_directory out/models_v_mortm \
164
+ --version MyV_MORTM_v1
165
+ ```
166
+
167
+ ##### Train BERTM
168
+ ```bash
169
+ python class_train.py --model_config configs/models/bertm/class_file.json \
170
+ --train_config configs/train/pre_training.json \
171
+ --human_dir path/to/human_npz \
172
+ --ai_dir path/to/ai_npz \
173
+ --save_directory out/models_bertm \
174
+ --version MyBERTM_v1
175
+ ```
176
+
177
+ ---
178
+
179
+ ### Token Format (Example)
180
+
181
+ MORTM represents musical events as structured tokens. For example:
182
+ `<MGEN> <TS> Pitch=64 Duration=8 Position=0 Pitch=66 Duration=8 Position=8 ... <TE> <SME>`
183
+
184
+ * `Pitch`: MIDI note number (e.g., 64 = E4)
185
+ * `Duration`: Length in ticks (8 ticks = eighth note)
186
+ * `Position`: Start position within the bar (0–95 ticks)
187
+ * `<SME>`: Special token indicating the end of a bar
188
+ * `<TS>` / `<TE>`: Track start/end tokens
189
+ * `<MGEN>`: Generation start token
190
+ * `<ESEQ>`: Sequence end token
191
+ * `<BLANK>`: Blank token
192
+ * `<CLS>`: Classification token
193
+ * `<Query_M>` / `</Query_M>`: Query melody start/end tokens
194
+ * `<Query_C>` / `</Query_C>`: Query chord start/end tokens
195
+
196
+ ---
197
+
198
+ ### Troubleshooting
199
+
200
+ * **`load_state_dict` errors**: Check configuration and `map_location`.
201
+ * **Inference errors**: Ensure correct tensor shapes and vocabulary are used.
202
+ * **CUDA OOM (Out of Memory)**: Reduce batch size or use a smaller model.
203
+ * **FlashAttention2 issues**: Verify CUDA and compiler compatibility.
204
+
205
+ ---
206
+
207
+ ### Model Variants
208
+
209
+ Model parameters are defined in JSON configuration files (e.g., `configs/models/...`). Key parameters and model types include:
210
+
211
+ | Parameter | Value | Description |
212
+ | :---------------- | :---- | :-------------------- |
213
+ | `d_model` | 512 | Embedding dimension |
214
+ | `num_heads` | 8 | Number of attention heads |
215
+ | `num_layers` | 12 | Number of decoder layers |
216
+ | `dim_feedforward` | 2048 | FFN dimension |
217
+ | `num_experts` | 16 | Number of MoE experts |
218
+ | `topk_experts` | 2 | Number of active experts per token |
219
+ | `vocab_size` | ... | Obtained from `vocab_list.json` |
220
+
221
+ Since MORTM 3.0, models are provided based on the number of experts.
222
+
223
+ | Model | Layers | Experts | Shared Experts | Embedding Dim | Heads |
224
+ | :------ | :----- | :------ | :------------- | :------------ | :---- |
225
+ | MORTM-C | 12 | 6 | 1 | 512 | 8 |
226
+ | MORTM-B | 12 | 12 | 1 | 512 | 8 |
227
+ | MORTM-A | 12 | 16 | 1 | 512 | 8 |
228
+ | MORTM-S | 12 | 24 | 1 | 512 | 8 |
229
+ | MORTM-SS| 12 | 64 | 1 | 512 | 8 |
230
+
231
+ ---
232
+
233
+ ### License
234
+
235
+ MIT License
236
+
237
+ ### Author
238
+
239
+ Takaaki Nagoshi
240
+ Graduate School of Integrated Basic Sciences, Nihon University
241
+ cs23033@g.nihon-u.ac.jp
File without changes
@@ -0,0 +1,31 @@
1
+ import torch
2
+
3
+ PITCH_MAX = 128
4
+ VELO = 128
5
+ LENGTH = 999
6
+ LENGTH_HALF = 999
7
+ BEGIN = 999
8
+ BEGIN_HALF = 999
9
+ ROOT = 99
10
+ START_SEQ_TOKEN = "<S_SEQ>"
11
+ END_SEQ_TOKEN = "<E_SEQ>"
12
+ PADDING_TOKEN = "<PAD>"
13
+
14
+ MODEL_NAME = "MORTM"
15
+
16
+ # 前回のトークンのID + 前回のトークンの使用個数
17
+ PADDING_BEGIN_ID = 0
18
+ SPECIAL_BEGIN_ID = PADDING_BEGIN_ID + 1
19
+ PITCH_BEGIN_ID = SPECIAL_BEGIN_ID + 2
20
+ VELOCITY_BEGIN_ID = PITCH_BEGIN_ID + 128
21
+ DURATION_BEGIN_ID = VELOCITY_BEGIN_ID + 128
22
+ START_BEGIN_ID = DURATION_BEGIN_ID + 100
23
+ SHIFT_BEGIN_ID = START_BEGIN_ID + 32
24
+
25
+
26
+ PITCH_GROUP = range(PITCH_BEGIN_ID, PITCH_BEGIN_ID + 128 + 1)
27
+ VELOCITY_GROUP = range(VELOCITY_BEGIN_ID, VELOCITY_BEGIN_ID + 128 + 1)
28
+ DURATION_GROUP = range(DURATION_BEGIN_ID, DURATION_BEGIN_ID + 100 + 1)
29
+ START_GROUP = range(START_BEGIN_ID, START_BEGIN_ID + 32 + 1)
30
+ SHIFT_GROUP = range(SHIFT_BEGIN_ID, SHIFT_BEGIN_ID + 4 + 1)
31
+
File without changes