phoonnx 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/version.py +1 -1
- {phoonnx-0.2.3.dist-info → phoonnx-0.2.4.dist-info}/METADATA +1 -1
- {phoonnx-0.2.3.dist-info → phoonnx-0.2.4.dist-info}/RECORD +7 -7
- phoonnx_train/preprocess.py +9 -6
- phoonnx_train/train.py +57 -9
- {phoonnx-0.2.3.dist-info → phoonnx-0.2.4.dist-info}/WHEEL +0 -0
- {phoonnx-0.2.3.dist-info → phoonnx-0.2.4.dist-info}/top_level.txt +0 -0
{phoonnx-0.2.3.dist-info → phoonnx-0.2.4.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ phoonnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoonnx/config.py,sha256=DKgsU03g8jrAuMcVqbu-w3MWPXOUihFtRnavg6WGQ1Y,19983
 phoonnx/phoneme_ids.py,sha256=FiNgZwV6naEsBh6XwFLh3_FyOgPiCsK9qo7S0v-CmI4,13667
 phoonnx/util.py,sha256=XSjFEoqSFcujFTHxednacgC9GrSYyF-Il5L6Utmxmu4,25909
-phoonnx/version.py,sha256=
+phoonnx/version.py,sha256=28DZfjsylGY3KkroBs-6sispnnXZDH3ZHCzHcKoK5L0,237
 phoonnx/voice.py,sha256=JXjmbrhJd4mmTiLgz4O_Pa5_rKGUC9xzuBfqxYDw3Mg,19420
 phoonnx/locale/ca/phonetic_spellings.txt,sha256=igv3t7jxLSRE5GHsdn57HOpxiWNcEmECPql6m02wbO0,47
 phoonnx/locale/en/phonetic_spellings.txt,sha256=xGQlWOABLzbttpQvopl9CU-NnwEJRqKx8iuylsdUoQA,27
@@ -63,8 +63,8 @@ phoonnx/thirdparty/tashkeel/input_id_map.json,sha256=cnpJqjx-k53AbzKyfC4GxMS771l
 phoonnx/thirdparty/tashkeel/model.onnx,sha256=UsQNQsoJT_n_B6CR0KHq_XuqXPI4jmCpzIm6zY5elV8,4788213
 phoonnx/thirdparty/tashkeel/target_id_map.json,sha256=baNAJL_UwP9U91mLt01aAEBRRNdGr-csFB_O6roh7TA,181
 phoonnx_train/export_onnx.py,sha256=CPfgNEm0hnXPSlgme0R9jr-6jZ5fKFpG5DZJFMkC-h4,12820
-phoonnx_train/preprocess.py,sha256=
-phoonnx_train/train.py,sha256=
+phoonnx_train/preprocess.py,sha256=T1YcM89bizARKMaqjDtr3JzidoNvAbm-3n3eClbzMsI,21880
+phoonnx_train/train.py,sha256=nsINvDQ3dYvBne5UWPgLZ0a4qZFdSsOKk8HzZHGTLY4,8757
 phoonnx_train/norm_audio/__init__.py,sha256=Al_YwqMnENXRWp0c79cDZqbdd7pFYARXKxCfBaedr1c,3030
 phoonnx_train/norm_audio/trim.py,sha256=_ZsE3SYhahQSdEdBLeSwyFJGcvEbt-5E_lnWwTT4tcY,1698
 phoonnx_train/norm_audio/vad.py,sha256=DXHfRD0qqFJ52FjPvrL5LlN6keJWuc9Nf6TNhxpwC_4,1600
@@ -83,7 +83,7 @@ phoonnx_train/vits/utils.py,sha256=exiyrtPHbnnGvcHWSbaH9-gR6srH5ZPHlKiqV2IHUrQ,4
 phoonnx_train/vits/wavfile.py,sha256=oQZiTIrdw0oLTbcVwKfGXye1WtKte6qK_52qVwiMvfc,26396
 phoonnx_train/vits/monotonic_align/__init__.py,sha256=5IdAOD1Z7UloMb6d_9NRFsXoNIjEQ3h9mvOSh_AtO3k,636
 phoonnx_train/vits/monotonic_align/setup.py,sha256=0K5iJJ2mKIklx6ncEfCQS34skm5hHPiz9vRlQEvevvY,266
-phoonnx-0.2.
-phoonnx-0.2.
-phoonnx-0.2.
-phoonnx-0.2.
+phoonnx-0.2.4.dist-info/METADATA,sha256=UcIFJkCXmN-YrP-_QXTIS63sJne3AnVhr_qrrZxmr68,8248
+phoonnx-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+phoonnx-0.2.4.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
+phoonnx-0.2.4.dist-info/RECORD,,
phoonnx_train/preprocess.py
CHANGED
@@ -32,7 +32,7 @@ DEFAULT_SPECIAL_PHONEME_ID_MAP: Dict[str, int] = {
     DEFAULT_EOS_TOKEN: 2,
     DEFAULT_BLANK_WORD_TOKEN: 3,
 }
-
+MAX_PHONEMES = 256
 # -----------------------------------------------------------------------------
 
 @dataclass
@@ -517,12 +517,15 @@ def cli(
 
     if prev_config:
         with open(prev_config) as f:
-
+            cfg = json.load(f)
+        prev_phoneme_id_map = cfg["phoneme_id_map"]
+        prev_num_symbols = cfg.get("num_symbols", MAX_PHONEMES)
         _LOGGER.info(f"Loaded phoneme map from previous config: '{prev_config}'")
         all_phonemes.update(prev_phoneme_id_map.keys())
         final_phoneme_id_map = prev_phoneme_id_map
-        _LOGGER.info("previous phoneme map contains %d
+        _LOGGER.info("previous phoneme map contains %d phonemes.", len(final_phoneme_id_map))
     else:
+        prev_num_symbols = MAX_PHONEMES
         final_phoneme_id_map: Dict[str, int] = DEFAULT_SPECIAL_PHONEME_ID_MAP.copy()
     if phonemizer.alphabet == Alphabet.IPA:
         all_phonemes.update(DEFAULT_IPA_PHONEME_ID_MAP.keys())
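Note on the hunk above: when fine-tuning, the vocabulary size is carried over from the previous config (prev_num_symbols, falling back to MAX_PHONEMES = 256 for older configs that lack the field) rather than derived from the phonemes found in the new dataset. A minimal PyTorch sketch of why the size must be preserved (illustrative values, not the phoonnx API):

import torch

# The text encoder's phoneme embedding is allocated as (num_symbols, hidden).
emb_base = torch.nn.Embedding(256, 192)   # sized like the base checkpoint
emb_small = torch.nn.Embedding(190, 192)  # sized from the new dataset only

try:
    emb_small.load_state_dict(emb_base.state_dict())
except RuntimeError as err:
    # Loading fails with a size mismatch, so fine-tuning must reuse the
    # checkpoint's num_symbols (or discard the encoder weights entirely).
    print(err)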
@@ -533,7 +536,7 @@ def cli(
          if p not in existing_keys]
     )
 
-    _LOGGER.info("Collected %d new
+    _LOGGER.info("Collected %d new phonemes.", len(new_phonemes))
 
     finetune_error = prev_config and len(new_phonemes)
     if finetune_error:
@@ -553,7 +556,7 @@ def cli(
             _LOGGER.debug(f"New phoneme: {pho}")
 
     if new_phonemes:
-        _LOGGER.info("Final phoneme map contains %d
+        _LOGGER.info("Final phoneme map contains %d phonemes.", len(final_phoneme_id_map))
 
     # --- Write the final config.json ---
     _LOGGER.info("Writing dataset config...")
@@ -575,7 +578,7 @@ def cli(
         "phoneme_type": config.phoneme_type.value,
         "phonemizer_model": config.phonemizer_model,
         "phoneme_id_map": final_phoneme_id_map,
-        "num_symbols": len(final_phoneme_id_map),
+        "num_symbols": prev_num_symbols if prev_config else len(final_phoneme_id_map),
         "num_speakers": len(speaker_counts) if is_multispeaker else 1,
         "speaker_id_map": speaker_ids,
         "phoonnx_version": VERSION_STR,
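Together with prev_num_symbols above, the num_symbols written to config.json is now mode-dependent. A small sketch restating that decision (names mirror the diff, values are illustrative):

MAX_PHONEMES = 256

def pick_num_symbols(prev_config: dict | None, final_phoneme_id_map: dict) -> int:
    if prev_config:
        # Fine-tuning: keep the base model's vocabulary size; older configs
        # without the field fall back to MAX_PHONEMES.
        return prev_config.get("num_symbols", MAX_PHONEMES)
    # Fresh training: the vocabulary is exactly the collected phoneme map.
    return len(final_phoneme_id_map)

assert pick_num_symbols({"num_symbols": 256}, {"_": 0}) == 256
assert pick_num_symbols(None, {"_": 0, "^": 1, "$": 2}) == 3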
phoonnx_train/train.py
CHANGED
@@ -44,6 +44,7 @@ def load_state_dict(model, saved_state_dict):
 @click.option('--batch-size', type=int, default=16, help='Training batch size (default: 16)')
 @click.option('--num-workers', type=click.IntRange(min=1), default=os.cpu_count() or 1, help='Number of data loader workers (default: CPU count)')
 @click.option('--validation-split', type=float, default=0.05, help='Proportion of data used for validation (default: 0.05)')
+@click.option('--discard-encoder', type=bool, default=False, help='Discard the encoder weights from base checkpoint (default: False)')
 def main(
     dataset_dir,
     checkpoint_epochs,
@@ -60,6 +61,7 @@ def main(
     batch_size,
     num_workers,
     validation_split,
+    discard_encoder
 ):
     logging.basicConfig(level=logging.DEBUG)
 
@@ -73,22 +75,18 @@ def main(
     config_path = dataset_dir / 'config.json'
     dataset_path = dataset_dir / 'dataset.jsonl'
 
-
-
+    _LOGGER.info(f"config_path: '{config_path}'")
+    _LOGGER.info(f"dataset_path: '{dataset_path}'")
 
     with open(config_path, 'r', encoding='utf-8') as config_file:
         config = json.load(config_file)
-        num_symbols = int(config['num_symbols'])
-        num_speakers = int(config['num_speakers'])
-        sample_rate = int(config['audio']['sample_rate'])
 
     trainer = Trainer(
         max_epochs=max_epochs,
         devices=devices,
         accelerator=accelerator,
         default_root_dir=default_root_dir,
-        precision=precision
-        resume_from_checkpoint=resume_from_checkpoint
+        precision=precision
     )
 
     if checkpoint_epochs is not None:
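Side note on the Trainer change: resume_from_checkpoint was a PyTorch Lightning 1.x Trainer argument and no longer exists in Lightning 2.x, where a full resume is requested at fit time instead; this script now treats --resume-from-checkpoint purely as a source of initial weights, loaded manually further down. A hedged sketch of the 2.x-style alternative (argument values are illustrative):

from pytorch_lightning import Trainer

trainer = Trainer(max_epochs=10000, precision=32)
# Full training-state resume (optimizer, schedulers, epoch counter) in
# Lightning 2.x happens at fit time, not in the Trainer constructor:
# trainer.fit(model, ckpt_path="path/to/base.ckpt")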
@@ -119,7 +117,32 @@ def main(
         'upsample_kernel_sizes': (16, 16, 4, 4),
     })
 
-
+    num_symbols = int(config['num_symbols'])
+    num_speakers = int(config['num_speakers'])
+    sample_rate = int(config['audio']['sample_rate'])
+    _LOGGER.debug(f"Config params: num_symbols={num_symbols} num_speakers={num_speakers} sample_rate={sample_rate}")
+
+    if resume_from_checkpoint:
+        # TODO (?) - add a flag to use params from config vs from checkpoint in case of mismatch
+        ckpt = VitsModel.load_from_checkpoint(resume_from_checkpoint, dataset=None)
+        _LOGGER.debug(f"Checkpoint params: num_symbols={ckpt.model_g.n_vocab} num_speakers={ckpt.model_g.n_speakers} sample_rate={ckpt.hparams.sample_rate}")
+        if ckpt.model_g.n_vocab != num_symbols:
+            _LOGGER.warning(f"Checkpoint num_symbols={ckpt.model_g.n_vocab} does not match config num_symbols={num_symbols}")
+            #-------------
+            # commented out this code because this is not supposed to happen if you used the preprocess.py script
+            # uncomment if you want to use the encoder from checkpoint + update num_symbols in the .json file manually
+            #-------------
+            #if ckpt.model_g.n_vocab > num_symbols and not discard_encoder:
+            #    num_symbols = ckpt.model_g.n_vocab
+            #    _LOGGER.info(f"Training with num_symbols={num_symbols}")
+            ###############
+        if ckpt.model_g.n_speakers != num_speakers:
+            _LOGGER.warning(f"Checkpoint num_speakers={ckpt.model_g.n_speakers} does not match config num_speakers={num_speakers}")
+            #num_speakers = ckpt.model_g.n_speakers
+        if ckpt.hparams.sample_rate != sample_rate:
+            _LOGGER.warning(f"Checkpoint sample_rate={ckpt.hparams.sample_rate} does not match config sample_rate={sample_rate}")
+            #sample_rate = ckpt.hparams.sample_rate
+
     model = VitsModel(
         num_symbols=num_symbols,
         num_speakers=num_speakers,
@@ -127,6 +150,31 @@ def main(
         dataset=[dataset_path],
         **dict_args,
     )
+    _LOGGER.info(f"VitsModel params: num_symbols={num_symbols} num_speakers={num_speakers} sample_rate={sample_rate}")
+
+    if resume_from_checkpoint:
+        saved_state_dict = ckpt.state_dict()
+
+        # Filter the state dictionary by removing the encoder weights
+        enc_key = 'model_g.enc_p.emb.weight'
+        if enc_key in saved_state_dict:
+            saved_shape = saved_state_dict[enc_key].shape
+            current_shape = model.state_dict()[enc_key].shape
+            if saved_shape[0] != current_shape[0]:
+                _LOGGER.warning(
+                    "Size mismatch detected for '%s': saved shape %s vs current shape %s. ",
+                    enc_key, saved_shape, current_shape
+                )
+                discard_encoder = True
+
+        if discard_encoder:
+            _LOGGER.warning(
+                "Skipping encoder weights from the checkpoint. (will be randomly initialized)"
+            )
+            saved_state_dict.pop(enc_key)
+
+        load_state_dict(model, saved_state_dict)
+        _LOGGER.info("Successfully loaded model weights.")
 
     if resume_from_single_speaker_checkpoint:
         assert num_speakers > 1, "--resume-from-single-speaker-checkpoint is only for multi-speaker models."
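The filter above inspects only model_g.enc_p.emb.weight, the phoneme-embedding table whose first dimension is the vocabulary size. A more generic version of the same idea (a sketch, not the phoonnx API) drops every checkpoint tensor whose shape no longer fits the freshly built model:

import torch

def filter_mismatched(model: torch.nn.Module, saved: dict) -> dict:
    # Keep only checkpoint tensors that still fit the new model; anything
    # dropped here stays randomly initialized, like the discarded encoder.
    current = model.state_dict()
    return {
        key: tensor
        for key, tensor in saved.items()
        if key in current and current[key].shape == tensor.shape
    }

# usage: model.load_state_dict(filter_mismatched(model, saved), strict=False)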
@@ -143,7 +191,7 @@ def main(
     load_state_dict(model.model_d, model_single.model_d.state_dict())
     _LOGGER.info('Successfully converted single-speaker checkpoint to multi-speaker')
 
-
+    _LOGGER.info('training started!!')
     trainer.fit(model)
 
 
File without changes
|
File without changes
|