phoonnx-0.2.3a1-py3-none-any.whl → phoonnx-0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
phoonnx/version.py CHANGED
@@ -1,8 +1,8 @@
  # START_VERSION_BLOCK
  VERSION_MAJOR = 0
  VERSION_MINOR = 2
- VERSION_BUILD = 3
- VERSION_ALPHA = 1
+ VERSION_BUILD = 4
+ VERSION_ALPHA = 0
  # END_VERSION_BLOCK
 
  VERSION_STR = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}"
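The hunk only changes the version constants. As a minimal sketch of how they resolve to the release strings seen in METADATA below, assuming the "aN" alpha suffix is appended only when VERSION_ALPHA is non-zero (that logic sits below this hunk and is not part of the diff):

VERSION_MAJOR, VERSION_MINOR, VERSION_BUILD, VERSION_ALPHA = 0, 2, 4, 0

VERSION_STR = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}"
if VERSION_ALPHA:  # assumed behaviour: suffix only for pre-releases
    VERSION_STR += f"a{VERSION_ALPHA}"

print(VERSION_STR)  # "0.2.4"; with the old values (BUILD=3, ALPHA=1) this read "0.2.3a1"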
phoonnx-0.2.3a1.dist-info/METADATA → phoonnx-0.2.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: phoonnx
- Version: 0.2.3a1
+ Version: 0.2.4
  Home-page: https://github.com/TigreGotico/phoonnx
  Author: JarbasAi
  Author-email: jarbasai@mailfence.com
phoonnx-0.2.3a1.dist-info/RECORD → phoonnx-0.2.4.dist-info/RECORD RENAMED
@@ -2,7 +2,7 @@ phoonnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  phoonnx/config.py,sha256=DKgsU03g8jrAuMcVqbu-w3MWPXOUihFtRnavg6WGQ1Y,19983
  phoonnx/phoneme_ids.py,sha256=FiNgZwV6naEsBh6XwFLh3_FyOgPiCsK9qo7S0v-CmI4,13667
  phoonnx/util.py,sha256=XSjFEoqSFcujFTHxednacgC9GrSYyF-Il5L6Utmxmu4,25909
- phoonnx/version.py,sha256=-0T-wzJguFutLdv6MOk0Uk2SYsPF0Wj49Ml3R_KcMzg,237
+ phoonnx/version.py,sha256=28DZfjsylGY3KkroBs-6sispnnXZDH3ZHCzHcKoK5L0,237
  phoonnx/voice.py,sha256=JXjmbrhJd4mmTiLgz4O_Pa5_rKGUC9xzuBfqxYDw3Mg,19420
  phoonnx/locale/ca/phonetic_spellings.txt,sha256=igv3t7jxLSRE5GHsdn57HOpxiWNcEmECPql6m02wbO0,47
  phoonnx/locale/en/phonetic_spellings.txt,sha256=xGQlWOABLzbttpQvopl9CU-NnwEJRqKx8iuylsdUoQA,27
@@ -63,8 +63,8 @@ phoonnx/thirdparty/tashkeel/input_id_map.json,sha256=cnpJqjx-k53AbzKyfC4GxMS771l
  phoonnx/thirdparty/tashkeel/model.onnx,sha256=UsQNQsoJT_n_B6CR0KHq_XuqXPI4jmCpzIm6zY5elV8,4788213
  phoonnx/thirdparty/tashkeel/target_id_map.json,sha256=baNAJL_UwP9U91mLt01aAEBRRNdGr-csFB_O6roh7TA,181
  phoonnx_train/export_onnx.py,sha256=CPfgNEm0hnXPSlgme0R9jr-6jZ5fKFpG5DZJFMkC-h4,12820
- phoonnx_train/preprocess.py,sha256=4FJFi7KL-ZUmrbN2NyhxBNpEjDlPRLSDJo2JoyvpR14,21700
- phoonnx_train/train.py,sha256=16HAb6Yu51xmbP3VM-tWUCsIhct1JHf56aQLG2UEaIc,6024
+ phoonnx_train/preprocess.py,sha256=T1YcM89bizARKMaqjDtr3JzidoNvAbm-3n3eClbzMsI,21880
+ phoonnx_train/train.py,sha256=nsINvDQ3dYvBne5UWPgLZ0a4qZFdSsOKk8HzZHGTLY4,8757
  phoonnx_train/norm_audio/__init__.py,sha256=Al_YwqMnENXRWp0c79cDZqbdd7pFYARXKxCfBaedr1c,3030
  phoonnx_train/norm_audio/trim.py,sha256=_ZsE3SYhahQSdEdBLeSwyFJGcvEbt-5E_lnWwTT4tcY,1698
  phoonnx_train/norm_audio/vad.py,sha256=DXHfRD0qqFJ52FjPvrL5LlN6keJWuc9Nf6TNhxpwC_4,1600
@@ -83,7 +83,7 @@ phoonnx_train/vits/utils.py,sha256=exiyrtPHbnnGvcHWSbaH9-gR6srH5ZPHlKiqV2IHUrQ,4
  phoonnx_train/vits/wavfile.py,sha256=oQZiTIrdw0oLTbcVwKfGXye1WtKte6qK_52qVwiMvfc,26396
  phoonnx_train/vits/monotonic_align/__init__.py,sha256=5IdAOD1Z7UloMb6d_9NRFsXoNIjEQ3h9mvOSh_AtO3k,636
  phoonnx_train/vits/monotonic_align/setup.py,sha256=0K5iJJ2mKIklx6ncEfCQS34skm5hHPiz9vRlQEvevvY,266
- phoonnx-0.2.3a1.dist-info/METADATA,sha256=bGpbhQiApKr5k7bhmFzbDZiOZTSl6wzvLzEbNG6mDWk,8250
- phoonnx-0.2.3a1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- phoonnx-0.2.3a1.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
- phoonnx-0.2.3a1.dist-info/RECORD,,
+ phoonnx-0.2.4.dist-info/METADATA,sha256=UcIFJkCXmN-YrP-_QXTIS63sJne3AnVhr_qrrZxmr68,8248
+ phoonnx-0.2.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ phoonnx-0.2.4.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
+ phoonnx-0.2.4.dist-info/RECORD,,
phoonnx_train/preprocess.py CHANGED
@@ -32,7 +32,7 @@ DEFAULT_SPECIAL_PHONEME_ID_MAP: Dict[str, int] = {
  DEFAULT_EOS_TOKEN: 2,
  DEFAULT_BLANK_WORD_TOKEN: 3,
  }
-
+ MAX_PHONEMES = 256
  # -----------------------------------------------------------------------------
 
  @dataclass
@@ -517,12 +517,15 @@ def cli(
 
  if prev_config:
  with open(prev_config) as f:
- prev_phoneme_id_map = json.load(f)["phoneme_id_map"]
+ cfg = json.load(f)
+ prev_phoneme_id_map = cfg["phoneme_id_map"]
+ prev_num_symbols = cfg.get("num_symbols", MAX_PHONEMES)
  _LOGGER.info(f"Loaded phoneme map from previous config: '{prev_config}'")
  all_phonemes.update(prev_phoneme_id_map.keys())
  final_phoneme_id_map = prev_phoneme_id_map
- _LOGGER.info("previous phoneme map contains %d symbols.", len(final_phoneme_id_map))
+ _LOGGER.info("previous phoneme map contains %d phonemes.", len(final_phoneme_id_map))
  else:
+ prev_num_symbols = MAX_PHONEMES
  final_phoneme_id_map: Dict[str, int] = DEFAULT_SPECIAL_PHONEME_ID_MAP.copy()
  if phonemizer.alphabet == Alphabet.IPA:
  all_phonemes.update(DEFAULT_IPA_PHONEME_ID_MAP.keys())
@@ -533,7 +536,7 @@ def cli(
  if p not in existing_keys]
  )
 
- _LOGGER.info("Collected %d new symbols.", len(new_phonemes))
+ _LOGGER.info("Collected %d new phonemes.", len(new_phonemes))
 
  finetune_error = prev_config and len(new_phonemes)
  if finetune_error:
@@ -553,7 +556,7 @@ def cli(
  _LOGGER.debug(f"New phoneme: {pho}")
 
  if new_phonemes:
- _LOGGER.info("Final phoneme map contains %d symbols.", len(final_phoneme_id_map))
+ _LOGGER.info("Final phoneme map contains %d phonemes.", len(final_phoneme_id_map))
 
  # --- Write the final config.json ---
  _LOGGER.info("Writing dataset config...")
@@ -575,7 +578,7 @@ def cli(
  "phoneme_type": config.phoneme_type.value,
  "phonemizer_model": config.phonemizer_model,
  "phoneme_id_map": final_phoneme_id_map,
- "num_symbols": len(final_phoneme_id_map),
+ "num_symbols": prev_num_symbols if prev_config else len(final_phoneme_id_map),
  "num_speakers": len(speaker_counts) if is_multispeaker else 1,
  "speaker_id_map": speaker_ids,
  "phoonnx_version": VERSION_STR,
phoonnx_train/train.py CHANGED
@@ -44,6 +44,7 @@ def load_state_dict(model, saved_state_dict):
  @click.option('--batch-size', type=int, default=16, help='Training batch size (default: 16)')
  @click.option('--num-workers', type=click.IntRange(min=1), default=os.cpu_count() or 1, help='Number of data loader workers (default: CPU count)')
  @click.option('--validation-split', type=float, default=0.05, help='Proportion of data used for validation (default: 0.05)')
+ @click.option('--discard-encoder', type=bool, default=False, help='Discard the encoder weights from base checkpoint (default: False)')
  def main(
  dataset_dir,
  checkpoint_epochs,
@@ -60,6 +61,7 @@ def main(
  batch_size,
  num_workers,
  validation_split,
+ discard_encoder
  ):
  logging.basicConfig(level=logging.DEBUG)
 
@@ -73,22 +75,18 @@ def main(
  config_path = dataset_dir / 'config.json'
  dataset_path = dataset_dir / 'dataset.jsonl'
 
- print(f"INFO - config_path: '{config_path}'")
- print(f"INFO - dataset_path: '{dataset_path}'")
+ _LOGGER.info(f"config_path: '{config_path}'")
+ _LOGGER.info(f"dataset_path: '{dataset_path}'")
 
  with open(config_path, 'r', encoding='utf-8') as config_file:
  config = json.load(config_file)
- num_symbols = int(config['num_symbols'])
- num_speakers = int(config['num_speakers'])
- sample_rate = int(config['audio']['sample_rate'])
 
  trainer = Trainer(
  max_epochs=max_epochs,
  devices=devices,
  accelerator=accelerator,
  default_root_dir=default_root_dir,
- precision=precision,
- resume_from_checkpoint=resume_from_checkpoint
+ precision=precision
  )
 
  if checkpoint_epochs is not None:
@@ -119,7 +117,32 @@ def main(
  'upsample_kernel_sizes': (16, 16, 4, 4),
  })
 
- print(f"VitsModel params: num_symbols={num_symbols} num_speakers={num_speakers} sample_rate={sample_rate}")
+ num_symbols = int(config['num_symbols'])
+ num_speakers = int(config['num_speakers'])
+ sample_rate = int(config['audio']['sample_rate'])
+ _LOGGER.debug(f"Config params: num_symbols={num_symbols} num_speakers={num_speakers} sample_rate={sample_rate}")
+
+ if resume_from_checkpoint:
+ # TODO (?) - add a flag to use params from config vs from checkpoint in case of mismatch
+ ckpt = VitsModel.load_from_checkpoint(resume_from_checkpoint, dataset=None)
+ _LOGGER.debug(f"Checkpoint params: num_symbols={ckpt.model_g.n_vocab} num_speakers={ckpt.model_g.n_speakers} sample_rate={ckpt.hparams.sample_rate}")
+ if ckpt.model_g.n_vocab != num_symbols:
+ _LOGGER.warning(f"Checkpoint num_symbols={ckpt.model_g.n_vocab} does not match config num_symbols={num_symbols}")
+ #-------------
+ # commented out this code because this is not supposed to happen if you used the preprocess.py script
+ # uncomment if you want to use the encoder from checkpoint + update num_symbols in the .json file manually
+ #-------------
+ #if ckpt.model_g.n_vocab > num_symbols and not discard_encoder:
+ # num_symbols = ckpt.model_g.n_vocab
+ # _LOGGER.info(f"Training with num_symbols={num_symbols}")
+ ###############
+ if ckpt.model_g.n_speakers != num_speakers:
+ _LOGGER.warning(f"Checkpoint num_speakers={ckpt.model_g.n_speakers} does not match config num_speakers={num_speakers}")
+ #num_speakers = ckpt.model_g.n_speakers
+ if ckpt.hparams.sample_rate != sample_rate:
+ _LOGGER.warning(f"Checkpoint sample_rate={ckpt.hparams.sample_rate} does not match config sample_rate={sample_rate}")
+ #sample_rate = ckpt.hparams.sample_rate
+
  model = VitsModel(
  num_symbols=num_symbols,
  num_speakers=num_speakers,
@@ -127,6 +150,31 @@ def main(
  dataset=[dataset_path],
  **dict_args,
  )
+ _LOGGER.info(f"VitsModel params: num_symbols={num_symbols} num_speakers={num_speakers} sample_rate={sample_rate}")
+
+ if resume_from_checkpoint:
+ saved_state_dict = ckpt.state_dict()
+
+ # Filter the state dictionary by removing the encoder weights
+ enc_key = 'model_g.enc_p.emb.weight'
+ if enc_key in saved_state_dict:
+ saved_shape = saved_state_dict[enc_key].shape
+ current_shape = model.state_dict()[enc_key].shape
+ if saved_shape[0] != current_shape[0]:
+ _LOGGER.warning(
+ "Size mismatch detected for '%s': saved shape %s vs current shape %s. ",
+ enc_key, saved_shape, current_shape
+ )
+ discard_encoder = True
+
+ if discard_encoder:
+ _LOGGER.warning(
+ "Skipping encoder weights from the checkpoint. (will be randomly initialized)"
+ )
+ saved_state_dict.pop(enc_key)
+
+ load_state_dict(model, saved_state_dict)
+ _LOGGER.info("Successfully loaded model weights.")
 
  if resume_from_single_speaker_checkpoint:
  assert num_speakers > 1, "--resume-from-single-speaker-checkpoint is only for multi-speaker models."
@@ -143,7 +191,7 @@ def main(
  load_state_dict(model.model_d, model_single.model_d.state_dict())
  _LOGGER.info('Successfully converted single-speaker checkpoint to multi-speaker')
 
- print('training started!!')
+ _LOGGER.info('training started!!')
  trainer.fit(model)
 
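The resume path above compares the checkpoint's text-encoder vocabulary against the dataset config and drops the phoneme-embedding tensor when the row counts differ (or when --discard-encoder is set) before calling load_state_dict. A minimal PyTorch sketch of that idea, with illustrative sizes and a bare nn.Embedding standing in for model_g.enc_p.emb:

import torch.nn as nn

old_emb = nn.Embedding(178, 192)   # embedding rows from the base checkpoint (illustrative sizes)
new_emb = nn.Embedding(256, 192)   # rows sized for the new config's num_symbols

state = {"weight": old_emb.weight.detach()}
try:
    new_emb.load_state_dict(state)                # raises RuntimeError: size mismatch (178 vs 256 rows)
except RuntimeError:
    state.pop("weight")                           # discard the incompatible encoder weights...
    new_emb.load_state_dict(state, strict=False)  # ...and keep the randomly initialized embedding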