Anchor_annotator-0.7.0-py3-none-any.whl → Anchor_annotator-0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anchor/workers.py CHANGED
@@ -28,6 +28,7 @@ import yaml
 from _kalpy.feat import compute_pitch
 from _kalpy.ivector import Plda, ivector_normalize_length
 from _kalpy.matrix import DoubleVector, FloatVector
+from kalpy.feat.mfcc import MfccComputer
 from kalpy.feat.pitch import PitchComputer
 from montreal_forced_aligner import config
 from montreal_forced_aligner.alignment import PretrainedAligner
@@ -41,6 +42,7 @@ from montreal_forced_aligner.data import (
     CtmInterval,
     DatasetType,
     DistanceMetric,
+    Language,
     ManifoldAlgorithm,
     TextFileType,
     WordType,
@@ -63,6 +65,7 @@ from montreal_forced_aligner.db import (
     Word,
     WordInterval,
     bulk_update,
+    full_load_utterance,
 )
 from montreal_forced_aligner.diarization.multiprocessing import visualize_clusters
 from montreal_forced_aligner.diarization.speaker_diarizer import SpeakerDiarizer
@@ -79,9 +82,16 @@ from montreal_forced_aligner.online.alignment import (
     align_utterance_online,
     update_utterance_intervals,
 )
+from montreal_forced_aligner.online.transcription import (
+    transcribe_utterance_online,
+    transcribe_utterance_online_speechbrain,
+    transcribe_utterance_online_whisper,
+)
 from montreal_forced_aligner.transcription import Transcriber
+from montreal_forced_aligner.transcription.models import MfaFasterWhisperPipeline, load_model
 from montreal_forced_aligner.utils import ProgressCallback, inspect_database
-from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter
+from montreal_forced_aligner.vad.models import FOUND_SPEECHBRAIN, MfaVAD
+from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter, VadSegmenter
 from montreal_forced_aligner.validation.corpus_validator import PretrainedValidator
 from PySide6 import QtCore
 from sklearn import discriminant_analysis, metrics, preprocessing
@@ -646,7 +656,7 @@ class ExportFilesWorker(Worker):
                 subqueryload(File.utterances),
                 subqueryload(File.speakers),
                 joinedload(File.sound_file, innerjoin=True).load_only(SoundFile.duration),
-                joinedload(File.text_file, innerjoin=True).load_only(TextFile.file_type),
+                joinedload(File.text_file, innerjoin=False).load_only(TextFile.file_type),
             )
             .filter(File.modified == True)  # noqa
         )
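The `innerjoin` flip is the functional change in this hunk: with `innerjoin=True`, the eager load emits an INNER JOIN, so any `File` without a `TextFile` row silently vanishes from the export query; `innerjoin=False` degrades to a LEFT OUTER JOIN and keeps such files. A minimal sketch of the difference, on a hypothetical two-table schema (not Anchor's actual models):

```python
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base, joinedload, relationship

Base = declarative_base()


class File(Base):
    __tablename__ = "file"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    text_file = relationship("TextFile", uselist=False, back_populates="file")


class TextFile(Base):
    __tablename__ = "text_file"
    id = Column(Integer, primary_key=True)
    file_id = Column(Integer, ForeignKey("file.id"))
    file_type = Column(String)
    file = relationship("File", back_populates="text_file")


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add(File(name="audio_only"))  # a file with no transcript file
    session.commit()
    # INNER JOIN: the parent row is dropped because the child is missing.
    inner = session.query(File).options(joinedload(File.text_file, innerjoin=True)).all()
    # LEFT OUTER JOIN: the parent row survives with text_file == None.
    outer = session.query(File).options(joinedload(File.text_file, innerjoin=False)).all()
    print(len(inner), len(outer))  # prints: 0 1
```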
@@ -835,7 +845,7 @@ class ChangeSpeakerWorker(Worker):
                 return
             session.commit()
         except Exception as e:
-            print(e)
+            logger.warning(e)
             session.rollback()
             raise
         return return_data
@@ -906,7 +916,7 @@ class BreakUpSpeakerWorker(Worker):
                 return
             session.commit()
         except Exception as e:
-            print(e)
+            logger.warning(e)
             session.rollback()
             raise
         return self.utterance_ids
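Both speaker workers swap `print(e)` for `logger.warning(e)` in their rollback paths, so failures are routed through logging handlers (with severity and timestamps) instead of disappearing on stdout. A minimal sketch of the pattern; the logger name and helper below are illustrative, not Anchor's:

```python
import logging

logger = logging.getLogger("anchor")  # hypothetical logger name


def commit_or_rollback(session):
    """Commit, but log and re-raise on failure, mirroring the workers above."""
    try:
        session.commit()
    except Exception as e:
        logger.warning(e)  # captured by the app's log handlers, not bare stdout
        session.rollback()
        raise
```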
@@ -3232,6 +3242,53 @@ class SpectrogramWorker(Worker):  # pragma: no cover
         self.signals.result.emit((stft, self.channel, self.begin, self.end, min_db, max_db))
 
 
+class MfccWorker(Worker):  # pragma: no cover
+    def __init__(self, y, sample_rate, begin, end, channel, *args):
+        super().__init__("Generating spectrogram", *args)
+        self.y = y
+        self.sample_rate = sample_rate
+        self.begin = begin
+        self.end = end
+        self.channel = channel
+
+        self.mfcc_computer = MfccComputer(
+            use_energy=False,
+            raw_energy=False,
+            frame_shift=10,
+            frame_length=25,
+            snip_edges=False,
+            low_frequency=20,
+            high_frequency=7800,
+            sample_frequency=16000,
+            allow_downsample=True,
+            allow_upsample=True,
+            dither=0.0,
+            energy_floor=0.0,
+            num_coefficients=13,
+            num_mel_bins=23,
+            cepstral_lifter=22,
+            preemphasis_coefficient=0.97,
+        )
+
+    def run(self):
+        if self.y.shape[0] == 0:
+            self.signals.result.emit(None)
+            return
+        duration = self.y.shape[0] / self.sample_rate
+        if duration > self.settings.value(self.settings.SPEC_MAX_TIME):
+            self.signals.result.emit(None)
+            return
+        if self.sample_rate > self.mfcc_computer.sample_frequency:
+            self.y = scipy.signal.resample(
+                self.y,
+                int(self.y.shape[0] * self.mfcc_computer.sample_frequency / self.sample_rate),
+            )
+            self.sample_rate = self.mfcc_computer.sample_frequency
+        stft = self.mfcc_computer.compute_mfccs(self.y).T
+        min_db, max_db = np.min(stft), np.max(stft)
+        self.signals.result.emit((stft, self.channel, self.begin, self.end, min_db, max_db))
+
+
 class PitchWorker(Worker):  # pragma: no cover
     def __init__(self, y, sample_rate, begin, end, channel, normalized_min, normalized_max, *args):
         super().__init__("Generating pitch track", *args)
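The new `MfccWorker` gives the waveform pane an MFCC view using Kaldi-style defaults (25 ms windows, 10 ms shift, 13 cepstral coefficients over 23 mel bins), downsampling to 16 kHz when needed and emitting a transposed coefficients-by-frames matrix like a spectrogram image. The same flow in standalone form, a sketch that assumes only the `MfccComputer` API visible in this hunk; omitted parameters are left at kalpy's defaults:

```python
import numpy as np
import scipy.signal
from kalpy.feat.mfcc import MfccComputer

# Parameters mirror the worker above.
mfcc_computer = MfccComputer(
    sample_frequency=16000,
    frame_shift=10,
    frame_length=25,
    num_coefficients=13,
    num_mel_bins=23,
    snip_edges=False,
    dither=0.0,
)


def mfcc_image(y: np.ndarray, sample_rate: int) -> np.ndarray:
    """Return a (num_coefficients, num_frames) matrix suitable for plotting."""
    if sample_rate > mfcc_computer.sample_frequency:
        # Resample down to the computer's expected rate, as the worker does.
        n = int(y.shape[0] * mfcc_computer.sample_frequency / sample_rate)
        y = scipy.signal.resample(y, n)
    return mfcc_computer.compute_mfccs(y).T  # transpose to coefficients x frames
```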
@@ -3633,11 +3690,48 @@ class ImportAcousticModelWorker(FunctionWorker):  # pragma: no cover
         if not self.model_path:
             return
         try:
-            acoustic_model = AcousticModel(self.model_path)
+            if str(self.model_path) == "whisper":
+                cuda = self.settings.value(self.settings.CUDA)
+                run_opts = None
+                vad_model = None
+                if cuda:
+                    run_opts = {"device": "cuda"}
+                if FOUND_SPEECHBRAIN:
+                    vad_model = MfaVAD.from_hparams(
+                        source="speechbrain/vad-crdnn-libriparty",
+                        savedir=os.path.join(config.TEMPORARY_DIRECTORY, "models", "VAD"),
+                        run_opts=run_opts,
+                    )
+                vad_options = {
+                    "apply_energy_VAD": False,
+                    "double_check": False,
+                    "activation_th": 0.5,
+                    "deactivation_th": 0.25,
+                    "en_activation_th": 0.5,
+                    "en_deactivation_th": 0.4,
+                    "speech_th": 0.5,
+                    "close_th": 0.333,
+                    "len_th": 0.333,
+                }
+                acoustic_model = load_model(
+                    "large-v3",
+                    device="cuda" if cuda else "cpu",
+                    download_root=os.path.join(
+                        config.TEMPORARY_DIRECTORY,
+                        "models",
+                        "Whisper",
+                    ),
+                    threads=config.NUM_JOBS,
+                    vad_model=vad_model,
+                    vad_options=vad_options,
+                )
+            elif str(self.model_path) == "speechbrain":
+                pass
+            else:
+                acoustic_model = AcousticModel(self.model_path)
         except Exception:
-            if os.path.exists(self.model_path):
-                exctype, value = sys.exc_info()[:2]
-                self.signals.error.emit((exctype, value, traceback.format_exc()))
+            exctype, value = sys.exc_info()[:2]
+            self.signals.error.emit((exctype, value, traceback.format_exc()))
         else:
             self.signals.result.emit(acoustic_model)  # Return the result of the processing
         finally:
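`ImportAcousticModelWorker` now treats the literal strings "whisper" and "speechbrain" as sentinels selecting online pipelines rather than paths to MFA model archives (note that the "speechbrain" branch is currently a no-op, so its model must be set up elsewhere). The Whisper setup in standalone form, a sketch that reuses only the calls and arguments visible in this hunk:

```python
import os

from montreal_forced_aligner import config
from montreal_forced_aligner.transcription.models import load_model
from montreal_forced_aligner.vad.models import FOUND_SPEECHBRAIN, MfaVAD

vad_model = None
if FOUND_SPEECHBRAIN:
    # Optional SpeechBrain VAD, cached under MFA's temporary directory.
    vad_model = MfaVAD.from_hparams(
        source="speechbrain/vad-crdnn-libriparty",
        savedir=os.path.join(config.TEMPORARY_DIRECTORY, "models", "VAD"),
    )
model = load_model(
    "large-v3",
    device="cpu",  # the worker picks "cuda" when the CUDA setting is on
    download_root=os.path.join(config.TEMPORARY_DIRECTORY, "models", "Whisper"),
    threads=config.NUM_JOBS,
    vad_model=vad_model,
    vad_options={  # thresholds copied from the worker above
        "apply_energy_VAD": False,
        "double_check": False,
        "activation_th": 0.5,
        "deactivation_th": 0.25,
        "en_activation_th": 0.5,
        "en_deactivation_th": 0.4,
        "speech_th": 0.5,
        "close_th": 0.333,
        "len_th": 0.333,
    },
)
```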
@@ -3777,6 +3871,63 @@ class AlignUtteranceWorker(FunctionWorker):  # pragma: no cover
         self.signals.finished.emit()  # Done
 
 
+class TranscribeUtteranceWorker(FunctionWorker):  # pragma: no cover
+    def __init__(self, *args):
+        super().__init__("Transcribing utterance", *args)
+        self.corpus_model: typing.Optional[CorpusModel] = None
+        self.utterance_id: typing.Optional[int] = None
+
+    def set_params(self, corpus_model: CorpusModel, utterance_id: int):
+        self.corpus_model = corpus_model
+        self.utterance_id = utterance_id
+
+    def run(self):
+        self.settings.sync()
+        if isinstance(self.corpus_model.acoustic_model, AcousticModel):
+            self.corpus_model.check_align_lexicon_compiler()
+        language = Language[self.settings.value(self.settings.LANGUAGE)]
+        try:
+            with self.corpus_model.corpus.session() as session:
+                utterance = (
+                    session.query(Utterance)
+                    .options(
+                        joinedload(Utterance.file, innerjoin=True).joinedload(
+                            File.sound_file, innerjoin=True
+                        ),
+                        joinedload(Utterance.speaker, innerjoin=True),
+                    )
+                    .get(self.utterance_id)
+                )
+                if isinstance(self.corpus_model.acoustic_model, AcousticModel):
+                    transcription = transcribe_utterance_online(
+                        self.corpus_model.acoustic_model,
+                        utterance.to_kalpy(),
+                        self.corpus_model.align_lexicon_compiler,
+                    )
+                elif isinstance(self.corpus_model.acoustic_model, MfaFasterWhisperPipeline):
+                    self.corpus_model.acoustic_model.set_language(language)
+                    transcription = transcribe_utterance_online_whisper(
+                        self.corpus_model.acoustic_model,
+                        utterance.to_kalpy().segment,
+                    )
+                else:
+                    transcription = transcribe_utterance_online_speechbrain(
+                        self.corpus_model.acoustic_model,
+                        utterance.to_kalpy(),
+                    )
+                utterance.transcription_text = transcription
+                session.commit()
+        except Exception:
+            exctype, value = sys.exc_info()[:2]
+            self.signals.error.emit((exctype, value, traceback.format_exc()))
+        else:
+            self.signals.result.emit(
+                (self.utterance_id, transcription)
+            )  # Return the result of the processing
+        finally:
+            self.signals.finished.emit()  # Done
+
+
 class SegmentUtteranceWorker(FunctionWorker):  # pragma: no cover
     def __init__(self, *args):
         super().__init__("Segmenting utterance", *args)
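A sketch of how a caller might drive the new worker: only `set_params()` and the `(utterance_id, transcription)` payload of the `result` signal come from the hunk above; the thread-pool wiring is an assumption about the surrounding Qt application:

```python
from PySide6 import QtCore


def request_transcription(corpus_model, utterance_id: int) -> None:
    worker = TranscribeUtteranceWorker()
    worker.set_params(corpus_model, utterance_id)
    # result carries (utterance_id, transcription), per run() above.
    worker.signals.result.connect(lambda payload: print("transcribed:", payload))
    worker.signals.error.connect(lambda err: print("failed:", err[1]))
    QtCore.QThreadPool.globalInstance().start(worker)
```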
@@ -3827,6 +3978,64 @@ class SegmentUtteranceWorker(FunctionWorker):  # pragma: no cover
         self.signals.finished.emit()  # Done
 
 
+class TrimUtteranceWorker(FunctionWorker):  # pragma: no cover
+    def __init__(self, *args):
+        super().__init__("Trimming utterance", *args)
+        self.corpus_model: typing.Optional[CorpusModel] = None
+        self.vad_model: typing.Optional[MfaVAD] = None
+        self.utterance_id = None
+
+    def set_vad_model(self, vad_model):
+        self.vad_model = vad_model
+
+    def set_params(self, corpus_model: CorpusModel, utterance_id: int):
+        self.corpus_model = corpus_model
+        self.utterance_id = utterance_id
+
+    def run(self):
+        self.settings.sync()
+        if self.vad_model is None:
+            segmenter = VadSegmenter(
+                corpus_directory=self.corpus_model.corpus.corpus_directory,
+            )
+            try:
+                segmenter.inspect_database()
+                segments = segmenter.segment_utterance(self.utterance_id, allow_empty=False)
+                begin = segments[0].begin
+                end = segments[-1].end
+            except Exception:
+                exctype, value = sys.exc_info()[:2]
+                self.signals.error.emit((exctype, value, traceback.format_exc()))
+            else:
+                self.signals.result.emit(
+                    (self.utterance_id, begin, end)
+                )  # Return the result of the processing
+            finally:
+                segmenter.cleanup_logger()
+                self.signals.finished.emit()  # Done
+        else:
+            try:
+                with self.corpus_model.session() as session:
+                    utterance = full_load_utterance(session, self.utterance_id)
+                    segment = utterance.to_kalpy().segment
+                    # Compute the boundaries of the speech segments
+                    segments = self.vad_model.segment_utterance(segment, apply_energy_vad=True)
+                    try:
+                        begin = segments[0].begin
+                        end = segments[-1].end
+                    except IndexError:
+                        begin, end = segment.begin, segment.end
+            except Exception:
+                exctype, value = sys.exc_info()[:2]
+                self.signals.error.emit((exctype, value, traceback.format_exc()))
+            else:
+                self.signals.result.emit(
+                    (self.utterance_id, begin, end)
+                )  # Return the result of the processing
+            finally:
+                self.signals.finished.emit()  # Done
+
+
 class AlignmentWorker(FunctionWorker):  # pragma: no cover
     def __init__(self, *args):
         super().__init__("Aligning", *args)
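`TrimUtteranceWorker` tightens utterance boundaries to detected speech via two paths: a `VadSegmenter` over the corpus database when no neural model is loaded, and the SpeechBrain-backed `MfaVAD` otherwise. On the neural path the key detail is the `IndexError` fallback, shown here in miniature: if the VAD finds no speech at all, the original boundaries are kept rather than collapsing the utterance to nothing:

```python
def trimmed_bounds(segments, segment):
    """First-segment begin to last-segment end, or the original bounds."""
    try:
        return segments[0].begin, segments[-1].end
    except IndexError:  # VAD returned no speech segments
        return segment.begin, segment.end
```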
@@ -3922,6 +4131,7 @@ class AlignmentWorker(FunctionWorker):  # pragma: no cover
                 aligner.verify_transcripts()
             else:
                 aligner.align()
+                aligner.analyze_alignments()
         except Exception:
             exctype, value = sys.exc_info()[:2]
             self.signals.error.emit((exctype, value, traceback.format_exc()))
Anchor_annotator-0.7.0.dist-info/RECORD DELETED
@@ -1,22 +0,0 @@
-anchor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-anchor/__main__.py,sha256=5ufG8lcx2x1am-04xI991AG7saJd24dxPw5JzjmB878,45
-anchor/_version.py,sha256=akvr8ObxvMF-aaLBzW41juT4_KL3BjQUrjbwkIuQXMk,411
-anchor/command_line.py,sha256=EucG805HyWk_zkMO9RXv9Yj0I0JVdDLZb1_DX2_ISjM,503
-anchor/db.py,sha256=LlZzAy4bjmJIu0v4ev5Qjg_Fh2n9sMsKI2nAY1pwd0A,5057
-anchor/main.py,sha256=Lyr3ppr-nzxaU7ZmWXc-luMsOtRBbV4ebCzk3rygur4,127781
-anchor/models.py,sha256=35l7Kw3LVy-_ozdV_0ApSkKyCPViBwBmAukoq-jw90o,97668
-anchor/plot.py,sha256=imNRLI76VgEf4n9UGNvIaTsqn65hqnN396e4iwRTh70,113387
-anchor/resources_rc.py,sha256=tzJHrJw3MpjAlnj-DtCmaR4A8gAaLF966XEXs5HNIjc,8464375
-anchor/settings.py,sha256=N2gRFQEpY4pLYgcDz1Aq-2c7CfmbNxmRmVcPijrHsCo,52118
-anchor/ui_corpus_manager.py,sha256=e3ybOd4UdYarrLBATxI8vIFnioa4R_BHrbsEz5mJ5eA,8564
-anchor/ui_error_dialog.py,sha256=HKbjGT_jtdb9jfn9THQMbl1fmcdWyjYDazM4hCwZ5Yo,3931
-anchor/ui_main_window.py,sha256=XK91lhFAIEURZ6nwxIA74X-8j-P76JuJsN-ahun65rw,37043
-anchor/ui_preferences.py,sha256=g3tcjAMFKIAqUJNEke7ww4LkdeTFA1zb8_lrhF6k5fo,43271
-anchor/undo.py,sha256=T8CJpSZVZbItpU7KMZU2F49mNv1wo0rvMWtNIEbieeo,32856
-anchor/widgets.py,sha256=NjQAc02QVu97QClhXcylj_P6IP0DsxWae_eiZR5Bw3M,159300
-anchor/workers.py,sha256=ciVOlK15MiDq7juAivcQB6PEiEs7DemP0BOrcpnm2to,182624
-Anchor_annotator-0.7.0.dist-info/LICENSE,sha256=C0oIsblENEgWQ7XMNdYoXyXsIA5wa3YF0I9lK3H7A1s,1076
-Anchor_annotator-0.7.0.dist-info/METADATA,sha256=hvYb1JLmhGJEfwyTNGckZl6tqtj407fmYYdPqPOgwcE,1500
-Anchor_annotator-0.7.0.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
-Anchor_annotator-0.7.0.dist-info/top_level.txt,sha256=wX6ZKxImGRZKFQjs3f6XYw_TfbAp6Xs3SmbLfLbFAJ0,7
-Anchor_annotator-0.7.0.dist-info/RECORD,,