Anchor-annotator 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {Anchor_annotator-0.7.0.dist-info → Anchor_annotator-0.8.0.dist-info}/METADATA +1 -1
- Anchor_annotator-0.8.0.dist-info/RECORD +22 -0
- {Anchor_annotator-0.7.0.dist-info → Anchor_annotator-0.8.0.dist-info}/WHEEL +1 -1
- anchor/_version.py +2 -2
- anchor/main.py +151 -16
- anchor/models.py +76 -45
- anchor/plot.py +155 -66
- anchor/resources_rc.py +32928 -121948
- anchor/settings.py +8 -1
- anchor/ui_main_window.py +81 -18
- anchor/ui_preferences.py +27 -14
- anchor/undo.py +15 -9
- anchor/widgets.py +16 -17
- anchor/workers.py +218 -8
- Anchor_annotator-0.7.0.dist-info/RECORD +0 -22
- {Anchor_annotator-0.7.0.dist-info → Anchor_annotator-0.8.0.dist-info}/LICENSE +0 -0
- {Anchor_annotator-0.7.0.dist-info → Anchor_annotator-0.8.0.dist-info}/top_level.txt +0 -0
anchor/workers.py
CHANGED
@@ -28,6 +28,7 @@ import yaml
|
|
28
28
|
from _kalpy.feat import compute_pitch
|
29
29
|
from _kalpy.ivector import Plda, ivector_normalize_length
|
30
30
|
from _kalpy.matrix import DoubleVector, FloatVector
|
31
|
+
from kalpy.feat.mfcc import MfccComputer
|
31
32
|
from kalpy.feat.pitch import PitchComputer
|
32
33
|
from montreal_forced_aligner import config
|
33
34
|
from montreal_forced_aligner.alignment import PretrainedAligner
|
@@ -41,6 +42,7 @@ from montreal_forced_aligner.data import (
|
|
41
42
|
CtmInterval,
|
42
43
|
DatasetType,
|
43
44
|
DistanceMetric,
|
45
|
+
Language,
|
44
46
|
ManifoldAlgorithm,
|
45
47
|
TextFileType,
|
46
48
|
WordType,
|
@@ -63,6 +65,7 @@ from montreal_forced_aligner.db import (
|
|
63
65
|
Word,
|
64
66
|
WordInterval,
|
65
67
|
bulk_update,
|
68
|
+
full_load_utterance,
|
66
69
|
)
|
67
70
|
from montreal_forced_aligner.diarization.multiprocessing import visualize_clusters
|
68
71
|
from montreal_forced_aligner.diarization.speaker_diarizer import SpeakerDiarizer
|
@@ -79,9 +82,16 @@ from montreal_forced_aligner.online.alignment import (
|
|
79
82
|
align_utterance_online,
|
80
83
|
update_utterance_intervals,
|
81
84
|
)
|
85
|
+
from montreal_forced_aligner.online.transcription import (
|
86
|
+
transcribe_utterance_online,
|
87
|
+
transcribe_utterance_online_speechbrain,
|
88
|
+
transcribe_utterance_online_whisper,
|
89
|
+
)
|
82
90
|
from montreal_forced_aligner.transcription import Transcriber
|
91
|
+
from montreal_forced_aligner.transcription.models import MfaFasterWhisperPipeline, load_model
|
83
92
|
from montreal_forced_aligner.utils import ProgressCallback, inspect_database
|
84
|
-
from montreal_forced_aligner.vad.
|
93
|
+
from montreal_forced_aligner.vad.models import FOUND_SPEECHBRAIN, MfaVAD
|
94
|
+
from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter, VadSegmenter
|
85
95
|
from montreal_forced_aligner.validation.corpus_validator import PretrainedValidator
|
86
96
|
from PySide6 import QtCore
|
87
97
|
from sklearn import discriminant_analysis, metrics, preprocessing
|
@@ -646,7 +656,7 @@ class ExportFilesWorker(Worker):
|
|
646
656
|
subqueryload(File.utterances),
|
647
657
|
subqueryload(File.speakers),
|
648
658
|
joinedload(File.sound_file, innerjoin=True).load_only(SoundFile.duration),
|
649
|
-
joinedload(File.text_file, innerjoin=
|
659
|
+
joinedload(File.text_file, innerjoin=False).load_only(TextFile.file_type),
|
650
660
|
)
|
651
661
|
.filter(File.modified == True) # noqa
|
652
662
|
)
|
@@ -835,7 +845,7 @@ class ChangeSpeakerWorker(Worker):
|
|
835
845
|
return
|
836
846
|
session.commit()
|
837
847
|
except Exception as e:
|
838
|
-
|
848
|
+
logger.warning(e)
|
839
849
|
session.rollback()
|
840
850
|
raise
|
841
851
|
return return_data
|
@@ -906,7 +916,7 @@ class BreakUpSpeakerWorker(Worker):
|
|
906
916
|
return
|
907
917
|
session.commit()
|
908
918
|
except Exception as e:
|
909
|
-
|
919
|
+
logger.warning(e)
|
910
920
|
session.rollback()
|
911
921
|
raise
|
912
922
|
return self.utterance_ids
|
@@ -3232,6 +3242,53 @@ class SpectrogramWorker(Worker): # pragma: no cover
|
|
3232
3242
|
self.signals.result.emit((stft, self.channel, self.begin, self.end, min_db, max_db))
|
3233
3243
|
|
3234
3244
|
|
3245
|
+
class MfccWorker(Worker):  # pragma: no cover
    """Worker that computes MFCC features for a chunk of audio samples.

    The ``result`` signal carries ``None`` when the audio is empty or longer
    than the ``SPEC_MAX_TIME`` setting; otherwise it carries the tuple
    ``(mfccs, channel, begin, end, min_value, max_value)``.
    """

    def __init__(self, y, sample_rate, begin, end, channel, *args):
        # NOTE(review): this label reads as if it was carried over from a
        # spectrogram worker; it is a runtime string, so it is left as-is.
        super().__init__("Generating spectrogram", *args)
        self.y, self.sample_rate = y, sample_rate
        self.begin, self.end = begin, end
        self.channel = channel

        # Fixed feature-extraction configuration handed to kalpy's MfccComputer.
        mfcc_options = dict(
            use_energy=False,
            raw_energy=False,
            frame_shift=10,
            frame_length=25,
            snip_edges=False,
            low_frequency=20,
            high_frequency=7800,
            sample_frequency=16000,
            allow_downsample=True,
            allow_upsample=True,
            dither=0.0,
            energy_floor=0.0,
            num_coefficients=13,
            num_mel_bins=23,
            cepstral_lifter=22,
            preemphasis_coefficient=0.97,
        )
        self.mfcc_computer = MfccComputer(**mfcc_options)

    def run(self):
        """Compute the MFCCs and emit them (or ``None``) on the result signal."""
        sample_count = self.y.shape[0]
        if sample_count == 0:
            # Nothing to analyse.
            self.signals.result.emit(None)
            return

        seconds = sample_count / self.sample_rate
        if seconds > self.settings.value(self.settings.SPEC_MAX_TIME):
            # Audio exceeds the configured maximum duration.
            self.signals.result.emit(None)
            return

        target_rate = self.mfcc_computer.sample_frequency
        if self.sample_rate > target_rate:
            # Downsample to the computer's rate; the worker's stored samples
            # and sample rate are overwritten with the resampled values.
            resampled_length = int(sample_count * target_rate / self.sample_rate)
            self.y = scipy.signal.resample(self.y, resampled_length)
            self.sample_rate = target_rate

        features = self.mfcc_computer.compute_mfccs(self.y).T
        lowest, highest = np.min(features), np.max(features)
        payload = (features, self.channel, self.begin, self.end, lowest, highest)
        self.signals.result.emit(payload)
|
3290
|
+
|
3291
|
+
|
3235
3292
|
class PitchWorker(Worker): # pragma: no cover
|
3236
3293
|
def __init__(self, y, sample_rate, begin, end, channel, normalized_min, normalized_max, *args):
|
3237
3294
|
super().__init__("Generating pitch track", *args)
|
@@ -3633,11 +3690,48 @@ class ImportAcousticModelWorker(FunctionWorker): # pragma: no cover
|
|
3633
3690
|
if not self.model_path:
|
3634
3691
|
return
|
3635
3692
|
try:
|
3636
|
-
|
3693
|
+
if str(self.model_path) == "whisper":
|
3694
|
+
cuda = self.settings.value(self.settings.CUDA)
|
3695
|
+
run_opts = None
|
3696
|
+
vad_model = None
|
3697
|
+
if cuda:
|
3698
|
+
run_opts = {"device": "cuda"}
|
3699
|
+
if FOUND_SPEECHBRAIN:
|
3700
|
+
vad_model = MfaVAD.from_hparams(
|
3701
|
+
source="speechbrain/vad-crdnn-libriparty",
|
3702
|
+
savedir=os.path.join(config.TEMPORARY_DIRECTORY, "models", "VAD"),
|
3703
|
+
run_opts=run_opts,
|
3704
|
+
)
|
3705
|
+
vad_options = {
|
3706
|
+
"apply_energy_VAD": False,
|
3707
|
+
"double_check": False,
|
3708
|
+
"activation_th": 0.5,
|
3709
|
+
"deactivation_th": 0.25,
|
3710
|
+
"en_activation_th": 0.5,
|
3711
|
+
"en_deactivation_th": 0.4,
|
3712
|
+
"speech_th": 0.5,
|
3713
|
+
"close_th": 0.333,
|
3714
|
+
"len_th": 0.333,
|
3715
|
+
}
|
3716
|
+
acoustic_model = load_model(
|
3717
|
+
"large-v3",
|
3718
|
+
device="cuda" if cuda else "cpu",
|
3719
|
+
download_root=os.path.join(
|
3720
|
+
config.TEMPORARY_DIRECTORY,
|
3721
|
+
"models",
|
3722
|
+
"Whisper",
|
3723
|
+
),
|
3724
|
+
threads=config.NUM_JOBS,
|
3725
|
+
vad_model=vad_model,
|
3726
|
+
vad_options=vad_options,
|
3727
|
+
)
|
3728
|
+
elif str(self.model_path) == "speechbrain":
|
3729
|
+
pass
|
3730
|
+
else:
|
3731
|
+
acoustic_model = AcousticModel(self.model_path)
|
3637
3732
|
except Exception:
|
3638
|
-
|
3639
|
-
|
3640
|
-
self.signals.error.emit((exctype, value, traceback.format_exc()))
|
3733
|
+
exctype, value = sys.exc_info()[:2]
|
3734
|
+
self.signals.error.emit((exctype, value, traceback.format_exc()))
|
3641
3735
|
else:
|
3642
3736
|
self.signals.result.emit(acoustic_model) # Return the result of the processing
|
3643
3737
|
finally:
|
@@ -3777,6 +3871,63 @@ class AlignUtteranceWorker(FunctionWorker): # pragma: no cover
|
|
3777
3871
|
self.signals.finished.emit() # Done
|
3778
3872
|
|
3779
3873
|
|
3874
|
+
class TranscribeUtteranceWorker(FunctionWorker):  # pragma: no cover
    """Worker that transcribes a single utterance with the loaded model.

    The transcription function is chosen from the type of
    ``corpus_model.acoustic_model``: an ``AcousticModel`` uses
    ``transcribe_utterance_online``, an ``MfaFasterWhisperPipeline`` uses
    ``transcribe_utterance_online_whisper``, and anything else is handed to
    ``transcribe_utterance_online_speechbrain``.

    On success the ``result`` signal carries ``(utterance_id, transcription)``;
    on failure the ``error`` signal carries ``(exctype, value, traceback_text)``.
    The ``finished`` signal is emitted in both cases.
    """

    def __init__(self, *args):
        super().__init__("Transcribing utterance", *args)
        self.corpus_model: typing.Optional[CorpusModel] = None
        self.utterance_id: typing.Optional[int] = None

    def set_params(self, corpus_model: CorpusModel, utterance_id: int):
        """Store the corpus model and the id of the utterance to transcribe."""
        self.corpus_model = corpus_model
        self.utterance_id = utterance_id

    def run(self):
        """Transcribe the configured utterance and store the text in the database."""
        self.settings.sync()
        try:
            # Setup lives inside the try block so that a failure here (for
            # example an unknown LANGUAGE setting raising KeyError) is reported
            # through the error signal and ``finished`` is still emitted.
            acoustic_model = self.corpus_model.acoustic_model
            uses_mfa_acoustic_model = isinstance(acoustic_model, AcousticModel)
            if uses_mfa_acoustic_model:
                self.corpus_model.check_align_lexicon_compiler()
            language = Language[self.settings.value(self.settings.LANGUAGE)]

            with self.corpus_model.corpus.session() as session:
                utterance = (
                    session.query(Utterance)
                    .options(
                        joinedload(Utterance.file, innerjoin=True).joinedload(
                            File.sound_file, innerjoin=True
                        ),
                        joinedload(Utterance.speaker, innerjoin=True),
                    )
                    .get(self.utterance_id)
                )
                if uses_mfa_acoustic_model:
                    transcription = transcribe_utterance_online(
                        acoustic_model,
                        utterance.to_kalpy(),
                        self.corpus_model.align_lexicon_compiler,
                    )
                elif isinstance(acoustic_model, MfaFasterWhisperPipeline):
                    acoustic_model.set_language(language)
                    transcription = transcribe_utterance_online_whisper(
                        acoustic_model,
                        utterance.to_kalpy().segment,
                    )
                else:
                    transcription = transcribe_utterance_online_speechbrain(
                        acoustic_model,
                        utterance.to_kalpy(),
                    )
                utterance.transcription_text = transcription
                session.commit()
        except Exception:
            exctype, value = sys.exc_info()[:2]
            self.signals.error.emit((exctype, value, traceback.format_exc()))
        else:
            self.signals.result.emit(
                (self.utterance_id, transcription)
            )  # Return the result of the processing
        finally:
            self.signals.finished.emit()  # Done
|
3929
|
+
|
3930
|
+
|
3780
3931
|
class SegmentUtteranceWorker(FunctionWorker): # pragma: no cover
|
3781
3932
|
def __init__(self, *args):
|
3782
3933
|
super().__init__("Segmenting utterance", *args)
|
@@ -3827,6 +3978,64 @@ class SegmentUtteranceWorker(FunctionWorker): # pragma: no cover
|
|
3827
3978
|
self.signals.finished.emit() # Done
|
3828
3979
|
|
3829
3980
|
|
3981
|
+
class TrimUtteranceWorker(FunctionWorker):  # pragma: no cover
    """Worker that finds trimmed begin/end boundaries for one utterance.

    When a VAD model has been supplied via ``set_vad_model`` it is used
    directly; otherwise a ``VadSegmenter`` is built for the corpus directory.
    On success the ``result`` signal carries ``(utterance_id, begin, end)``;
    on failure the ``error`` signal carries ``(exctype, value, traceback_text)``.
    ``finished`` is emitted in both cases.
    """

    def __init__(self, *args):
        super().__init__("Trimming utterance", *args)
        self.corpus_model: typing.Optional[CorpusModel] = None
        self.vad_model: typing.Optional[MfaVAD] = None
        self.utterance_id = None

    def set_vad_model(self, vad_model):
        """Store the VAD model to use for trimming (``None`` selects the segmenter path)."""
        self.vad_model = vad_model

    def set_params(self, corpus_model: CorpusModel, utterance_id: int):
        """Store the corpus model and the id of the utterance to trim."""
        self.corpus_model = corpus_model
        self.utterance_id = utterance_id

    def run(self):
        """Dispatch to the trimming strategy that matches the current configuration."""
        self.settings.sync()
        if self.vad_model is not None:
            self._trim_with_vad_model()
            return
        self._trim_with_segmenter()

    def _trim_with_segmenter(self):
        """Trim using a ``VadSegmenter`` created for the corpus directory."""
        vad_segmenter = VadSegmenter(
            corpus_directory=self.corpus_model.corpus.corpus_directory,
        )
        try:
            vad_segmenter.inspect_database()
            speech = vad_segmenter.segment_utterance(self.utterance_id, allow_empty=False)
            # Trimmed span runs from the first segment's start to the last one's end.
            bounds = (speech[0].begin, speech[-1].end)
        except Exception:
            exctype, value = sys.exc_info()[:2]
            self.signals.error.emit((exctype, value, traceback.format_exc()))
        else:
            self.signals.result.emit(
                (self.utterance_id, *bounds)
            )  # Return the result of the processing
        finally:
            vad_segmenter.cleanup_logger()
            self.signals.finished.emit()  # Done

    def _trim_with_vad_model(self):
        """Trim using the VAD model supplied through ``set_vad_model``."""
        try:
            with self.corpus_model.session() as session:
                loaded = full_load_utterance(session, self.utterance_id)
                segment = loaded.to_kalpy().segment
                # Compute the boundaries of the speech segments
                speech = self.vad_model.segment_utterance(segment, apply_energy_vad=True)
                try:
                    bounds = (speech[0].begin, speech[-1].end)
                except IndexError:
                    # No speech segments returned: keep the utterance's own boundaries.
                    bounds = (segment.begin, segment.end)
        except Exception:
            exctype, value = sys.exc_info()[:2]
            self.signals.error.emit((exctype, value, traceback.format_exc()))
        else:
            self.signals.result.emit(
                (self.utterance_id, *bounds)
            )  # Return the result of the processing
        finally:
            self.signals.finished.emit()  # Done
|
4037
|
+
|
4038
|
+
|
3830
4039
|
class AlignmentWorker(FunctionWorker): # pragma: no cover
|
3831
4040
|
def __init__(self, *args):
|
3832
4041
|
super().__init__("Aligning", *args)
|
@@ -3922,6 +4131,7 @@ class AlignmentWorker(FunctionWorker): # pragma: no cover
|
|
3922
4131
|
aligner.verify_transcripts()
|
3923
4132
|
else:
|
3924
4133
|
aligner.align()
|
4134
|
+
aligner.analyze_alignments()
|
3925
4135
|
except Exception:
|
3926
4136
|
exctype, value = sys.exc_info()[:2]
|
3927
4137
|
self.signals.error.emit((exctype, value, traceback.format_exc()))
|
@@ -1,22 +0,0 @@
|
|
1
|
-
anchor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
anchor/__main__.py,sha256=5ufG8lcx2x1am-04xI991AG7saJd24dxPw5JzjmB878,45
|
3
|
-
anchor/_version.py,sha256=akvr8ObxvMF-aaLBzW41juT4_KL3BjQUrjbwkIuQXMk,411
|
4
|
-
anchor/command_line.py,sha256=EucG805HyWk_zkMO9RXv9Yj0I0JVdDLZb1_DX2_ISjM,503
|
5
|
-
anchor/db.py,sha256=LlZzAy4bjmJIu0v4ev5Qjg_Fh2n9sMsKI2nAY1pwd0A,5057
|
6
|
-
anchor/main.py,sha256=Lyr3ppr-nzxaU7ZmWXc-luMsOtRBbV4ebCzk3rygur4,127781
|
7
|
-
anchor/models.py,sha256=35l7Kw3LVy-_ozdV_0ApSkKyCPViBwBmAukoq-jw90o,97668
|
8
|
-
anchor/plot.py,sha256=imNRLI76VgEf4n9UGNvIaTsqn65hqnN396e4iwRTh70,113387
|
9
|
-
anchor/resources_rc.py,sha256=tzJHrJw3MpjAlnj-DtCmaR4A8gAaLF966XEXs5HNIjc,8464375
|
10
|
-
anchor/settings.py,sha256=N2gRFQEpY4pLYgcDz1Aq-2c7CfmbNxmRmVcPijrHsCo,52118
|
11
|
-
anchor/ui_corpus_manager.py,sha256=e3ybOd4UdYarrLBATxI8vIFnioa4R_BHrbsEz5mJ5eA,8564
|
12
|
-
anchor/ui_error_dialog.py,sha256=HKbjGT_jtdb9jfn9THQMbl1fmcdWyjYDazM4hCwZ5Yo,3931
|
13
|
-
anchor/ui_main_window.py,sha256=XK91lhFAIEURZ6nwxIA74X-8j-P76JuJsN-ahun65rw,37043
|
14
|
-
anchor/ui_preferences.py,sha256=g3tcjAMFKIAqUJNEke7ww4LkdeTFA1zb8_lrhF6k5fo,43271
|
15
|
-
anchor/undo.py,sha256=T8CJpSZVZbItpU7KMZU2F49mNv1wo0rvMWtNIEbieeo,32856
|
16
|
-
anchor/widgets.py,sha256=NjQAc02QVu97QClhXcylj_P6IP0DsxWae_eiZR5Bw3M,159300
|
17
|
-
anchor/workers.py,sha256=ciVOlK15MiDq7juAivcQB6PEiEs7DemP0BOrcpnm2to,182624
|
18
|
-
Anchor_annotator-0.7.0.dist-info/LICENSE,sha256=C0oIsblENEgWQ7XMNdYoXyXsIA5wa3YF0I9lK3H7A1s,1076
|
19
|
-
Anchor_annotator-0.7.0.dist-info/METADATA,sha256=hvYb1JLmhGJEfwyTNGckZl6tqtj407fmYYdPqPOgwcE,1500
|
20
|
-
Anchor_annotator-0.7.0.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
|
21
|
-
Anchor_annotator-0.7.0.dist-info/top_level.txt,sha256=wX6ZKxImGRZKFQjs3f6XYw_TfbAp6Xs3SmbLfLbFAJ0,7
|
22
|
-
Anchor_annotator-0.7.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|