batchalign 0.7.11b3__tar.gz → 0.7.11b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.11b3/batchalign.egg-info → batchalign-0.7.11b4}/PKG-INFO +1 -1
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/cli/cli.py +19 -7
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/__init__.py +1 -1
- batchalign-0.7.11b4/batchalign/models/wave2vec/__init__.py +1 -0
- batchalign-0.7.11b4/batchalign/models/wave2vec/infer_fa.py +135 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/__init__.py +1 -1
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/dispatch.py +3 -2
- batchalign-0.7.11b4/batchalign/pipelines/fa/__init__.py +2 -0
- batchalign-0.7.11b4/batchalign/pipelines/fa/wave2vec_fa.py +162 -0
- batchalign-0.7.11b4/batchalign/version +3 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4/batchalign.egg-info}/PKG-INFO +1 -1
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/SOURCES.txt +3 -0
- batchalign-0.7.11b3/batchalign/pipelines/fa/__init__.py +0 -1
- batchalign-0.7.11b3/batchalign/version +0 -3
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/LICENSE +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/MANIFEST.in +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/README.md +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/__main__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/constants.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/document.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/errors.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/setup.cfg +0 -0
- {batchalign-0.7.11b3 → batchalign-0.7.11b4}/setup.py +0 -0
@@ -107,10 +107,12 @@ batchalign.add_command(train, "models")
|
|
107
107
|
@common_options
|
108
108
|
@click.option("--whisper/--rev",
|
109
109
|
default=False, help="For utterance timing recovery, OpenAI Whisper (ASR) instead of Rev.AI (default).")
|
110
|
+
@click.option("--wav2vec/--whisper_fa",
|
111
|
+
default=False, help="Use Whisper instead of Wav2Vec for English (defaults for Whisper for non-English)")
|
110
112
|
@click.option("--pauses", type=bool, default=False, help="Should we try to bullet each word or should we try to add pauses in between words by grouping them? Default: no pauses.", is_flag=True)
|
111
113
|
|
112
114
|
@click.pass_context
|
113
|
-
def align(ctx, in_dir, out_dir, whisper, **kwargs):
|
115
|
+
def align(ctx, in_dir, out_dir, whisper, wav2vec, **kwargs):
|
114
116
|
"""Align transcripts against corresponding media files."""
|
115
117
|
def loader(file):
|
116
118
|
return (
|
@@ -121,12 +123,22 @@ def align(ctx, in_dir, out_dir, whisper, **kwargs):
|
|
121
123
|
def writer(doc, output):
|
122
124
|
CHATFile(doc=doc).write(output)
|
123
125
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
126
|
+
if not wav2vec:
|
127
|
+
_dispatch("align", "eng", 1,
|
128
|
+
["cha"], ctx,
|
129
|
+
in_dir, out_dir,
|
130
|
+
loader, writer, C,
|
131
|
+
fa="whisper_fa",
|
132
|
+
utr="whisper_utr" if whisper else "rev_utr",
|
133
|
+
**kwargs)
|
134
|
+
else:
|
135
|
+
_dispatch("align", "eng", 1,
|
136
|
+
["cha"], ctx,
|
137
|
+
in_dir, out_dir,
|
138
|
+
loader, writer, C,
|
139
|
+
fa="wav2vec_fa",
|
140
|
+
utr="whisper_utr" if whisper else "rev_utr",
|
141
|
+
**kwargs)
|
130
142
|
|
131
143
|
#################### TRANSCRIBE ################################
|
132
144
|
|
@@ -0,0 +1 @@
|
|
1
|
+
from .infer_fa import Wave2VecFAModel
|
@@ -0,0 +1,135 @@
|
|
1
|
+
from transformers import WhisperProcessor, WhisperTokenizer, WhisperForConditionalGeneration
|
2
|
+
|
3
|
+
import torch
|
4
|
+
from torchaudio import load
|
5
|
+
from torchaudio import transforms as T
|
6
|
+
from batchalign.models.utils import ASRAudioFile
|
7
|
+
|
8
|
+
import torchaudio
|
9
|
+
bundle = torchaudio.pipelines.MMS_FA
|
10
|
+
import torchaudio.functional as AF
|
11
|
+
|
12
|
+
import numpy as np
|
13
|
+
|
14
|
+
import logging
|
15
|
+
L = logging.getLogger("batchalign")
|
16
|
+
|
17
|
+
# DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
18
|
+
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
|
19
|
+
TIME_PRECISION = 0.02
|
20
|
+
|
21
|
+
# inference engine
|
22
|
+
class Wave2VecFAModel(object):
|
23
|
+
"""An Forced Alignment engine built out of whisper
|
24
|
+
|
25
|
+
Parameters
|
26
|
+
----------
|
27
|
+
model : str
|
28
|
+
The model path to load from.
|
29
|
+
target_sample_rate : optional, int
|
30
|
+
The sample rate to cast to. Defaults 16000 by Whisper.
|
31
|
+
|
32
|
+
Example
|
33
|
+
-------
|
34
|
+
>>> engine = Wave2VecFAModel()
|
35
|
+
>>> file = engine.load("./data/myfile.wav")
|
36
|
+
>>> timestamps = engine(audio=file.chunk(0, 1500), text="this is my transcript") # FA
|
37
|
+
"""
|
38
|
+
|
39
|
+
def __init__(self, target_sample_rate=16000):
|
40
|
+
L.debug("Initializing Wave2vec FA model")
|
41
|
+
self.model = bundle.get_model().to(DEVICE)
|
42
|
+
L.debug("Wave2Vec FA initialization done.")
|
43
|
+
|
44
|
+
# save the target sample rate
|
45
|
+
self.sample_rate = target_sample_rate
|
46
|
+
|
47
|
+
def load(self, f):
|
48
|
+
"""Load an audio file for procesing.
|
49
|
+
|
50
|
+
Parameters
|
51
|
+
----------
|
52
|
+
f : str
|
53
|
+
The audio .wav file name to process.
|
54
|
+
num_speakers : int
|
55
|
+
The number of speakers
|
56
|
+
|
57
|
+
Returns
|
58
|
+
-------
|
59
|
+
Tuple[ASRAudioFile, List[dict]]
|
60
|
+
Return processed audio file and speaker segments.
|
61
|
+
"""
|
62
|
+
|
63
|
+
# function: load and resample audio
|
64
|
+
audio_arr, rate = load(f)
|
65
|
+
|
66
|
+
# resample if needed
|
67
|
+
if rate != self.sample_rate:
|
68
|
+
audio_arr = T.Resample(rate, self.sample_rate)(audio_arr)
|
69
|
+
|
70
|
+
# transpose and mean
|
71
|
+
resampled = torch.mean(audio_arr.transpose(0,1), dim=1)
|
72
|
+
|
73
|
+
# and return the audio file
|
74
|
+
return ASRAudioFile(f, resampled, self.sample_rate)
|
75
|
+
|
76
|
+
def __call__(self, audio, text):
|
77
|
+
"""Run forced alignment on the audio file.
|
78
|
+
|
79
|
+
Arguments
|
80
|
+
----------
|
81
|
+
audio : tensor
|
82
|
+
The audio file to process.
|
83
|
+
text : str
|
84
|
+
The transcript to align to.
|
85
|
+
|
86
|
+
Returns
|
87
|
+
-------
|
88
|
+
List[Tuple[str, Tuple[int, int]]]
|
89
|
+
A list of speaker segments
|
90
|
+
"""
|
91
|
+
|
92
|
+
L.debug("Running Wav2Vec word-level forced alignment...")
|
93
|
+
|
94
|
+
# complete the call function, don't write anything else
|
95
|
+
L.debug("Running Wav2Vec word-level forced alignment...")
|
96
|
+
|
97
|
+
# Move audio to device and normalize
|
98
|
+
audio = audio.to(DEVICE)
|
99
|
+
|
100
|
+
# Get emission matrix from model
|
101
|
+
emission, _ = self.model(audio.unsqueeze(0))
|
102
|
+
emission = emission.cpu().detach()
|
103
|
+
|
104
|
+
# Get tokens and transcript
|
105
|
+
dictionary = bundle.get_dict()
|
106
|
+
|
107
|
+
# Convert text to tokens
|
108
|
+
transcript = torch.tensor([dictionary.get(c, dictionary["*"])
|
109
|
+
for word in text
|
110
|
+
for c in word.lower()])
|
111
|
+
|
112
|
+
# Run forced alignment
|
113
|
+
path, scores = AF.forced_align(emission, transcript.unsqueeze(0))
|
114
|
+
alignments, scores = path[0], scores[0]
|
115
|
+
scores = scores.exp()
|
116
|
+
|
117
|
+
# Merge repeated tokens and remove blanks
|
118
|
+
path = AF.merge_tokens(alignments, scores)
|
119
|
+
|
120
|
+
def unflatten(list_, lengths):
|
121
|
+
assert len(list_) == sum(lengths)
|
122
|
+
i = 0
|
123
|
+
ret = []
|
124
|
+
for l in lengths:
|
125
|
+
ret.append(list_[i : i + l])
|
126
|
+
i += l
|
127
|
+
return ret
|
128
|
+
|
129
|
+
# Unflatten to get character-level alignments
|
130
|
+
word_spans = unflatten(path, [len(word) for word in text])
|
131
|
+
ratio = audio.size(0)/emission.size(1)
|
132
|
+
word_spans = [(int(((spans[0].start*ratio)/self.sample_rate)*1000),
|
133
|
+
int(((spans[-1].end*ratio)/self.sample_rate)*1000)) for spans in word_spans]
|
134
|
+
|
135
|
+
return list(zip(text, word_spans))
|
@@ -6,7 +6,7 @@ from .morphosyntax import StanzaEngine, CorefEngine
|
|
6
6
|
from .cleanup import NgramRetraceEngine, DisfluencyReplacementEngine
|
7
7
|
from .speaker import NemoSpeakerEngine
|
8
8
|
|
9
|
-
from .fa import WhisperFAEngine
|
9
|
+
from .fa import WhisperFAEngine, Wave2VecFAEngine
|
10
10
|
from .utr import WhisperUTREngine, RevUTREngine
|
11
11
|
|
12
12
|
from .analysis import EvaluationEngine
|
@@ -6,7 +6,7 @@ Tabulate default packages and options.
|
|
6
6
|
from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
|
7
7
|
NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
|
8
8
|
RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
|
9
|
-
StanzaUtteranceEngine, CorefEngine)
|
9
|
+
StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine)
|
10
10
|
from batchalign import BatchalignPipeline
|
11
11
|
from batchalign.models import resolve
|
12
12
|
|
@@ -127,7 +127,8 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
|
|
127
127
|
engines.append(StanzaUtteranceEngine())
|
128
128
|
elif engine == "stanza_coref":
|
129
129
|
engines.append(CorefEngine())
|
130
|
-
|
130
|
+
elif engine == "wav2vec_fa":
|
131
|
+
engines.append(Wave2VecFAEngine())
|
131
132
|
|
132
133
|
L.debug(f"Done initalizing packages.")
|
133
134
|
return BatchalignPipeline(*engines)
|
@@ -0,0 +1,162 @@
|
|
1
|
+
from batchalign.models import Wave2VecFAModel
|
2
|
+
from batchalign.document import *
|
3
|
+
from batchalign.pipelines.base import *
|
4
|
+
from batchalign.utils import *
|
5
|
+
from batchalign.utils.dp import *
|
6
|
+
from batchalign.constants import *
|
7
|
+
|
8
|
+
import logging
|
9
|
+
L = logging.getLogger("batchalign")
|
10
|
+
|
11
|
+
import re
|
12
|
+
|
13
|
+
import pycountry
|
14
|
+
import warnings
|
15
|
+
|
16
|
+
class Wave2VecFAEngine(BatchalignEngine):
|
17
|
+
tasks = [ Task.FORCED_ALIGNMENT ]
|
18
|
+
|
19
|
+
def _hook_status(self, status_hook):
|
20
|
+
self.status_hook = status_hook
|
21
|
+
|
22
|
+
def __init__(self):
|
23
|
+
self.status_hook = None
|
24
|
+
self.__wav2vec = Wave2VecFAModel()
|
25
|
+
|
26
|
+
def process(self, doc:Document, **kwargs):
|
27
|
+
# check that the document has a media path to align to
|
28
|
+
assert doc.media != None and doc.media.url != None, f"We cannot forced-align something that doesn't have a media path! Provided media tier='{doc.media}'"
|
29
|
+
|
30
|
+
if doc.langs[0] != "eng":
|
31
|
+
warnings.warn("Looks like you are not aligning English with wav2vec; this works for a lot of Roman languages, but outside of that your milage may vary.")
|
32
|
+
|
33
|
+
# load the audio file
|
34
|
+
L.debug(f"Wave2Vec FA is loading url {doc.media.url}...")
|
35
|
+
f = self.__wav2vec.load(doc.media.url)
|
36
|
+
L.debug(f"Wav2Vec FA finished loading media.")
|
37
|
+
|
38
|
+
# collect utterances 30 secondish segments to be aligned for whisper
|
39
|
+
# we have to do this because whisper does poorly with very short segments
|
40
|
+
groups = []
|
41
|
+
group = []
|
42
|
+
seg_start = 0
|
43
|
+
|
44
|
+
L.debug(f"Wav2Vec FA finished loading media.")
|
45
|
+
|
46
|
+
for i in doc.content:
|
47
|
+
if not isinstance(i, Utterance):
|
48
|
+
continue
|
49
|
+
if i.alignment == None:
|
50
|
+
warnings.warn("We found at least one utterance without utterance-level alignment; this is usually not an issue, but if the entire transcript is unaligned, it means that utterance level timing recovery (which is fuzzy using ASR) failed due to the audio clarity. On this transcript, before running forced-alignment, please supply utterance-level links.")
|
51
|
+
continue
|
52
|
+
|
53
|
+
# pop the previous group onto the stack
|
54
|
+
if (i.alignment[-1] - seg_start) > 20*1000:
|
55
|
+
groups.append(group)
|
56
|
+
group = []
|
57
|
+
seg_start = i.alignment[0]
|
58
|
+
|
59
|
+
# append the contents to the running group
|
60
|
+
for word in i.content:
|
61
|
+
group.append((word, i.alignment))
|
62
|
+
|
63
|
+
groups.append(group)
|
64
|
+
|
65
|
+
L.debug(f"Begin Wav2Vec Inference...")
|
66
|
+
|
67
|
+
for indx, grp in enumerate(groups):
|
68
|
+
L.info(f"Wave2Vec FA processing segment {indx+1}/{len(groups)}...")
|
69
|
+
if self.status_hook != None:
|
70
|
+
self.status_hook(indx+1, len(groups))
|
71
|
+
|
72
|
+
# perform alignment
|
73
|
+
# we take a 2 second buffer in each direction
|
74
|
+
try:
|
75
|
+
transcript = [word[0].text for word in grp]
|
76
|
+
# replace ANY punctuation
|
77
|
+
for p in MOR_PUNCT + ENDING_PUNCT:
|
78
|
+
transcript = [i.replace("_", " ") for i in transcript if i.strip() != p]
|
79
|
+
# if "noone's" in detokenized:
|
80
|
+
# breakpoint()
|
81
|
+
res = self.__wav2vec(audio=f.chunk(grp[0][1][0], grp[-1][1][1]), text=transcript)
|
82
|
+
except IndexError:
|
83
|
+
# utterance contains nothing
|
84
|
+
continue
|
85
|
+
|
86
|
+
# create reference backplates, which are the word ids to set the timing for
|
87
|
+
ref_targets = []
|
88
|
+
for indx, (word, _) in enumerate(grp):
|
89
|
+
for char in word.text:
|
90
|
+
ref_targets.append(ReferenceTarget(char, payload=indx))
|
91
|
+
# create target backplates for the timings
|
92
|
+
payload_targets = []
|
93
|
+
timings = []
|
94
|
+
for indx, (word, time) in enumerate(res):
|
95
|
+
timings.append(time)
|
96
|
+
for char in word:
|
97
|
+
payload_targets.append(PayloadTarget(char, payload=indx))
|
98
|
+
# alignment!
|
99
|
+
alignments = align(payload_targets, ref_targets, tqdm=False)
|
100
|
+
|
101
|
+
# set the ids back to the text ids
|
102
|
+
# we do this BACKWARDS because we went to have the first timestamp
|
103
|
+
# we get about a word first
|
104
|
+
alignments.reverse()
|
105
|
+
for indx,elem in enumerate(alignments):
|
106
|
+
if isinstance(elem, Match):
|
107
|
+
grp[elem.reference_payload][0].time = (int(round((timings[elem.payload][0] +
|
108
|
+
grp[0][1][0]))),
|
109
|
+
int(round((timings[elem.payload][1] +
|
110
|
+
grp[0][1][0]))))
|
111
|
+
|
112
|
+
L.debug(f"Correcting text...")
|
113
|
+
|
114
|
+
# we now set the end alignment of each word to the start of the next
|
115
|
+
for doc_ut, ut in enumerate(doc.content):
|
116
|
+
if not isinstance(ut, Utterance):
|
117
|
+
continue
|
118
|
+
|
119
|
+
# correct each word by bumping it forward
|
120
|
+
# and if its not a word we remove the timing
|
121
|
+
for indx, w in enumerate(ut.content):
|
122
|
+
if w.type in [TokenType.PUNCT, TokenType.FEAT, TokenType.ANNOT]:
|
123
|
+
w.time = None
|
124
|
+
elif indx == len(ut.content)-1 and w.text in ENDING_PUNCT:
|
125
|
+
w.time = None
|
126
|
+
elif indx != len(ut.content)-1:
|
127
|
+
# search forward for the next compatible time
|
128
|
+
tmp = indx+1
|
129
|
+
while tmp < len(ut.content)-1 and ut.content[tmp].time == None:
|
130
|
+
tmp += 1
|
131
|
+
if w.time == None:
|
132
|
+
continue
|
133
|
+
if ut.content[tmp].time == None:
|
134
|
+
# seek forward one utterance to find their start time
|
135
|
+
next_ut = doc_ut + 1
|
136
|
+
while next_ut < len(doc.content)-1 and (not isinstance(doc.content, Utterance) or doc.content[next_ut].alignment == None):
|
137
|
+
next_ut += 1
|
138
|
+
if next_ut < len(doc.content) and isinstance(doc.content, Utterance) and doc.content[next_ut].alignment:
|
139
|
+
w.time = (w.time[0], doc.content[next_ut].alignment[0])
|
140
|
+
else:
|
141
|
+
w.time = (w.time[0], w.time[0]+500) # give half a second because we don't know
|
142
|
+
|
143
|
+
# just in case, bound the time by the utterance derived timings
|
144
|
+
if ut.alignment and ut.alignment[0] != None:
|
145
|
+
w.time = (max(w.time[0], ut.alignment[0]), min(w.time[1], ut.alignment[1]))
|
146
|
+
# if we ended up with timings that don't make sense, drop it
|
147
|
+
if w.time and w.time[0] >= w.time[1]:
|
148
|
+
w.time = None
|
149
|
+
|
150
|
+
# clear any built-in timing (i.e. we should use utterance-derived timing)
|
151
|
+
ut.time = None
|
152
|
+
# correct the text
|
153
|
+
if ut.alignment and ut.text != None:
|
154
|
+
if '\x15' not in ut.text:
|
155
|
+
ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
|
156
|
+
else:
|
157
|
+
ut.text = re.sub("\x15\d+_\d+\x15",
|
158
|
+
f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
|
159
|
+
elif ut.text != None:
|
160
|
+
ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
|
161
|
+
|
162
|
+
return doc
|
@@ -45,6 +45,8 @@ batchalign/models/utterance/execute.py
|
|
45
45
|
batchalign/models/utterance/infer.py
|
46
46
|
batchalign/models/utterance/prep.py
|
47
47
|
batchalign/models/utterance/train.py
|
48
|
+
batchalign/models/wave2vec/__init__.py
|
49
|
+
batchalign/models/wave2vec/infer_fa.py
|
48
50
|
batchalign/models/whisper/__init__.py
|
49
51
|
batchalign/models/whisper/infer_asr.py
|
50
52
|
batchalign/models/whisper/infer_fa.py
|
@@ -68,6 +70,7 @@ batchalign/pipelines/cleanup/support/filled_pauses.eng
|
|
68
70
|
batchalign/pipelines/cleanup/support/replacements.eng
|
69
71
|
batchalign/pipelines/cleanup/support/test.test
|
70
72
|
batchalign/pipelines/fa/__init__.py
|
73
|
+
batchalign/pipelines/fa/wave2vec_fa.py
|
71
74
|
batchalign/pipelines/fa/whisper_fa.py
|
72
75
|
batchalign/pipelines/morphosyntax/__init__.py
|
73
76
|
batchalign/pipelines/morphosyntax/coref.py
|
@@ -1 +0,0 @@
|
|
1
|
-
from .whisper_fa import WhisperFAEngine
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/ja/verbforms.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|