batchalign 0.7.11b3__tar.gz → 0.7.11b4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. {batchalign-0.7.11b3/batchalign.egg-info → batchalign-0.7.11b4}/PKG-INFO +1 -1
  2. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/cli/cli.py +19 -7
  3. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/__init__.py +1 -1
  4. batchalign-0.7.11b4/batchalign/models/wave2vec/__init__.py +1 -0
  5. batchalign-0.7.11b4/batchalign/models/wave2vec/infer_fa.py +135 -0
  6. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/__init__.py +1 -1
  7. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/dispatch.py +3 -2
  8. batchalign-0.7.11b4/batchalign/pipelines/fa/__init__.py +2 -0
  9. batchalign-0.7.11b4/batchalign/pipelines/fa/wave2vec_fa.py +162 -0
  10. batchalign-0.7.11b4/batchalign/version +3 -0
  11. {batchalign-0.7.11b3 → batchalign-0.7.11b4/batchalign.egg-info}/PKG-INFO +1 -1
  12. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/SOURCES.txt +3 -0
  13. batchalign-0.7.11b3/batchalign/pipelines/fa/__init__.py +0 -1
  14. batchalign-0.7.11b3/batchalign/version +0 -3
  15. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/LICENSE +0 -0
  16. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/MANIFEST.in +0 -0
  17. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/README.md +0 -0
  18. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/__init__.py +0 -0
  19. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/__main__.py +0 -0
  20. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/cli/__init__.py +0 -0
  21. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/cli/dispatch.py +0 -0
  22. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/constants.py +0 -0
  23. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/document.py +0 -0
  24. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/errors.py +0 -0
  25. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/__init__.py +0 -0
  26. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/base.py +0 -0
  27. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/__init__.py +0 -0
  28. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/file.py +0 -0
  29. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/generator.py +0 -0
  30. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/lexer.py +0 -0
  31. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/parser.py +0 -0
  32. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/chat/utils.py +0 -0
  33. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/__init__.py +0 -0
  34. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/file.py +0 -0
  35. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/generator.py +0 -0
  36. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/formats/textgrid/parser.py +0 -0
  37. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/resolve.py +0 -0
  38. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/__init__.py +0 -0
  39. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/config.yaml +0 -0
  40. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/infer.py +0 -0
  41. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/speaker/utils.py +0 -0
  42. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/training/__init__.py +0 -0
  43. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/training/run.py +0 -0
  44. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/training/utils.py +0 -0
  45. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utils.py +0 -0
  46. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/__init__.py +0 -0
  47. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/dataset.py +0 -0
  48. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/execute.py +0 -0
  49. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/infer.py +0 -0
  50. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/prep.py +0 -0
  51. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/utterance/train.py +0 -0
  52. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/whisper/__init__.py +0 -0
  53. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/whisper/infer_asr.py +0 -0
  54. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/models/whisper/infer_fa.py +0 -0
  55. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/analysis/__init__.py +0 -0
  56. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/analysis/eval.py +0 -0
  57. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/__init__.py +0 -0
  58. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/rev.py +0 -0
  59. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/utils.py +0 -0
  60. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/whisper.py +0 -0
  61. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/asr/whisperx.py +0 -0
  62. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/base.py +0 -0
  63. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/__init__.py +0 -0
  64. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  65. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  66. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  67. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/retrace.py +0 -0
  68. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  69. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  70. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/cleanup/support/test.test +0 -0
  71. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  72. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  73. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  74. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  75. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  76. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  77. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  78. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  79. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  80. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/pipeline.py +0 -0
  81. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/speaker/__init__.py +0 -0
  82. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  83. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/__init__.py +0 -0
  84. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/rev_utr.py +0 -0
  85. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/utils.py +0 -0
  86. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  87. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utterance/__init__.py +0 -0
  88. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  89. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/__init__.py +0 -0
  90. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/conftest.py +0 -0
  91. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  92. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  93. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  94. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  95. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  96. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  97. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  98. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  99. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  100. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  101. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  102. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  103. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/fixures.py +0 -0
  104. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  105. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  106. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/tests/test_document.py +0 -0
  107. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/__init__.py +0 -0
  108. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/config.py +0 -0
  109. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/dp.py +0 -0
  110. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign/utils/utils.py +0 -0
  111. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/dependency_links.txt +0 -0
  112. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/entry_points.txt +0 -0
  113. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/requires.txt +0 -0
  114. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/batchalign.egg-info/top_level.txt +0 -0
  115. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/setup.cfg +0 -0
  116. {batchalign-0.7.11b3 → batchalign-0.7.11b4}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.11b3
3
+ Version: 0.7.11b4
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -107,10 +107,12 @@ batchalign.add_command(train, "models")
107
107
  @common_options
108
108
  @click.option("--whisper/--rev",
109
109
  default=False, help="For utterance timing recovery, OpenAI Whisper (ASR) instead of Rev.AI (default).")
110
+ @click.option("--wav2vec/--whisper_fa",
111
+ default=False, help="Use Whisper instead of Wav2Vec for English (defaults for Whisper for non-English)")
110
112
  @click.option("--pauses", type=bool, default=False, help="Should we try to bullet each word or should we try to add pauses in between words by grouping them? Default: no pauses.", is_flag=True)
111
113
 
112
114
  @click.pass_context
113
- def align(ctx, in_dir, out_dir, whisper, **kwargs):
115
+ def align(ctx, in_dir, out_dir, whisper, wav2vec, **kwargs):
114
116
  """Align transcripts against corresponding media files."""
115
117
  def loader(file):
116
118
  return (
@@ -121,12 +123,22 @@ def align(ctx, in_dir, out_dir, whisper, **kwargs):
121
123
  def writer(doc, output):
122
124
  CHATFile(doc=doc).write(output)
123
125
 
124
- _dispatch("align", "eng", 1,
125
- ["cha"], ctx,
126
- in_dir, out_dir,
127
- loader, writer, C,
128
- utr="whisper_utr" if whisper else "rev_utr",
129
- **kwargs)
126
+ if not wav2vec:
127
+ _dispatch("align", "eng", 1,
128
+ ["cha"], ctx,
129
+ in_dir, out_dir,
130
+ loader, writer, C,
131
+ fa="whisper_fa",
132
+ utr="whisper_utr" if whisper else "rev_utr",
133
+ **kwargs)
134
+ else:
135
+ _dispatch("align", "eng", 1,
136
+ ["cha"], ctx,
137
+ in_dir, out_dir,
138
+ loader, writer, C,
139
+ fa="wav2vec_fa",
140
+ utr="whisper_utr" if whisper else "rev_utr",
141
+ **kwargs)
130
142
 
131
143
  #################### TRANSCRIBE ################################
132
144
 
@@ -3,4 +3,4 @@ from .whisper import WhisperASRModel, WhisperFAModel
3
3
  from .speaker import NemoSpeakerModel
4
4
  from .utils import ASRAudioFile
5
5
  from .resolve import resolve
6
-
6
+ from .wave2vec import Wave2VecFAModel
@@ -0,0 +1 @@
1
+ from .infer_fa import Wave2VecFAModel
@@ -0,0 +1,135 @@
1
+ from transformers import WhisperProcessor, WhisperTokenizer, WhisperForConditionalGeneration
2
+
3
+ import torch
4
+ from torchaudio import load
5
+ from torchaudio import transforms as T
6
+ from batchalign.models.utils import ASRAudioFile
7
+
8
+ import torchaudio
9
+ bundle = torchaudio.pipelines.MMS_FA
10
+ import torchaudio.functional as AF
11
+
12
+ import numpy as np
13
+
14
+ import logging
15
+ L = logging.getLogger("batchalign")
16
+
17
+ # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
18
+ DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
19
+ TIME_PRECISION = 0.02
20
+
21
+ # inference engine
22
+ class Wave2VecFAModel(object):
23
+ """A forced alignment engine built on torchaudio's MMS_FA (Wav2Vec) bundle
24
+
25
+ Parameters
26
+ ----------
27
+ model : str
28
+ The model path to load from.
29
+ target_sample_rate : optional, int
30
+ The sample rate to resample audio to. Defaults to 16000.
31
+
32
+ Example
33
+ -------
34
+ >>> engine = Wave2VecFAModel()
35
+ >>> file = engine.load("./data/myfile.wav")
36
+ >>> timestamps = engine(audio=file.chunk(0, 1500), text="this is my transcript") # FA
37
+ """
38
+
39
+ def __init__(self, target_sample_rate=16000):
40
+ L.debug("Initializing Wave2vec FA model")
41
+ self.model = bundle.get_model().to(DEVICE)
42
+ L.debug("Wave2Vec FA initialization done.")
43
+
44
+ # save the target sample rate
45
+ self.sample_rate = target_sample_rate
46
+
47
+ def load(self, f):
48
+ """Load an audio file for processing.
49
+
50
+ Parameters
51
+ ----------
52
+ f : str
53
+ The audio .wav file name to process.
54
+ num_speakers : int
55
+ The number of speakers
56
+
57
+ Returns
58
+ -------
59
+ ASRAudioFile
59
+ The loaded (and, if needed, resampled) audio wrapped in an ASRAudioFile.
61
+ """
62
+
63
+ # function: load and resample audio
64
+ audio_arr, rate = load(f)
65
+
66
+ # resample if needed
67
+ if rate != self.sample_rate:
68
+ audio_arr = T.Resample(rate, self.sample_rate)(audio_arr)
69
+
70
+ # transpose and mean
71
+ resampled = torch.mean(audio_arr.transpose(0,1), dim=1)
72
+
73
+ # and return the audio file
74
+ return ASRAudioFile(f, resampled, self.sample_rate)
75
+
76
+ def __call__(self, audio, text):
77
+ """Run forced alignment on the audio file.
78
+
79
+ Arguments
80
+ ----------
81
+ audio : tensor
82
+ The audio file to process.
83
+ text : str
84
+ The transcript to align to.
85
+
86
+ Returns
87
+ -------
88
+ List[Tuple[str, Tuple[int, int]]]
89
+ A list of (word, (start_ms, end_ms)) pairs, timed relative to the start of the supplied audio
90
+ """
91
+
92
+ L.debug("Running Wav2Vec word-level forced alignment...")
93
+
94
+ # complete the call function, don't write anything else
95
+ L.debug("Running Wav2Vec word-level forced alignment...")
96
+
97
+ # Move audio to the inference device
98
+ audio = audio.to(DEVICE)
99
+
100
+ # Get emission matrix from model
101
+ emission, _ = self.model(audio.unsqueeze(0))
102
+ emission = emission.cpu().detach()
103
+
104
+ # Get tokens and transcript
105
+ dictionary = bundle.get_dict()
106
+
107
+ # Convert text to tokens
108
+ transcript = torch.tensor([dictionary.get(c, dictionary["*"])
109
+ for word in text
110
+ for c in word.lower()])
111
+
112
+ # Run forced alignment
113
+ path, scores = AF.forced_align(emission, transcript.unsqueeze(0))
114
+ alignments, scores = path[0], scores[0]
115
+ scores = scores.exp()
116
+
117
+ # Merge repeated tokens and remove blanks
118
+ path = AF.merge_tokens(alignments, scores)
119
+
120
+ def unflatten(list_, lengths):
121
+ assert len(list_) == sum(lengths)
122
+ i = 0
123
+ ret = []
124
+ for l in lengths:
125
+ ret.append(list_[i : i + l])
126
+ i += l
127
+ return ret
128
+
129
+ # Group the character-level spans by word to get word-level alignments
130
+ word_spans = unflatten(path, [len(word) for word in text])
131
+ ratio = audio.size(0)/emission.size(1)
132
+ word_spans = [(int(((spans[0].start*ratio)/self.sample_rate)*1000),
133
+ int(((spans[-1].end*ratio)/self.sample_rate)*1000)) for spans in word_spans]
134
+
135
+ return list(zip(text, word_spans))
@@ -6,7 +6,7 @@ from .morphosyntax import StanzaEngine, CorefEngine
6
6
  from .cleanup import NgramRetraceEngine, DisfluencyReplacementEngine
7
7
  from .speaker import NemoSpeakerEngine
8
8
 
9
- from .fa import WhisperFAEngine
9
+ from .fa import WhisperFAEngine, Wave2VecFAEngine
10
10
  from .utr import WhisperUTREngine, RevUTREngine
11
11
 
12
12
  from .analysis import EvaluationEngine
@@ -6,7 +6,7 @@ Tabulate default packages and options.
6
6
  from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
7
7
  NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
8
8
  RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
9
- StanzaUtteranceEngine, CorefEngine)
9
+ StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine)
10
10
  from batchalign import BatchalignPipeline
11
11
  from batchalign.models import resolve
12
12
 
@@ -127,7 +127,8 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
127
127
  engines.append(StanzaUtteranceEngine())
128
128
  elif engine == "stanza_coref":
129
129
  engines.append(CorefEngine())
130
-
130
+ elif engine == "wav2vec_fa":
131
+ engines.append(Wave2VecFAEngine())
131
132
 
132
133
  L.debug(f"Done initalizing packages.")
133
134
  return BatchalignPipeline(*engines)
@@ -0,0 +1,2 @@
1
+ from .whisper_fa import WhisperFAEngine
2
+ from .wave2vec_fa import Wave2VecFAEngine
@@ -0,0 +1,162 @@
1
+ from batchalign.models import Wave2VecFAModel
2
+ from batchalign.document import *
3
+ from batchalign.pipelines.base import *
4
+ from batchalign.utils import *
5
+ from batchalign.utils.dp import *
6
+ from batchalign.constants import *
7
+
8
+ import logging
9
+ L = logging.getLogger("batchalign")
10
+
11
+ import re
12
+
13
+ import pycountry
14
+ import warnings
15
+
16
+ class Wave2VecFAEngine(BatchalignEngine):
17
+ tasks = [ Task.FORCED_ALIGNMENT ]
18
+
19
+ def _hook_status(self, status_hook):
20
+ self.status_hook = status_hook
21
+
22
+ def __init__(self):
23
+ self.status_hook = None
24
+ self.__wav2vec = Wave2VecFAModel()
25
+
26
+ def process(self, doc:Document, **kwargs):
27
+ # check that the document has a media path to align to
28
+ assert doc.media != None and doc.media.url != None, f"We cannot forced-align something that doesn't have a media path! Provided media tier='{doc.media}'"
29
+
30
+ if doc.langs[0] != "eng":
31
+ warnings.warn("Looks like you are not aligning English with wav2vec; this works for a lot of Roman languages, but outside of that your milage may vary.")
32
+
33
+ # load the audio file
34
+ L.debug(f"Wave2Vec FA is loading url {doc.media.url}...")
35
+ f = self.__wav2vec.load(doc.media.url)
36
+ L.debug(f"Wav2Vec FA finished loading media.")
37
+
38
+ # collect utterances into roughly 20-second groups (see the 20*1000 threshold below) to be aligned
39
+ # we have to do this because whisper does poorly with very short segments
40
+ groups = []
41
+ group = []
42
+ seg_start = 0
43
+
44
+ L.debug(f"Wav2Vec FA finished loading media.")
45
+
46
+ for i in doc.content:
47
+ if not isinstance(i, Utterance):
48
+ continue
49
+ if i.alignment == None:
50
+ warnings.warn("We found at least one utterance without utterance-level alignment; this is usually not an issue, but if the entire transcript is unaligned, it means that utterance level timing recovery (which is fuzzy using ASR) failed due to the audio clarity. On this transcript, before running forced-alignment, please supply utterance-level links.")
51
+ continue
52
+
53
+ # pop the previous group onto the stack
54
+ if (i.alignment[-1] - seg_start) > 20*1000:
55
+ groups.append(group)
56
+ group = []
57
+ seg_start = i.alignment[0]
58
+
59
+ # append the contents to the running group
60
+ for word in i.content:
61
+ group.append((word, i.alignment))
62
+
63
+ groups.append(group)
64
+
65
+ L.debug(f"Begin Wav2Vec Inference...")
66
+
67
+ for indx, grp in enumerate(groups):
68
+ L.info(f"Wave2Vec FA processing segment {indx+1}/{len(groups)}...")
69
+ if self.status_hook != None:
70
+ self.status_hook(indx+1, len(groups))
71
+
72
+ # perform alignment
73
+ # the chunk runs from the first word's utterance start to the last word's utterance end (no extra buffer is added)
74
+ try:
75
+ transcript = [word[0].text for word in grp]
76
+ # replace ANY punctuation
77
+ for p in MOR_PUNCT + ENDING_PUNCT:
78
+ transcript = [i.replace("_", " ") for i in transcript if i.strip() != p]
79
+ # if "noone's" in detokenized:
80
+ # breakpoint()
81
+ res = self.__wav2vec(audio=f.chunk(grp[0][1][0], grp[-1][1][1]), text=transcript)
82
+ except IndexError:
83
+ # utterance contains nothing
84
+ continue
85
+
86
+ # create reference backplates, which are the word ids to set the timing for
87
+ ref_targets = []
88
+ for indx, (word, _) in enumerate(grp):
89
+ for char in word.text:
90
+ ref_targets.append(ReferenceTarget(char, payload=indx))
91
+ # create target backplates for the timings
92
+ payload_targets = []
93
+ timings = []
94
+ for indx, (word, time) in enumerate(res):
95
+ timings.append(time)
96
+ for char in word:
97
+ payload_targets.append(PayloadTarget(char, payload=indx))
98
+ # alignment!
99
+ alignments = align(payload_targets, ref_targets, tqdm=False)
100
+
101
+ # set the ids back to the text ids
102
+ # we do this BACKWARDS because we want to have the first timestamp
103
+ # we get about a word first
104
+ alignments.reverse()
105
+ for indx,elem in enumerate(alignments):
106
+ if isinstance(elem, Match):
107
+ grp[elem.reference_payload][0].time = (int(round((timings[elem.payload][0] +
108
+ grp[0][1][0]))),
109
+ int(round((timings[elem.payload][1] +
110
+ grp[0][1][0]))))
111
+
112
+ L.debug(f"Correcting text...")
113
+
114
+ # we now set the end alignment of each word to the start of the next
115
+ for doc_ut, ut in enumerate(doc.content):
116
+ if not isinstance(ut, Utterance):
117
+ continue
118
+
119
+ # correct each word by bumping it forward
120
+ # and if its not a word we remove the timing
121
+ for indx, w in enumerate(ut.content):
122
+ if w.type in [TokenType.PUNCT, TokenType.FEAT, TokenType.ANNOT]:
123
+ w.time = None
124
+ elif indx == len(ut.content)-1 and w.text in ENDING_PUNCT:
125
+ w.time = None
126
+ elif indx != len(ut.content)-1:
127
+ # search forward for the next compatible time
128
+ tmp = indx+1
129
+ while tmp < len(ut.content)-1 and ut.content[tmp].time == None:
130
+ tmp += 1
131
+ if w.time == None:
132
+ continue
133
+ if ut.content[tmp].time == None:
134
+ # seek forward one utterance to find their start time
135
+ next_ut = doc_ut + 1
136
+ while next_ut < len(doc.content)-1 and (not isinstance(doc.content, Utterance) or doc.content[next_ut].alignment == None):
137
+ next_ut += 1
138
+ if next_ut < len(doc.content) and isinstance(doc.content, Utterance) and doc.content[next_ut].alignment:
139
+ w.time = (w.time[0], doc.content[next_ut].alignment[0])
140
+ else:
141
+ w.time = (w.time[0], w.time[0]+500) # give half a second because we don't know
142
+
143
+ # just in case, bound the time by the utterance derived timings
144
+ if ut.alignment and ut.alignment[0] != None:
145
+ w.time = (max(w.time[0], ut.alignment[0]), min(w.time[1], ut.alignment[1]))
146
+ # if we ended up with timings that don't make sense, drop it
147
+ if w.time and w.time[0] >= w.time[1]:
148
+ w.time = None
149
+
150
+ # clear any built-in timing (i.e. we should use utterance-derived timing)
151
+ ut.time = None
152
+ # correct the text
153
+ if ut.alignment and ut.text != None:
154
+ if '\x15' not in ut.text:
155
+ ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
156
+ else:
157
+ ut.text = re.sub("\x15\d+_\d+\x15",
158
+ f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
159
+ elif ut.text != None:
160
+ ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
161
+
162
+ return doc
@@ -0,0 +1,3 @@
1
+ 0.7.11-beta.4
2
+ February 6th, 2025
3
+ Wav2vec support!
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.11b3
3
+ Version: 0.7.11b4
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -45,6 +45,8 @@ batchalign/models/utterance/execute.py
45
45
  batchalign/models/utterance/infer.py
46
46
  batchalign/models/utterance/prep.py
47
47
  batchalign/models/utterance/train.py
48
+ batchalign/models/wave2vec/__init__.py
49
+ batchalign/models/wave2vec/infer_fa.py
48
50
  batchalign/models/whisper/__init__.py
49
51
  batchalign/models/whisper/infer_asr.py
50
52
  batchalign/models/whisper/infer_fa.py
@@ -68,6 +70,7 @@ batchalign/pipelines/cleanup/support/filled_pauses.eng
68
70
  batchalign/pipelines/cleanup/support/replacements.eng
69
71
  batchalign/pipelines/cleanup/support/test.test
70
72
  batchalign/pipelines/fa/__init__.py
73
+ batchalign/pipelines/fa/wave2vec_fa.py
71
74
  batchalign/pipelines/fa/whisper_fa.py
72
75
  batchalign/pipelines/morphosyntax/__init__.py
73
76
  batchalign/pipelines/morphosyntax/coref.py
@@ -1 +0,0 @@
1
- from .whisper_fa import WhisperFAEngine
@@ -1,3 +0,0 @@
1
- 0.7.11-beta.3
2
- Feburary 2nd, 2025
3
- Incorporate additional pauses
File without changes
File without changes
File without changes
File without changes
File without changes