batchalign 0.7.7a4__tar.gz → 0.7.7a5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.7a4/batchalign.egg-info → batchalign-0.7.7a5}/PKG-INFO +5 -1
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/README.md +4 -0
- batchalign-0.7.7a5/batchalign/models/utils.py +199 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/ud.py +9 -4
- batchalign-0.7.7a5/batchalign/version +3 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5/batchalign.egg-info}/PKG-INFO +5 -1
- batchalign-0.7.7a4/batchalign/models/utils.py +0 -86
- batchalign-0.7.7a4/batchalign/version +0 -3
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/LICENSE +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/MANIFEST.in +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/__main__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/constants.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/document.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/errors.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/setup.cfg +0 -0
- {batchalign-0.7.7a4 → batchalign-0.7.7a5}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.7a4
|
3
|
+
Version: 0.7.7a5
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -59,6 +59,8 @@ The following instructions provide a quick start to installing Batchalign. For m
|
|
59
59
|
1. Install Python 3.11: [via this link](https://www.python.org/ftp/python/3.11.7/python-3.11.7-amd64.exe)
|
60
60
|
2. If later commands report `pip module not found`, [this page may help](https://github.com/TalkBank/batchalign2/wiki/Troubleshooting-Tips#get-pip-on-windows)
|
61
61
|
- your distribution's instructions for Linux
|
62
|
+
|
63
|
+
For first-time users of Python: if you did not install Python 3.11 (as recommended above), changing Python versions later can be complicated and may cause additional problems. We recommend installing Python 3.11 explicitly by specifying the version number, as shown above.
|
62
64
|
|
63
65
|
### Install and Update the Package
|
64
66
|
You can get Batchalign from PyPi, and you can update the package in the same way:
|
@@ -75,6 +77,8 @@ Windows:
|
|
75
77
|
py -m pip install -U batchalign
|
76
78
|
```
|
77
79
|
|
80
|
+
Note that if your system reports `pip: command not found`, replace every use of `pip` in the instructions with `pip3` and try again.
|
81
|
+
|
78
82
|
### Rock and Roll
|
79
83
|
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
80
84
|
|
@@ -21,6 +21,8 @@ The following instructions provide a quick start to installing Batchalign. For m
|
|
21
21
|
1. Install Python 3.11: [via this link](https://www.python.org/ftp/python/3.11.7/python-3.11.7-amd64.exe)
|
22
22
|
2. If later commands report `pip module not found`, [this page may help](https://github.com/TalkBank/batchalign2/wiki/Troubleshooting-Tips#get-pip-on-windows)
|
23
23
|
- your distribution's instructions for Linux
|
24
|
+
|
25
|
+
For first-time users of Python: if you did not install Python 3.11 (as recommended above), changing Python versions later can be complicated and may cause additional problems. We recommend installing Python 3.11 explicitly by specifying the version number, as shown above.
|
24
26
|
|
25
27
|
### Install and Update the Package
|
26
28
|
You can get Batchalign from PyPi, and you can update the package in the same way:
|
@@ -37,6 +39,8 @@ Windows:
|
|
37
39
|
py -m pip install -U batchalign
|
38
40
|
```
|
39
41
|
|
42
|
+
Note that if your system reports `pip: command not found`, replace every use of `pip` in the instructions with `pip3` and try again.
|
43
|
+
|
40
44
|
### Rock and Roll
|
41
45
|
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
42
46
|
|
@@ -0,0 +1,199 @@
|
|
1
|
+
import torch
|
2
|
+
from transformers.models.whisper.generation_whisper import _dynamic_time_warping as _dynamic_time_warping
|
3
|
+
from transformers.models.whisper.generation_whisper import _median_filter as _median_filter
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
def _extract_token_timestamps(
|
9
|
+
self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None, num_input_ids=None
|
10
|
+
):
|
11
|
+
"""
|
12
|
+
Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
|
13
|
+
map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
|
14
|
+
cross-attentions will be cropped before applying DTW.
|
15
|
+
|
16
|
+
Returns:
|
17
|
+
tensor containing the timestamps in seconds for each predicted token
|
18
|
+
"""
|
19
|
+
# Create a list with `decoder_layers` elements, each a tensor of shape
|
20
|
+
# (batch size, attention_heads, output length, input length).
|
21
|
+
cross_attentions = []
|
22
|
+
for i in range(self.config.decoder_layers):
|
23
|
+
cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
|
24
|
+
|
25
|
+
# Select specific cross-attention layers and heads. This is a tensor
|
26
|
+
# of shape (batch size, num selected, output length, input length).
|
27
|
+
weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
|
28
|
+
weights = weights.permute([1, 0, 2, 3])
|
29
|
+
|
30
|
+
weight_length = None
|
31
|
+
|
32
|
+
if "beam_indices" in generate_outputs:
|
33
|
+
# If beam search has been used, the output sequences may have been generated for more timesteps than their sequence_lengths
|
34
|
+
# since the beam search strategy chooses the most probable sequences at the end of the search.
|
35
|
+
# In that case, the cross_attentions weights are too long and we have to make sure that they have the right output_length
|
36
|
+
weight_length = (generate_outputs.beam_indices != -1).sum(-1).max()
|
37
|
+
weight_length = weight_length if num_input_ids is None else weight_length + num_input_ids
|
38
|
+
|
39
|
+
# beam search takes `decoder_input_ids` into account in the `beam_indices` length
|
40
|
+
# but forgot to shift the beam_indices by the number of `decoder_input_ids`
|
41
|
+
beam_indices = torch.zeros_like(generate_outputs.beam_indices[:, :weight_length], dtype=torch.float32)
|
42
|
+
# we actually shift the beam indices here
|
43
|
+
beam_indices[:, num_input_ids:] = generate_outputs.beam_indices[:, : weight_length - num_input_ids]
|
44
|
+
|
45
|
+
weights = weights[:, :, :weight_length]
|
46
|
+
|
47
|
+
# If beam index is still -1, it means that the associated token id is EOS
|
48
|
+
# We need to replace the index with 0 since index_select gives an error if any of the indexes is -1.
|
49
|
+
beam_indices = beam_indices.masked_fill(beam_indices == -1, 0)
|
50
|
+
|
51
|
+
# Select the cross attention from the right beam for each output sequences
|
52
|
+
weights = torch.stack(
|
53
|
+
[
|
54
|
+
torch.index_select(weights[:, :, i, :], dim=0, index=beam_indices[:, i])
|
55
|
+
for i in range(beam_indices.shape[1])
|
56
|
+
],
|
57
|
+
dim=2,
|
58
|
+
)
|
59
|
+
|
60
|
+
# make sure timestamps are as long as weights
|
61
|
+
input_length = weight_length or cross_attentions[0].shape[2]
|
62
|
+
batch_size = generate_outputs.sequences.shape[0]
|
63
|
+
timestamps = torch.zeros(
|
64
|
+
(batch_size, input_length + 1), dtype=torch.float32, device=generate_outputs.sequences.device
|
65
|
+
)
|
66
|
+
|
67
|
+
if num_frames is not None:
|
68
|
+
# two cases:
|
69
|
+
# 1. num_frames is the same for each sample -> compute the DTW matrix for each sample in parallel
|
70
|
+
# 2. num_frames is different, compute the DTW matrix for each sample sequentially
|
71
|
+
|
72
|
+
# we're using np.unique because num_frames can be int/list/tuple
|
73
|
+
if isinstance(num_frames, int):
|
74
|
+
weights = weights[..., : num_frames // 2]
|
75
|
+
|
76
|
+
elif isinstance(num_frames, (list, tuple, np.ndarray)) and len(np.unique(num_frames)) == 1:
|
77
|
+
weights = weights[..., : num_frames[0] // 2]
|
78
|
+
|
79
|
+
elif isinstance(num_frames, (torch.Tensor)) and len(torch.unique(num_frames)) == 1:
|
80
|
+
weights = weights[..., : num_frames[0] // 2]
|
81
|
+
|
82
|
+
else:
|
83
|
+
# num_frames is of shape (batch_size,) whereas batch_size is truly batch_size*num_return_sequences
|
84
|
+
repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames)
|
85
|
+
num_frames = num_frames.cpu() if isinstance(num_frames, (torch.Tensor)) else num_frames
|
86
|
+
num_frames = np.repeat(num_frames, repeat_time)
|
87
|
+
|
88
|
+
if num_frames is None or isinstance(num_frames, int):
|
89
|
+
# Normalize and smoothen the weights.
|
90
|
+
std = torch.std(weights, dim=-2, keepdim=True, unbiased=False)
|
91
|
+
mean = torch.mean(weights, dim=-2, keepdim=True)
|
92
|
+
weights = (weights - mean) / std
|
93
|
+
weights = _median_filter(weights, self.config.median_filter_width)
|
94
|
+
|
95
|
+
# Average the different cross-attention heads.
|
96
|
+
weights = weights.mean(dim=1)
|
97
|
+
|
98
|
+
# Perform dynamic time warping on each element of the batch.
|
99
|
+
for batch_idx in range(batch_size):
|
100
|
+
if num_frames is not None and isinstance(num_frames, (tuple, list, np.ndarray, torch.Tensor)):
|
101
|
+
matrix = weights[batch_idx, ..., : num_frames[batch_idx] // 2]
|
102
|
+
|
103
|
+
# Normalize and smoothen the weights.
|
104
|
+
std = torch.std(matrix, dim=-2, keepdim=True, unbiased=False)
|
105
|
+
mean = torch.mean(matrix, dim=-2, keepdim=True)
|
106
|
+
matrix = (matrix - mean) / std
|
107
|
+
matrix = _median_filter(matrix, self.config.median_filter_width)
|
108
|
+
|
109
|
+
# Average the different cross-attention heads.
|
110
|
+
matrix = matrix.mean(dim=0)
|
111
|
+
else:
|
112
|
+
matrix = weights[batch_idx]
|
113
|
+
|
114
|
+
text_indices, time_indices = _dynamic_time_warping(-matrix.cpu().double().numpy())
|
115
|
+
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
|
116
|
+
jump_times = time_indices[jumps] * time_precision
|
117
|
+
timestamps[batch_idx, 1:] = torch.tensor(jump_times)
|
118
|
+
|
119
|
+
return timestamps
|
120
|
+
|
121
|
+
# def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None):
|
122
|
+
# """
|
123
|
+
# Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
|
124
|
+
# map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
|
125
|
+
# cross-attentions will be cropped before applying DTW.
|
126
|
+
|
127
|
+
# Returns:
|
128
|
+
# tensor containing the timestamps in seconds for each predicted token
|
129
|
+
# """
|
130
|
+
# # Create a list with `decoder_layers` elements, each a tensor of shape
|
131
|
+
# # (batch size, attention_heads, output length, input length).
|
132
|
+
# cross_attentions = []
|
133
|
+
# for i in range(self.config.decoder_layers):
|
134
|
+
# cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
|
135
|
+
|
136
|
+
# # Select specific cross-attention layers and heads. This is a tensor
|
137
|
+
# # of shape (batch size, num selected, output length, input length).
|
138
|
+
# weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
|
139
|
+
# weights = weights.permute([1, 0, 2, 3])
|
140
|
+
# if num_frames is not None:
|
141
|
+
# weights = weights[..., : num_frames // 2]
|
142
|
+
|
143
|
+
# # Normalize and smoothen the weights.
|
144
|
+
# std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
|
145
|
+
# weights = (weights - mean) / std
|
146
|
+
# weights = _median_filter(weights, self.config.median_filter_width)
|
147
|
+
|
148
|
+
# # Average the different cross-attention heads.
|
149
|
+
# matrix = weights.mean(dim=1)
|
150
|
+
|
151
|
+
# timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)
|
152
|
+
|
153
|
+
# # Perform dynamic time warping on each element of the batch.
|
154
|
+
# for batch_idx in range(timestamps.shape[0]):
|
155
|
+
# text_indices, time_indices = _dynamic_time_warping(-matrix[batch_idx].float().cpu().numpy())
|
156
|
+
# jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
|
157
|
+
# jump_times = time_indices[jumps] * time_precision
|
158
|
+
# timestamps[batch_idx, 1:] = torch.tensor(jump_times)
|
159
|
+
|
160
|
+
# return timestamps
|
161
|
+
|
162
|
+
|
163
|
+
@dataclass
|
164
|
+
class ASRAudioFile:
|
165
|
+
file : str
|
166
|
+
tensor : torch.Tensor
|
167
|
+
rate : int
|
168
|
+
|
169
|
+
def chunk(self,begin_ms, end_ms):
|
170
|
+
"""Get a chunk of the audio.
|
171
|
+
|
172
|
+
Parameters
|
173
|
+
----------
|
174
|
+
begin_ms : int
|
175
|
+
Milliseconds of the start of the slice.
|
176
|
+
end_ms : int
|
177
|
+
Milliseconds of the end of the slice.
|
178
|
+
|
179
|
+
Returns
|
180
|
+
-------
|
181
|
+
torch.Tensor
|
182
|
+
The returned chunk to supply to the ASR engine.
|
183
|
+
"""
|
184
|
+
|
185
|
+
data = self.tensor[int(round((begin_ms/1000)*self.rate)):
|
186
|
+
int(round((end_ms/1000)*self.rate))]
|
187
|
+
|
188
|
+
return data
|
189
|
+
|
190
|
+
def all(self):
|
191
|
+
"""Get the audio in its entirety
|
192
|
+
|
193
|
+
Notes
|
194
|
+
-----
|
195
|
+
like `chunk()` but all of the audio
|
196
|
+
"""
|
197
|
+
|
198
|
+
return self.tensor
|
199
|
+
|
@@ -18,6 +18,7 @@ from stanza import DownloadMethod
|
|
18
18
|
from torch import heaviside
|
19
19
|
|
20
20
|
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
|
21
|
+
from stanza.resources.common import download_resources_json, load_resources_json, get_language_resources
|
21
22
|
|
22
23
|
# the loading bar
|
23
24
|
from tqdm import tqdm
|
@@ -739,13 +740,17 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
739
740
|
else:
|
740
741
|
config["tokenize_postprocessor"] = lambda x:adlist_processor(x)
|
741
742
|
|
743
|
+
download_resources_json()
|
744
|
+
resources = load_resources_json()
|
745
|
+
mwt_exclusion = ["hr", "zh", "zh-hans", "zh-hant", "ja", "ko",
|
746
|
+
"sl", "sr", "bg", "ru", "et", "hu",
|
747
|
+
"eu", "el", "he", "af", "ga", "da", "ro"]
|
748
|
+
|
742
749
|
if "zh" in lang:
|
743
750
|
lang.pop(lang.index("zh"))
|
744
751
|
lang.append("zh-hans")
|
745
|
-
|
746
|
-
elif not any([i in ["hr", "zh", "zh-hans", "zh-hant", "ja", "ko",
|
747
|
-
"sl", "sr", "bg", "ru", "et", "hu",
|
748
|
-
"eu", "el", "he", "af", "ga", "da", "ro"] for i in lang]):
|
752
|
+
|
753
|
+
elif not any(i in mwt_exclusion or "mwt" not in get_language_resources(resources, i) for i in lang):
|
749
754
|
if "en" in lang:
|
750
755
|
config["processors"]["mwt"] = "gum"
|
751
756
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.7a4
|
3
|
+
Version: 0.7.7a5
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -59,6 +59,8 @@ The following instructions provide a quick start to installing Batchalign. For m
|
|
59
59
|
1. Install Python 3.11: [via this link](https://www.python.org/ftp/python/3.11.7/python-3.11.7-amd64.exe)
|
60
60
|
2. If later commands report `pip module not found`, [this page may help](https://github.com/TalkBank/batchalign2/wiki/Troubleshooting-Tips#get-pip-on-windows)
|
61
61
|
- your distribution's instructions for Linux
|
62
|
+
|
63
|
+
For first-time users of Python: if you did not install Python 3.11 (as recommended above), changing Python versions later can be complicated and may cause additional problems. We recommend installing Python 3.11 explicitly by specifying the version number, as shown above.
|
62
64
|
|
63
65
|
### Install and Update the Package
|
64
66
|
You can get Batchalign from PyPi, and you can update the package in the same way:
|
@@ -75,6 +77,8 @@ Windows:
|
|
75
77
|
py -m pip install -U batchalign
|
76
78
|
```
|
77
79
|
|
80
|
+
Note that if your system reports `pip: command not found`, replace every use of `pip` in the instructions with `pip3` and try again.
|
81
|
+
|
78
82
|
### Rock and Roll
|
79
83
|
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
80
84
|
|
@@ -1,86 +0,0 @@
|
|
1
|
-
import torch
|
2
|
-
from transformers.models.whisper.generation_whisper import _dynamic_time_warping as _dynamic_time_warping
|
3
|
-
from transformers.models.whisper.generation_whisper import _median_filter as _median_filter
|
4
|
-
|
5
|
-
from dataclasses import dataclass
|
6
|
-
import numpy as np
|
7
|
-
|
8
|
-
def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None):
|
9
|
-
"""
|
10
|
-
Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
|
11
|
-
map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
|
12
|
-
cross-attentions will be cropped before applying DTW.
|
13
|
-
|
14
|
-
Returns:
|
15
|
-
tensor containing the timestamps in seconds for each predicted token
|
16
|
-
"""
|
17
|
-
# Create a list with `decoder_layers` elements, each a tensor of shape
|
18
|
-
# (batch size, attention_heads, output length, input length).
|
19
|
-
cross_attentions = []
|
20
|
-
for i in range(self.config.decoder_layers):
|
21
|
-
cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
|
22
|
-
|
23
|
-
# Select specific cross-attention layers and heads. This is a tensor
|
24
|
-
# of shape (batch size, num selected, output length, input length).
|
25
|
-
weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
|
26
|
-
weights = weights.permute([1, 0, 2, 3])
|
27
|
-
if num_frames is not None:
|
28
|
-
weights = weights[..., : num_frames // 2]
|
29
|
-
|
30
|
-
# Normalize and smoothen the weights.
|
31
|
-
std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
|
32
|
-
weights = (weights - mean) / std
|
33
|
-
weights = _median_filter(weights, self.config.median_filter_width)
|
34
|
-
|
35
|
-
# Average the different cross-attention heads.
|
36
|
-
matrix = weights.mean(dim=1)
|
37
|
-
|
38
|
-
timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)
|
39
|
-
|
40
|
-
# Perform dynamic time warping on each element of the batch.
|
41
|
-
for batch_idx in range(timestamps.shape[0]):
|
42
|
-
text_indices, time_indices = _dynamic_time_warping(-matrix[batch_idx].float().cpu().numpy())
|
43
|
-
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
|
44
|
-
jump_times = time_indices[jumps] * time_precision
|
45
|
-
timestamps[batch_idx, 1:] = torch.tensor(jump_times)
|
46
|
-
|
47
|
-
return timestamps
|
48
|
-
|
49
|
-
|
50
|
-
@dataclass
|
51
|
-
class ASRAudioFile:
|
52
|
-
file : str
|
53
|
-
tensor : torch.Tensor
|
54
|
-
rate : int
|
55
|
-
|
56
|
-
def chunk(self,begin_ms, end_ms):
|
57
|
-
"""Get a chunk of the audio.
|
58
|
-
|
59
|
-
Parameters
|
60
|
-
----------
|
61
|
-
begin_ms : int
|
62
|
-
Milliseconds of the start of the slice.
|
63
|
-
end_ms : int
|
64
|
-
Milliseconds of the end of the slice.
|
65
|
-
|
66
|
-
Returns
|
67
|
-
-------
|
68
|
-
torch.Tensor
|
69
|
-
The returned chunk to supply to the ASR engine.
|
70
|
-
"""
|
71
|
-
|
72
|
-
data = self.tensor[int(round((begin_ms/1000)*self.rate)):
|
73
|
-
int(round((end_ms/1000)*self.rate))]
|
74
|
-
|
75
|
-
return data
|
76
|
-
|
77
|
-
def all(self):
|
78
|
-
"""Get the audio in its entirety
|
79
|
-
|
80
|
-
Notes
|
81
|
-
-----
|
82
|
-
like `chunk()` but all of the audio
|
83
|
-
"""
|
84
|
-
|
85
|
-
return self.tensor
|
86
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7a4 → batchalign-0.7.7a5}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|