batchalign 0.7.6a32__tar.gz → 0.7.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.6a32/batchalign.egg-info → batchalign-0.7.7}/PKG-INFO +5 -1
- {batchalign-0.7.6a32 → batchalign-0.7.7}/README.md +4 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/chat/utils.py +1 -1
- batchalign-0.7.7/batchalign/models/utils.py +199 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/analysis/eval.py +44 -6
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/ud.py +14 -7
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/utterance/ud_utterance.py +2 -1
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/utils/utils.py +3 -0
- batchalign-0.7.7/batchalign/version +3 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7/batchalign.egg-info}/PKG-INFO +5 -1
- batchalign-0.7.6a32/batchalign/models/utils.py +0 -86
- batchalign-0.7.6a32/batchalign/version +0 -3
- {batchalign-0.7.6a32 → batchalign-0.7.7}/LICENSE +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/MANIFEST.in +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/__main__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/constants.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/document.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/errors.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/setup.cfg +0 -0
- {batchalign-0.7.6a32 → batchalign-0.7.7}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.6a32
|
3
|
+
Version: 0.7.7
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -59,6 +59,8 @@ The following instructions provide a quick start to installing Batchalign. For m
|
|
59
59
|
1. Install Python 3.11: [via this link](https://www.python.org/ftp/python/3.11.7/python-3.11.7-amd64.exe)
|
60
60
|
2. If later commands report `pip module not found`, [this page may help](https://github.com/TalkBank/batchalign2/wiki/Troubleshooting-Tips#get-pip-on-windows)
|
61
61
|
- your distribution's instructions for Linux
|
62
|
+
|
63
|
+
For first-time users of Python, note that if you didn't install Python 3.11 (as we recommended above), changing Python versions later can be complicated and may cause additional problems. We recommend installing Python 3.11 explicitly by specifying the version number, as shown above.
|
62
64
|
|
63
65
|
### Install and Update the Package
|
64
66
|
You can get Batchalign from PyPi, and you can update the package in the same way:
|
@@ -75,6 +77,8 @@ Windows:
|
|
75
77
|
py -m pip install -U batchalign
|
76
78
|
```
|
77
79
|
|
80
|
+
Note that if your system reports `pip: command not found`, replace every use of `pip` in the instructions with `pip3` and try again.
|
81
|
+
|
78
82
|
### Rock and Roll
|
79
83
|
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
80
84
|
|
@@ -21,6 +21,8 @@ The following instructions provide a quick start to installing Batchalign. For m
|
|
21
21
|
1. Install Python 3.11: [via this link](https://www.python.org/ftp/python/3.11.7/python-3.11.7-amd64.exe)
|
22
22
|
2. If later commands report `pip module not found`, [this page may help](https://github.com/TalkBank/batchalign2/wiki/Troubleshooting-Tips#get-pip-on-windows)
|
23
23
|
- your distribution's instructions for Linux
|
24
|
+
|
25
|
+
For first-time users of Python, note that if you didn't install Python 3.11 (as we recommended above), changing Python versions later can be complicated and may cause additional problems. We recommend installing Python 3.11 explicitly by specifying the version number, as shown above.
|
24
26
|
|
25
27
|
### Install and Update the Package
|
26
28
|
You can get Batchalign from PyPi, and you can update the package in the same way:
|
@@ -37,6 +39,8 @@ Windows:
|
|
37
39
|
py -m pip install -U batchalign
|
38
40
|
```
|
39
41
|
|
42
|
+
Note that if your system reports `pip: command not found`, replace every use of `pip` in the instructions with `pip3` and try again.
|
43
|
+
|
40
44
|
### Rock and Roll
|
41
45
|
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
42
46
|
|
@@ -108,7 +108,7 @@ def annotation_clean(content, special=False):
|
|
108
108
|
cleaned_word = re.sub(r"\x15\d+_\d+\x15", '', cleaned_word)
|
109
109
|
if not special:
|
110
110
|
cleaned_word = re.sub(r"&~\w+", '', cleaned_word)
|
111
|
-
cleaned_word = cleaned_word.replace("(","").replace(")","")
|
111
|
+
# cleaned_word = cleaned_word.replace("(","").replace(")","")
|
112
112
|
cleaned_word = cleaned_word.replace("[","").replace("]","")
|
113
113
|
cleaned_word = cleaned_word.replace("<","").replace(">","")
|
114
114
|
cleaned_word = cleaned_word.replace("“","").replace("”","")
|
@@ -0,0 +1,199 @@
|
|
1
|
+
import torch
|
2
|
+
from transformers.models.whisper.generation_whisper import _dynamic_time_warping as _dynamic_time_warping
|
3
|
+
from transformers.models.whisper.generation_whisper import _median_filter as _median_filter
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
def _extract_token_timestamps(
|
9
|
+
self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None, num_input_ids=None
|
10
|
+
):
|
11
|
+
"""
|
12
|
+
Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
|
13
|
+
map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
|
14
|
+
cross-attentions will be cropped before applying DTW.
|
15
|
+
|
16
|
+
Returns:
|
17
|
+
tensor containing the timestamps in seconds for each predicted token
|
18
|
+
"""
|
19
|
+
# Create a list with `decoder_layers` elements, each a tensor of shape
|
20
|
+
# (batch size, attention_heads, output length, input length).
|
21
|
+
cross_attentions = []
|
22
|
+
for i in range(self.config.decoder_layers):
|
23
|
+
cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
|
24
|
+
|
25
|
+
# Select specific cross-attention layers and heads. This is a tensor
|
26
|
+
# of shape (batch size, num selected, output length, input length).
|
27
|
+
weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
|
28
|
+
weights = weights.permute([1, 0, 2, 3])
|
29
|
+
|
30
|
+
weight_length = None
|
31
|
+
|
32
|
+
if "beam_indices" in generate_outputs:
|
33
|
+
# If beam search has been used, the output sequences may have been generated for more timesteps than their sequence_lengths
|
34
|
+
# since the beam search strategy chooses the most probable sequences at the end of the search.
|
35
|
+
# In that case, the cross_attentions weights are too long and we have to make sure that they have the right output_length
|
36
|
+
weight_length = (generate_outputs.beam_indices != -1).sum(-1).max()
|
37
|
+
weight_length = weight_length if num_input_ids is None else weight_length + num_input_ids
|
38
|
+
|
39
|
+
# beam search takes `decoder_input_ids` into account in the `beam_indices` length
|
40
|
+
# but forgot to shift the beam_indices by the number of `decoder_input_ids`
|
41
|
+
beam_indices = torch.zeros_like(generate_outputs.beam_indices[:, :weight_length], dtype=torch.float32)
|
42
|
+
# we actually shift the beam indices here
|
43
|
+
beam_indices[:, num_input_ids:] = generate_outputs.beam_indices[:, : weight_length - num_input_ids]
|
44
|
+
|
45
|
+
weights = weights[:, :, :weight_length]
|
46
|
+
|
47
|
+
# If beam index is still -1, it means that the associated token id is EOS
|
48
|
+
# We need to replace the index with 0 since index_select gives an error if any of the indexes is -1.
|
49
|
+
beam_indices = beam_indices.masked_fill(beam_indices == -1, 0)
|
50
|
+
|
51
|
+
# Select the cross attention from the right beam for each output sequences
|
52
|
+
weights = torch.stack(
|
53
|
+
[
|
54
|
+
torch.index_select(weights[:, :, i, :], dim=0, index=beam_indices[:, i])
|
55
|
+
for i in range(beam_indices.shape[1])
|
56
|
+
],
|
57
|
+
dim=2,
|
58
|
+
)
|
59
|
+
|
60
|
+
# make sure timestamps are as long as weights
|
61
|
+
input_length = weight_length or cross_attentions[0].shape[2]
|
62
|
+
batch_size = generate_outputs.sequences.shape[0]
|
63
|
+
timestamps = torch.zeros(
|
64
|
+
(batch_size, input_length + 1), dtype=torch.float32, device=generate_outputs.sequences.device
|
65
|
+
)
|
66
|
+
|
67
|
+
if num_frames is not None:
|
68
|
+
# two cases:
|
69
|
+
# 1. num_frames is the same for each sample -> compute the DTW matrix for each sample in parallel
|
70
|
+
# 2. num_frames is different, compute the DTW matrix for each sample sequentially
|
71
|
+
|
72
|
+
# we're using np.unique because num_frames can be int/list/tuple
|
73
|
+
if isinstance(num_frames, int):
|
74
|
+
weights = weights[..., : num_frames // 2]
|
75
|
+
|
76
|
+
elif isinstance(num_frames, (list, tuple, np.ndarray)) and len(np.unique(num_frames)) == 1:
|
77
|
+
weights = weights[..., : num_frames[0] // 2]
|
78
|
+
|
79
|
+
elif isinstance(num_frames, (torch.Tensor)) and len(torch.unique(num_frames)) == 1:
|
80
|
+
weights = weights[..., : num_frames[0] // 2]
|
81
|
+
|
82
|
+
else:
|
83
|
+
# num_frames is of shape (batch_size,) whereas batch_size is truly batch_size*num_return_sequences
|
84
|
+
repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames)
|
85
|
+
num_frames = num_frames.cpu() if isinstance(num_frames, (torch.Tensor)) else num_frames
|
86
|
+
num_frames = np.repeat(num_frames, repeat_time)
|
87
|
+
|
88
|
+
if num_frames is None or isinstance(num_frames, int):
|
89
|
+
# Normalize and smoothen the weights.
|
90
|
+
std = torch.std(weights, dim=-2, keepdim=True, unbiased=False)
|
91
|
+
mean = torch.mean(weights, dim=-2, keepdim=True)
|
92
|
+
weights = (weights - mean) / std
|
93
|
+
weights = _median_filter(weights, self.config.median_filter_width)
|
94
|
+
|
95
|
+
# Average the different cross-attention heads.
|
96
|
+
weights = weights.mean(dim=1)
|
97
|
+
|
98
|
+
# Perform dynamic time warping on each element of the batch.
|
99
|
+
for batch_idx in range(batch_size):
|
100
|
+
if num_frames is not None and isinstance(num_frames, (tuple, list, np.ndarray, torch.Tensor)):
|
101
|
+
matrix = weights[batch_idx, ..., : num_frames[batch_idx] // 2]
|
102
|
+
|
103
|
+
# Normalize and smoothen the weights.
|
104
|
+
std = torch.std(matrix, dim=-2, keepdim=True, unbiased=False)
|
105
|
+
mean = torch.mean(matrix, dim=-2, keepdim=True)
|
106
|
+
matrix = (matrix - mean) / std
|
107
|
+
matrix = _median_filter(matrix, self.config.median_filter_width)
|
108
|
+
|
109
|
+
# Average the different cross-attention heads.
|
110
|
+
matrix = matrix.mean(dim=0)
|
111
|
+
else:
|
112
|
+
matrix = weights[batch_idx]
|
113
|
+
|
114
|
+
text_indices, time_indices = _dynamic_time_warping(-matrix.cpu().double().numpy())
|
115
|
+
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
|
116
|
+
jump_times = time_indices[jumps] * time_precision
|
117
|
+
timestamps[batch_idx, 1:] = torch.tensor(jump_times)
|
118
|
+
|
119
|
+
return timestamps
|
120
|
+
|
121
|
+
# def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None):
|
122
|
+
# """
|
123
|
+
# Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
|
124
|
+
# map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
|
125
|
+
# cross-attentions will be cropped before applying DTW.
|
126
|
+
|
127
|
+
# Returns:
|
128
|
+
# tensor containing the timestamps in seconds for each predicted token
|
129
|
+
# """
|
130
|
+
# # Create a list with `decoder_layers` elements, each a tensor of shape
|
131
|
+
# # (batch size, attention_heads, output length, input length).
|
132
|
+
# cross_attentions = []
|
133
|
+
# for i in range(self.config.decoder_layers):
|
134
|
+
# cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
|
135
|
+
|
136
|
+
# # Select specific cross-attention layers and heads. This is a tensor
|
137
|
+
# # of shape (batch size, num selected, output length, input length).
|
138
|
+
# weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
|
139
|
+
# weights = weights.permute([1, 0, 2, 3])
|
140
|
+
# if num_frames is not None:
|
141
|
+
# weights = weights[..., : num_frames // 2]
|
142
|
+
|
143
|
+
# # Normalize and smoothen the weights.
|
144
|
+
# std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
|
145
|
+
# weights = (weights - mean) / std
|
146
|
+
# weights = _median_filter(weights, self.config.median_filter_width)
|
147
|
+
|
148
|
+
# # Average the different cross-attention heads.
|
149
|
+
# matrix = weights.mean(dim=1)
|
150
|
+
|
151
|
+
# timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)
|
152
|
+
|
153
|
+
# # Perform dynamic time warping on each element of the batch.
|
154
|
+
# for batch_idx in range(timestamps.shape[0]):
|
155
|
+
# text_indices, time_indices = _dynamic_time_warping(-matrix[batch_idx].float().cpu().numpy())
|
156
|
+
# jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
|
157
|
+
# jump_times = time_indices[jumps] * time_precision
|
158
|
+
# timestamps[batch_idx, 1:] = torch.tensor(jump_times)
|
159
|
+
|
160
|
+
# return timestamps
|
161
|
+
|
162
|
+
|
163
|
+
@dataclass
|
164
|
+
class ASRAudioFile:
|
165
|
+
file : str
|
166
|
+
tensor : torch.Tensor
|
167
|
+
rate : int
|
168
|
+
|
169
|
+
def chunk(self,begin_ms, end_ms):
|
170
|
+
"""Get a chunk of the audio.
|
171
|
+
|
172
|
+
Parameters
|
173
|
+
----------
|
174
|
+
begin_ms : int
|
175
|
+
Milliseconds of the start of the slice.
|
176
|
+
end_ms : int
|
177
|
+
Milliseconds of the end of the slice.
|
178
|
+
|
179
|
+
Returns
|
180
|
+
-------
|
181
|
+
torch.Tensor
|
182
|
+
The returned chunk to supply to the ASR engine.
|
183
|
+
"""
|
184
|
+
|
185
|
+
data = self.tensor[int(round((begin_ms/1000)*self.rate)):
|
186
|
+
int(round((end_ms/1000)*self.rate))]
|
187
|
+
|
188
|
+
return data
|
189
|
+
|
190
|
+
def all(self):
|
191
|
+
"""Get the audio in its entirety
|
192
|
+
|
193
|
+
Notes
|
194
|
+
-----
|
195
|
+
like `chunk()` but all of the audio
|
196
|
+
"""
|
197
|
+
|
198
|
+
return self.tensor
|
199
|
+
|
@@ -3,6 +3,7 @@ eval.py
|
|
3
3
|
Engines for transcript evaluation
|
4
4
|
"""
|
5
5
|
|
6
|
+
import re
|
6
7
|
from batchalign.document import *
|
7
8
|
from batchalign.pipelines.base import *
|
8
9
|
from batchalign.pipelines.asr.utils import *
|
@@ -22,11 +23,34 @@ class EvaluationEngine(BatchalignEngine):
|
|
22
23
|
forms = [ j.text.lower() for i in doc.content for j in i.content if isinstance(i, Utterance)]
|
23
24
|
gold_forms = [ j.text.lower() for i in gold.content for j in i.content if isinstance(i, Utterance)]
|
24
25
|
|
25
|
-
forms = [i for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
|
26
|
-
gold_forms = [i for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
|
26
|
+
forms = [i.replace("-", "") for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
|
27
|
+
gold_forms = [i.replace("-", "") for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
|
28
|
+
|
29
|
+
forms = [re.sub(r"\((.*)\)",r"", i) for i in forms]
|
30
|
+
gold_forms = [re.sub(r"\((.*)\)",r"", i) for i in gold_forms]
|
31
|
+
|
32
|
+
# if there are single letter frames, we combine them together
|
33
|
+
# until the utterance is done or there isn't any left
|
34
|
+
forms_finished = []
|
35
|
+
|
36
|
+
single_sticky = ""
|
37
|
+
is_single = False
|
38
|
+
|
39
|
+
for i in forms:
|
40
|
+
if len(i) == 1:
|
41
|
+
single_sticky += i
|
42
|
+
else:
|
43
|
+
if single_sticky != "":
|
44
|
+
forms_finished.append(single_sticky)
|
45
|
+
single_sticky = ""
|
46
|
+
forms_finished.append(i)
|
47
|
+
|
48
|
+
if single_sticky != "":
|
49
|
+
forms_finished.append(single_sticky)
|
50
|
+
single_sticky = ""
|
27
51
|
|
28
52
|
# dp!
|
29
|
-
alignment = align(
|
53
|
+
alignment = align(forms_finished, gold_forms, False)
|
30
54
|
|
31
55
|
# calculate each type of error
|
32
56
|
sub = 0
|
@@ -39,14 +63,28 @@ class EvaluationEngine(BatchalignEngine):
|
|
39
63
|
# but if we have <extra.reference> <extra.reference> this is 2 insertions
|
40
64
|
|
41
65
|
cleaned_alignment = []
|
66
|
+
# whether we had a "firstname" in reference document and hence are
|
67
|
+
# anticipating a payload for it (the actual name) in the next entry in the
|
68
|
+
# alignment
|
69
|
+
anticipating_payload = False
|
42
70
|
|
43
71
|
for i in alignment:
|
44
72
|
|
45
73
|
if isinstance(i, Extra):
|
46
|
-
|
47
|
-
|
74
|
+
|
75
|
+
if i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
|
76
|
+
if (isinstance(cleaned_alignment[-1], Extra) and
|
77
|
+
cleaned_alignment[-1].extra_type == ExtraType.PAYLOAD and
|
78
|
+
len(cleaned_alignment) > 0):
|
79
|
+
cleaned_alignment.pop(-1)
|
80
|
+
else:
|
81
|
+
anticipating_payload = True
|
48
82
|
cleaned_alignment.append(Match(i.key, None, None))
|
49
83
|
continue
|
84
|
+
elif i.extra_type == ExtraType.PAYLOAD and anticipating_payload:
|
85
|
+
anticipating_payload = False
|
86
|
+
continue
|
87
|
+
|
50
88
|
|
51
89
|
if prev_error != None and prev_error != i.extra_type:
|
52
90
|
# this is a substitution: we have different "extra"s in
|
@@ -75,7 +113,7 @@ class EvaluationEngine(BatchalignEngine):
|
|
75
113
|
cleaned_alignment.append(i)
|
76
114
|
|
77
115
|
diff = []
|
78
|
-
for i in
|
116
|
+
for i in cleaned_alignment:
|
79
117
|
if isinstance(i, Extra):
|
80
118
|
diff.append(f"{'+' if i.extra_type == ExtraType.REFERENCE else '-'} {i.key}")
|
81
119
|
else:
|
@@ -18,6 +18,7 @@ from stanza import DownloadMethod
|
|
18
18
|
from torch import heaviside
|
19
19
|
|
20
20
|
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
|
21
|
+
from stanza.resources.common import download_resources_json, load_resources_json, get_language_resources
|
21
22
|
|
22
23
|
# the loading bar
|
23
24
|
from tqdm import tqdm
|
@@ -115,6 +116,7 @@ def handler(word, lang=None):
|
|
115
116
|
target = target.replace('/100', '')
|
116
117
|
target = target.replace('/r', '')
|
117
118
|
target = target.replace('(', '')
|
119
|
+
target = target.replace("(","").replace(")","")
|
118
120
|
|
119
121
|
# remove attachments
|
120
122
|
if "|" in target:
|
@@ -217,9 +219,9 @@ def handler__NOUN(word, lang=None):
|
|
217
219
|
type = feats.get("PronType", "")
|
218
220
|
|
219
221
|
apm = ""
|
220
|
-
if lang == "fr":
|
222
|
+
if lang == "fr" and number_str == "-Plur":
|
221
223
|
from batchalign.pipelines.morphosyntax.fr.apm import is_apm_noun
|
222
|
-
apm = "
|
224
|
+
apm = "Apm" if is_apm_noun(word.text) else ""
|
223
225
|
|
224
226
|
|
225
227
|
if word.deprel == "obj" and case.strip() == "":
|
@@ -738,13 +740,17 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
738
740
|
else:
|
739
741
|
config["tokenize_postprocessor"] = lambda x:adlist_processor(x)
|
740
742
|
|
743
|
+
download_resources_json()
|
744
|
+
resources = load_resources_json()
|
745
|
+
mwt_exclusion = ["hr", "zh", "zh-hans", "zh-hant", "ja", "ko",
|
746
|
+
"sl", "sr", "bg", "ru", "et", "hu",
|
747
|
+
"eu", "el", "he", "af", "ga", "da", "ro"]
|
748
|
+
|
741
749
|
if "zh" in lang:
|
742
750
|
lang.pop(lang.index("zh"))
|
743
751
|
lang.append("zh-hans")
|
744
|
-
|
745
|
-
elif not any(
|
746
|
-
"sl", "sr", "bg", "ru", "et", "hu",
|
747
|
-
"eu", "el", "he", "af", "ga", "da", "ro"] for i in lang]):
|
752
|
+
|
753
|
+
elif not any(i in mwt_exclusion or "mwt" not in get_language_resources(resources, i) for i in lang):
|
748
754
|
if "en" in lang:
|
749
755
|
config["processors"]["mwt"] = "gum"
|
750
756
|
else:
|
@@ -848,7 +854,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
848
854
|
inputs.append(line_cut)
|
849
855
|
|
850
856
|
try:
|
851
|
-
sents = nlp(line_cut.strip()).sentences
|
857
|
+
sents = nlp(line_cut.replace("(","").replace(")","").strip()).sentences
|
852
858
|
|
853
859
|
if len(sents) == 0:
|
854
860
|
continue
|
@@ -958,6 +964,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
958
964
|
retokenized_ut = re.sub(r"⁎[⁎ ]*(.*?)[⁎ ]*⁎", r"⁎\1⁎ ", retokenized_ut)
|
959
965
|
retokenized_ut = re.sub(r"\[\*(.)\]", r"[* \1]", retokenized_ut)
|
960
966
|
retokenized_ut = re.sub(r" +", r" ", retokenized_ut)
|
967
|
+
retokenized_ut = re.sub(r"⁎ @", r"⁎@", retokenized_ut)
|
961
968
|
|
962
969
|
# pray to everyone that it works---this will simply crash and ignore
|
963
970
|
# the utterance if it didn't work, so we are doing this as a sanity
|
@@ -84,6 +84,7 @@ def parse_tree(subtree):
|
|
84
84
|
for i in stack]
|
85
85
|
|
86
86
|
def process_ut(ut, nlp):
|
87
|
+
|
87
88
|
# remove punct
|
88
89
|
if (ut.content[-1].type == TokenType.PUNCT or
|
89
90
|
ut.content[-1].text in ENDING_PUNCT):
|
@@ -142,7 +143,7 @@ def process_ut(ut, nlp):
|
|
142
143
|
if isinstance(i, Match):
|
143
144
|
matches.append(i)
|
144
145
|
elif i.extra_type == ExtraType.REFERENCE:
|
145
|
-
new_refs.append(ReferenceTarget(key=i.key, payload=i.payload))
|
146
|
+
new_refs.append(ReferenceTarget(key=i.key, payload=i.payload if i.payload else -1))
|
146
147
|
|
147
148
|
# we now sort the references based on their original utterance order
|
148
149
|
matches = matches + new_refs
|
@@ -29,6 +29,7 @@ def word_tokenize(str):
|
|
29
29
|
return tmp.tokenize(str)
|
30
30
|
except LookupError:
|
31
31
|
nltk.download("punkt")
|
32
|
+
nltk.download("punkt_tab")
|
32
33
|
return tmp.tokenize(str)
|
33
34
|
|
34
35
|
def sent_tokenize(str):
|
@@ -49,6 +50,7 @@ def sent_tokenize(str):
|
|
49
50
|
return ST(str)
|
50
51
|
except LookupError:
|
51
52
|
nltk.download("punkt")
|
53
|
+
nltk.download("punkt_tab")
|
52
54
|
return ST(str)
|
53
55
|
|
54
56
|
def detokenize(tokens):
|
@@ -69,6 +71,7 @@ def detokenize(tokens):
|
|
69
71
|
return TreebankWordDetokenizer().detokenize(tokens)
|
70
72
|
except LookupError:
|
71
73
|
nltk.download("punkt")
|
74
|
+
nltk.download("punkt_tab")
|
72
75
|
return TreebankWordDetokenizer().detokenize(tokens)
|
73
76
|
|
74
77
|
def correct_timing(doc):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.6a32
|
3
|
+
Version: 0.7.7
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -59,6 +59,8 @@ The following instructions provide a quick start to installing Batchalign. For m
|
|
59
59
|
1. Install Python 3.11: [via this link](https://www.python.org/ftp/python/3.11.7/python-3.11.7-amd64.exe)
|
60
60
|
2. If later commands report `pip module not found`, [this page may help](https://github.com/TalkBank/batchalign2/wiki/Troubleshooting-Tips#get-pip-on-windows)
|
61
61
|
- your distribution's instructions for Linux
|
62
|
+
|
63
|
+
For first-time users of Python, note that if you didn't install Python 3.11 (as we recommended above), changing Python versions later can be complicated and may cause additional problems. We recommend installing Python 3.11 explicitly by specifying the version number, as shown above.
|
62
64
|
|
63
65
|
### Install and Update the Package
|
64
66
|
You can get Batchalign from PyPi, and you can update the package in the same way:
|
@@ -75,6 +77,8 @@ Windows:
|
|
75
77
|
py -m pip install -U batchalign
|
76
78
|
```
|
77
79
|
|
80
|
+
Note that if your system reports `pip: command not found`, replace every use of `pip` in the instructions with `pip3` and try again.
|
81
|
+
|
78
82
|
### Rock and Roll
|
79
83
|
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
80
84
|
|
@@ -1,86 +0,0 @@
|
|
1
|
-
import torch
|
2
|
-
from transformers.models.whisper.generation_whisper import _dynamic_time_warping as _dynamic_time_warping
|
3
|
-
from transformers.models.whisper.generation_whisper import _median_filter as _median_filter
|
4
|
-
|
5
|
-
from dataclasses import dataclass
|
6
|
-
import numpy as np
|
7
|
-
|
8
|
-
def _extract_token_timestamps(self, generate_outputs, alignment_heads, time_precision=0.02, num_frames=None):
|
9
|
-
"""
|
10
|
-
Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
|
11
|
-
map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
|
12
|
-
cross-attentions will be cropped before applying DTW.
|
13
|
-
|
14
|
-
Returns:
|
15
|
-
tensor containing the timestamps in seconds for each predicted token
|
16
|
-
"""
|
17
|
-
# Create a list with `decoder_layers` elements, each a tensor of shape
|
18
|
-
# (batch size, attention_heads, output length, input length).
|
19
|
-
cross_attentions = []
|
20
|
-
for i in range(self.config.decoder_layers):
|
21
|
-
cross_attentions.append(torch.cat([x[i] for x in generate_outputs.cross_attentions], dim=2))
|
22
|
-
|
23
|
-
# Select specific cross-attention layers and heads. This is a tensor
|
24
|
-
# of shape (batch size, num selected, output length, input length).
|
25
|
-
weights = torch.stack([cross_attentions[l][:, h] for l, h in alignment_heads])
|
26
|
-
weights = weights.permute([1, 0, 2, 3])
|
27
|
-
if num_frames is not None:
|
28
|
-
weights = weights[..., : num_frames // 2]
|
29
|
-
|
30
|
-
# Normalize and smoothen the weights.
|
31
|
-
std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
|
32
|
-
weights = (weights - mean) / std
|
33
|
-
weights = _median_filter(weights, self.config.median_filter_width)
|
34
|
-
|
35
|
-
# Average the different cross-attention heads.
|
36
|
-
matrix = weights.mean(dim=1)
|
37
|
-
|
38
|
-
timestamps = torch.zeros_like(generate_outputs.sequences, dtype=torch.float32)
|
39
|
-
|
40
|
-
# Perform dynamic time warping on each element of the batch.
|
41
|
-
for batch_idx in range(timestamps.shape[0]):
|
42
|
-
text_indices, time_indices = _dynamic_time_warping(-matrix[batch_idx].float().cpu().numpy())
|
43
|
-
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
|
44
|
-
jump_times = time_indices[jumps] * time_precision
|
45
|
-
timestamps[batch_idx, 1:] = torch.tensor(jump_times)
|
46
|
-
|
47
|
-
return timestamps
|
48
|
-
|
49
|
-
|
50
|
-
@dataclass
|
51
|
-
class ASRAudioFile:
|
52
|
-
file : str
|
53
|
-
tensor : torch.Tensor
|
54
|
-
rate : int
|
55
|
-
|
56
|
-
def chunk(self,begin_ms, end_ms):
|
57
|
-
"""Get a chunk of the audio.
|
58
|
-
|
59
|
-
Parameters
|
60
|
-
----------
|
61
|
-
begin_ms : int
|
62
|
-
Milliseconds of the start of the slice.
|
63
|
-
end_ms : int
|
64
|
-
Milliseconds of the end of the slice.
|
65
|
-
|
66
|
-
Returns
|
67
|
-
-------
|
68
|
-
torch.Tensor
|
69
|
-
The returned chunk to supply to the ASR engine.
|
70
|
-
"""
|
71
|
-
|
72
|
-
data = self.tensor[int(round((begin_ms/1000)*self.rate)):
|
73
|
-
int(round((end_ms/1000)*self.rate))]
|
74
|
-
|
75
|
-
return data
|
76
|
-
|
77
|
-
def all(self):
|
78
|
-
"""Get the audio in its entirety
|
79
|
-
|
80
|
-
Notes
|
81
|
-
-----
|
82
|
-
like `chunk()` but all of the audio
|
83
|
-
"""
|
84
|
-
|
85
|
-
return self.tensor
|
86
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.6a32 → batchalign-0.7.7}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|