batchalign 0.7.3b12__tar.gz → 0.7.3b14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.3b12/batchalign.egg-info → batchalign-0.7.3b14}/PKG-INFO +7 -7
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/README.md +6 -6
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/document.py +1 -0
- batchalign-0.7.3b14/batchalign/pipelines/morphosyntax/ja/verbforms.py +56 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/morphosyntax/ud.py +10 -4
- batchalign-0.7.3b14/batchalign/version +3 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14/batchalign.egg-info}/PKG-INFO +7 -7
- batchalign-0.7.3b12/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -34
- batchalign-0.7.3b12/batchalign/version +0 -3
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/LICENSE +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/MANIFEST.in +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/__main__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/constants.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/errors.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/setup.cfg +0 -0
- {batchalign-0.7.3b12 → batchalign-0.7.3b14}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.3b12
|
3
|
+
Version: 0.7.3b14
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -82,7 +82,7 @@ The TalkBank Project, of which Batchalign is a part, is supported by NIH grant H
|
|
82
82
|
|
83
83
|
## Quick Start
|
84
84
|
|
85
|
-
The following instructions
|
85
|
+
The following instructions provide a quick start to installing Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
|
86
86
|
|
87
87
|
### Get Python
|
88
88
|
- We support Python versions 3.9, 3.10, and 3.11.
|
@@ -112,7 +112,7 @@ py -m pip3 install -U batchalign
|
|
112
112
|
```
|
113
113
|
|
114
114
|
### Rock and Roll
|
115
|
-
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or a Python LSA library.
|
115
|
+
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
116
116
|
|
117
117
|
- to get started with the Batchalign program, [tap here](#quick-start-command-line)
|
118
118
|
- to get started on the Batchalign Library (assumes familiarity with Python), [tap here](#quick-start-python)
|
@@ -121,7 +121,7 @@ There are two main ways of interacting with Batchalign. Batchalign can be used a
|
|
121
121
|
|
122
122
|
### Basic Usage
|
123
123
|
|
124
|
-
Once installed, you can invoke the Batchalign
|
124
|
+
Once installed, you can invoke the Batchalign program by typing `batchalign` into the Terminal (MacOS) or Command Prompt (Windows).
|
125
125
|
|
126
126
|
It is used in the following basic way:
|
127
127
|
|
@@ -131,9 +131,9 @@ batchalign [verb] [input_dir] [output_dir]
|
|
131
131
|
|
132
132
|
Where `verb` includes:
|
133
133
|
|
134
|
-
1. `transcribe` - placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory,
|
135
|
-
2. `align` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory,
|
136
|
-
3. `morphotag` - placing a CHAT file in the input directory, uses Stanford NLP Stanza to generate morphological and dependency analyses. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
|
134
|
+
1. `transcribe` - by placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory, this function performs ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generates word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
|
135
|
+
2. `align` - by placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, this function recovers utterance-level time alignments (if they are not already annotated) and generates word-level alignments. The @Languages header in the CHAT file tells the program which language is in the transcript.
|
136
|
+
3. `morphotag` - by placing a CHAT file in the input directory, this function uses Stanford NLP Stanza to generate morphological and dependency analyses. The @Languages header in the CHAT file tells the program which language is in the transcript. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
|
137
137
|
<!-- 4. `bulletize` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *unlinked* CHAT file in the input directory, generate utterance-level alignments through ASR -->
|
138
138
|
|
139
139
|
You can get a CHAT transcript to experiment with [at the TalkBank website](https://talkbank.org/), under any of the "Banks" that are available. You can also generate and parse a CHAT transcript via [the Python program](https://github.com/TalkBank/batchalign2?tab=readme-ov-file#chat).
|
@@ -8,7 +8,7 @@ The TalkBank Project, of which Batchalign is a part, is supported by NIH grant H
|
|
8
8
|
|
9
9
|
## Quick Start
|
10
10
|
|
11
|
-
The following instructions
|
11
|
+
The following instructions provide a quick start to installing Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
|
12
12
|
|
13
13
|
### Get Python
|
14
14
|
- We support Python versions 3.9, 3.10, and 3.11.
|
@@ -38,7 +38,7 @@ py -m pip3 install -U batchalign
|
|
38
38
|
```
|
39
39
|
|
40
40
|
### Rock and Roll
|
41
|
-
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or a Python LSA library.
|
41
|
+
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
42
42
|
|
43
43
|
- to get started with the Batchalign program, [tap here](#quick-start-command-line)
|
44
44
|
- to get started on the Batchalign Library (assumes familiarity with Python), [tap here](#quick-start-python)
|
@@ -47,7 +47,7 @@ There are two main ways of interacting with Batchalign. Batchalign can be used a
|
|
47
47
|
|
48
48
|
### Basic Usage
|
49
49
|
|
50
|
-
Once installed, you can invoke the Batchalign
|
50
|
+
Once installed, you can invoke the Batchalign program by typing `batchalign` into the Terminal (MacOS) or Command Prompt (Windows).
|
51
51
|
|
52
52
|
It is used in the following basic way:
|
53
53
|
|
@@ -57,9 +57,9 @@ batchalign [verb] [input_dir] [output_dir]
|
|
57
57
|
|
58
58
|
Where `verb` includes:
|
59
59
|
|
60
|
-
1. `transcribe` - placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory,
|
61
|
-
2. `align` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory,
|
62
|
-
3. `morphotag` - placing a CHAT file in the input directory, uses Stanford NLP Stanza to generate morphological and dependency analyses. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
|
60
|
+
1. `transcribe` - by placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory, this function performs ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generates word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
|
61
|
+
2. `align` - by placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, this function recovers utterance-level time alignments (if they are not already annotated) and generates word-level alignments. The @Languages header in the CHAT file tells the program which language is in the transcript.
|
62
|
+
3. `morphotag` - by placing a CHAT file in the input directory, this function uses Stanford NLP Stanza to generate morphological and dependency analyses. The @Languages header in the CHAT file tells the program which language is in the transcript. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
|
63
63
|
<!-- 4. `bulletize` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *unlinked* CHAT file in the input directory, generate utterance-level alignments through ASR -->
|
64
64
|
|
65
65
|
You can get a CHAT transcript to experiment with [at the TalkBank website](https://talkbank.org/), under any of the "Banks" that are available. You can also generate and parse a CHAT transcript via [the Python program](https://github.com/TalkBank/batchalign2?tab=readme-ov-file#chat).
|
@@ -208,6 +208,7 @@ class Utterance(BaseModel):
|
|
208
208
|
# t = re.sub(r"^[^\w\d\s<]+", "", t.strip()).strip()
|
209
209
|
t = re.sub(r",", " , ", t.strip()).strip()
|
210
210
|
t = re.sub(r" +", " ", t.strip()).strip()
|
211
|
+
t = t.replace("+ ,", "+,").strip()
|
211
212
|
return t
|
212
213
|
|
213
214
|
def __repr__(self):
|
@@ -0,0 +1,56 @@
|
|
1
|
+
"""
|
2
|
+
verbforms.py
|
3
|
+
Fix Japanese verb forms.
|
4
|
+
"""
|
5
|
+
|
6
|
+
def verbform(upos, target, text):
|
7
|
+
if "遣" in text and upos == "noun":
|
8
|
+
return "verb", "遣る"
|
9
|
+
if "死" in text:
|
10
|
+
return "verb", "死ぬ"
|
11
|
+
if "立" in text:
|
12
|
+
return "verb", "立つ"
|
13
|
+
if "引" in text:
|
14
|
+
return "verb", "引く"
|
15
|
+
if "出" in text:
|
16
|
+
return "verb", "出す"
|
17
|
+
if "引" in text:
|
18
|
+
return "verb", "引く"
|
19
|
+
if "飲" in text:
|
20
|
+
return "verb", "飲む"
|
21
|
+
if "呼" in text:
|
22
|
+
return "verb", "呼ぶ"
|
23
|
+
if "脱" in text:
|
24
|
+
return "verb", "脱ぐ"
|
25
|
+
if text == "な" and upos == "part":
|
26
|
+
return "aux", "な"
|
27
|
+
if text == "呼ん":
|
28
|
+
return "verb", "呼ぶ"
|
29
|
+
if text == "な" and upos == "aux":
|
30
|
+
return "aux", "な"
|
31
|
+
if text == "だり":
|
32
|
+
return "aux", "たり"
|
33
|
+
if text == "たり":
|
34
|
+
return "aux", "たり"
|
35
|
+
if text == "たら":
|
36
|
+
return "sconj", "たら"
|
37
|
+
if text == "たっ":
|
38
|
+
return "sconj", "たって"
|
39
|
+
# if text == "て" and upos == "sconj":
|
40
|
+
# return "aux", "て"
|
41
|
+
if text == "なさい" and target == "為さる":
|
42
|
+
return "aux", "為さい"
|
43
|
+
if text == "な" and upos == "part":
|
44
|
+
return "aux", "な"
|
45
|
+
if text == "脱" and upos == "noun":
|
46
|
+
return "verb", "脱"
|
47
|
+
if text == "よう" and upos == "aux":
|
48
|
+
return "aux", "よう"
|
49
|
+
if text == "ろ" and upos == "aux" and target == "為る":
|
50
|
+
return "aux", "ろ"
|
51
|
+
# if upos == "verb" and "る" in target:
|
52
|
+
# return "verb", target.replace("る","").strip()
|
53
|
+
|
54
|
+
return upos,target
|
55
|
+
|
56
|
+
|
@@ -233,9 +233,14 @@ def handler__VERB(word, lang=None):
|
|
233
233
|
tense = feats.get("Tense", "")
|
234
234
|
polarity = feats.get("Polarity", "")
|
235
235
|
polite = feats.get("Polite", "")
|
236
|
-
|
237
|
-
|
238
|
-
|
236
|
+
|
237
|
+
res = handler(word, lang)
|
238
|
+
if "sconj" in res:
|
239
|
+
return res
|
240
|
+
else:
|
241
|
+
return res+flag+stringify_feats(aspect, mood,
|
242
|
+
tense, polarity, polite,
|
243
|
+
number[:1]+person)
|
239
244
|
|
240
245
|
def handler__actual_PUNCT(word, lang=None):
|
241
246
|
# actual punctuation handler
|
@@ -692,7 +697,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
692
697
|
|
693
698
|
elif not any([i in ["hr", "zh", "zh-hans", "zh-hant", "ja", "ko",
|
694
699
|
"sl", "sr", "bg", "ru", "et", "hu",
|
695
|
-
"eu", "el", "he", "af", "ga", "da"] for i in lang]):
|
700
|
+
"eu", "el", "he", "af", "ga", "da", "ro"] for i in lang]):
|
696
701
|
if "en" in lang:
|
697
702
|
config["processors"]["mwt"] = "gum"
|
698
703
|
else:
|
@@ -878,6 +883,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
878
883
|
retokenized_ut = retokenized_ut.replace(" >", ">")
|
879
884
|
retokenized_ut = retokenized_ut.replace("< ", "<")
|
880
885
|
retokenized_ut = retokenized_ut.replace(" :", ":")
|
886
|
+
retokenized_ut = retokenized_ut.replace("+ ,", "+,")
|
881
887
|
retokenized_ut = retokenized_ut.replace(": <", ": <")
|
882
888
|
retokenized_ut = retokenized_ut.replace(" ↑", "↑")
|
883
889
|
retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.3b12
|
3
|
+
Version: 0.7.3b14
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -82,7 +82,7 @@ The TalkBank Project, of which Batchalign is a part, is supported by NIH grant H
|
|
82
82
|
|
83
83
|
## Quick Start
|
84
84
|
|
85
|
-
The following instructions
|
85
|
+
The following instructions provide a quick start to installing Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
|
86
86
|
|
87
87
|
### Get Python
|
88
88
|
- We support Python versions 3.9, 3.10, and 3.11.
|
@@ -112,7 +112,7 @@ py -m pip3 install -U batchalign
|
|
112
112
|
```
|
113
113
|
|
114
114
|
### Rock and Roll
|
115
|
-
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or a Python LSA library.
|
115
|
+
There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
|
116
116
|
|
117
117
|
- to get started with the Batchalign program, [tap here](#quick-start-command-line)
|
118
118
|
- to get started on the Batchalign Library (assumes familiarity with Python), [tap here](#quick-start-python)
|
@@ -121,7 +121,7 @@ There are two main ways of interacting with Batchalign. Batchalign can be used a
|
|
121
121
|
|
122
122
|
### Basic Usage
|
123
123
|
|
124
|
-
Once installed, you can invoke the Batchalign
|
124
|
+
Once installed, you can invoke the Batchalign program by typing `batchalign` into the Terminal (MacOS) or Command Prompt (Windows).
|
125
125
|
|
126
126
|
It is used in the following basic way:
|
127
127
|
|
@@ -131,9 +131,9 @@ batchalign [verb] [input_dir] [output_dir]
|
|
131
131
|
|
132
132
|
Where `verb` includes:
|
133
133
|
|
134
|
-
1. `transcribe` - placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory,
|
135
|
-
2. `align` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory,
|
136
|
-
3. `morphotag` - placing a CHAT file in the input directory, uses Stanford NLP Stanza to generate morphological and dependency analyses. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
|
134
|
+
1. `transcribe` - by placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory, this function performs ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generates word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
|
135
|
+
2. `align` - by placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, this function recovers utterance-level time alignments (if they are not already annotated) and generates word-level alignments. The @Languages header in the CHAT file tells the program which language is in the transcript.
|
136
|
+
3. `morphotag` - by placing a CHAT file in the input directory, this function uses Stanford NLP Stanza to generate morphological and dependency analyses. The @Languages header in the CHAT file tells the program which language is in the transcript. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
|
137
137
|
<!-- 4. `bulletize` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *unlinked* CHAT file in the input directory, generate utterance-level alignments through ASR -->
|
138
138
|
|
139
139
|
You can get a CHAT transcript to experiment with [at the TalkBank website](https://talkbank.org/), under any of the "Banks" that are available. You can also generate and parse a CHAT transcript via [the Python program](https://github.com/TalkBank/batchalign2?tab=readme-ov-file#chat).
|
@@ -1,34 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
verbforms.py
|
3
|
-
Fix Japanese verb forms.
|
4
|
-
"""
|
5
|
-
|
6
|
-
def verbform(upos, target, text):
|
7
|
-
if text == "な" and upos == "part":
|
8
|
-
return "aux", "うな"
|
9
|
-
if text == "呼ん":
|
10
|
-
return upos, "呼ん"
|
11
|
-
if text == "たり":
|
12
|
-
return "aux", "たり"
|
13
|
-
if text == "たら":
|
14
|
-
return "sconj", "たら"
|
15
|
-
if text == "たっ":
|
16
|
-
return "sconj", "たって"
|
17
|
-
if text == "て" and upos == "sconj":
|
18
|
-
return "aux", "て"
|
19
|
-
if text == "なさい" and target == "為さる":
|
20
|
-
return "aux", "為さい"
|
21
|
-
if text == "な" and upos == "part":
|
22
|
-
return "aux", "な"
|
23
|
-
if text == "脱" and upos == "noun":
|
24
|
-
return "verb", "脱"
|
25
|
-
if text == "よう" and upos == "aux":
|
26
|
-
return "aux", "よう"
|
27
|
-
if text == "ろ" and upos == "aux" and target == "為る":
|
28
|
-
return "aux", "ろ"
|
29
|
-
if upos == "verb" and "る" in target:
|
30
|
-
return "verb", target.replace("る","").strip()
|
31
|
-
|
32
|
-
return upos,target
|
33
|
-
|
34
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|