batchalign 0.7.3b12__tar.gz → 0.7.3b14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. {batchalign-0.7.3b12/batchalign.egg-info → batchalign-0.7.3b14}/PKG-INFO +7 -7
  2. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/README.md +6 -6
  3. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/document.py +1 -0
  4. batchalign-0.7.3b14/batchalign/pipelines/morphosyntax/ja/verbforms.py +56 -0
  5. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/morphosyntax/ud.py +10 -4
  6. batchalign-0.7.3b14/batchalign/version +3 -0
  7. {batchalign-0.7.3b12 → batchalign-0.7.3b14/batchalign.egg-info}/PKG-INFO +7 -7
  8. batchalign-0.7.3b12/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -34
  9. batchalign-0.7.3b12/batchalign/version +0 -3
  10. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/LICENSE +0 -0
  11. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/MANIFEST.in +0 -0
  12. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/__init__.py +0 -0
  13. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/__main__.py +0 -0
  14. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/cli/__init__.py +0 -0
  15. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/cli/cli.py +0 -0
  16. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/cli/dispatch.py +0 -0
  17. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/constants.py +0 -0
  18. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/errors.py +0 -0
  19. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/__init__.py +0 -0
  20. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/base.py +0 -0
  21. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/__init__.py +0 -0
  22. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/file.py +0 -0
  23. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/generator.py +0 -0
  24. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/lexer.py +0 -0
  25. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/parser.py +0 -0
  26. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/chat/utils.py +0 -0
  27. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/__init__.py +0 -0
  28. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/file.py +0 -0
  29. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/generator.py +0 -0
  30. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/formats/textgrid/parser.py +0 -0
  31. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/__init__.py +0 -0
  32. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/resolve.py +0 -0
  33. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/__init__.py +0 -0
  34. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/config.yaml +0 -0
  35. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/infer.py +0 -0
  36. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/speaker/utils.py +0 -0
  37. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/training/__init__.py +0 -0
  38. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/training/run.py +0 -0
  39. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/training/utils.py +0 -0
  40. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utils.py +0 -0
  41. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/__init__.py +0 -0
  42. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/dataset.py +0 -0
  43. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/execute.py +0 -0
  44. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/infer.py +0 -0
  45. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/prep.py +0 -0
  46. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/utterance/train.py +0 -0
  47. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/whisper/__init__.py +0 -0
  48. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/whisper/infer_asr.py +0 -0
  49. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/models/whisper/infer_fa.py +0 -0
  50. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/__init__.py +0 -0
  51. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/analysis/__init__.py +0 -0
  52. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/analysis/eval.py +0 -0
  53. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/__init__.py +0 -0
  54. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/rev.py +0 -0
  55. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/utils.py +0 -0
  56. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/whisper.py +0 -0
  57. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/asr/whisperx.py +0 -0
  58. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/base.py +0 -0
  59. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/__init__.py +0 -0
  60. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  61. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  62. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  63. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/retrace.py +0 -0
  64. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  65. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  66. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/cleanup/support/test.test +0 -0
  67. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/dispatch.py +0 -0
  68. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/fa/__init__.py +0 -0
  69. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  70. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  71. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  72. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/pipeline.py +0 -0
  73. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/speaker/__init__.py +0 -0
  74. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  75. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/__init__.py +0 -0
  76. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/rev_utr.py +0 -0
  77. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/utils.py +0 -0
  78. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  79. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utterance/__init__.py +0 -0
  80. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  81. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/__init__.py +0 -0
  82. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/conftest.py +0 -0
  83. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  84. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  85. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  86. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  87. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  88. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  89. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  90. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  91. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  92. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  93. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  94. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  95. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/fixures.py +0 -0
  96. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  97. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  98. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/tests/test_document.py +0 -0
  99. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/__init__.py +0 -0
  100. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/config.py +0 -0
  101. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/dp.py +0 -0
  102. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign/utils/utils.py +0 -0
  103. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/SOURCES.txt +0 -0
  104. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/dependency_links.txt +0 -0
  105. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/entry_points.txt +0 -0
  106. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/requires.txt +0 -0
  107. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/batchalign.egg-info/top_level.txt +0 -0
  108. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/setup.cfg +0 -0
  109. {batchalign-0.7.3b12 → batchalign-0.7.3b14}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.3b12
3
+ Version: 0.7.3b14
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -82,7 +82,7 @@ The TalkBank Project, of which Batchalign is a part, is supported by NIH grant H
82
82
 
83
83
  ## Quick Start
84
84
 
85
- The following instructions is a quick start to install Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
85
+ The following instructions provide a quick start to installing Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
86
86
 
87
87
  ### Get Python
88
88
  - We support Python versions 3.9, 3.10, and 3.11.
@@ -112,7 +112,7 @@ py -m pip3 install -U batchalign
112
112
  ```
113
113
 
114
114
  ### Rock and Roll
115
- There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or a Python LSA library.
115
+ There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
116
116
 
117
117
  - to get started with the Batchalign program, [tap here](#quick-start-command-line)
118
118
  - to get started on the Batchalign Library (assumes familiarity with Python), [tap here](#quick-start-python)
@@ -121,7 +121,7 @@ There are two main ways of interacting with Batchalign. Batchalign can be used a
121
121
 
122
122
  ### Basic Usage
123
123
 
124
- Once installed, you can invoke the Batchalign CLI program via the `batchalign` command.
124
+ Once installed, you can invoke the Batchalign program by typing `batchalign` into the Terminal (macOS) or Command Prompt (Windows).
125
125
 
126
126
  It is used in the following basic way:
127
127
 
@@ -131,9 +131,9 @@ batchalign [verb] [input_dir] [output_dir]
131
131
 
132
132
  Where `verb` includes:
133
133
 
134
- 1. `transcribe` - placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory, perform ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generate word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
135
- 2. `align` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, generate word-level alignments
136
- 3. `morphotag` - placing a CHAT file in the input directory, uses Stanford NLP Stanza to generate morphological and dependency analyses. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
134
+ 1. `transcribe` - by placing only an audio or video file (`.mp3/.mp4/.wav`) in the input directory, this function performs ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generates word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
135
+ 2. `align` - by placing both an audio or video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, this function recovers utterance-level time alignments (if they are not already annotated) and generates word-level alignments. The @Languages header in the CHAT file tells the program which language is in the transcript.
136
+ 3. `morphotag` - by placing a CHAT file in the input directory, this function uses Stanford NLP Stanza to generate morphological and dependency analyses. The @Languages header in the CHAT file tells the program which language is in the transcript. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
137
137
  <!-- 4. `bulletize` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *unlinked* CHAT file in the input directory, generate utterance-level alignments through ASR -->
138
138
 
139
139
  You can get a CHAT transcript to experiment with [at the TalkBank website](https://talkbank.org/), under any of the "Banks" that are available. You can also generate and parse a CHAT transcript via [the Python program](https://github.com/TalkBank/batchalign2?tab=readme-ov-file#chat).
@@ -8,7 +8,7 @@ The TalkBank Project, of which Batchalign is a part, is supported by NIH grant H
8
8
 
9
9
  ## Quick Start
10
10
 
11
- The following instructions is a quick start to install Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
11
+ The following instructions provide a quick start to installing Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
12
12
 
13
13
  ### Get Python
14
14
  - We support Python versions 3.9, 3.10, and 3.11.
@@ -38,7 +38,7 @@ py -m pip3 install -U batchalign
38
38
  ```
39
39
 
40
40
  ### Rock and Roll
41
- There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or a Python LSA library.
41
+ There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
42
42
 
43
43
  - to get started with the Batchalign program, [tap here](#quick-start-command-line)
44
44
  - to get started on the Batchalign Library (assumes familiarity with Python), [tap here](#quick-start-python)
@@ -47,7 +47,7 @@ There are two main ways of interacting with Batchalign. Batchalign can be used a
47
47
 
48
48
  ### Basic Usage
49
49
 
50
- Once installed, you can invoke the Batchalign CLI program via the `batchalign` command.
50
+ Once installed, you can invoke the Batchalign program by typing `batchalign` into the Terminal (macOS) or Command Prompt (Windows).
51
51
 
52
52
  It is used in the following basic way:
53
53
 
@@ -57,9 +57,9 @@ batchalign [verb] [input_dir] [output_dir]
57
57
 
58
58
  Where `verb` includes:
59
59
 
60
- 1. `transcribe` - placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory, perform ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generate word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
61
- 2. `align` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, generate word-level alignments
62
- 3. `morphotag` - placing a CHAT file in the input directory, uses Stanford NLP Stanza to generate morphological and dependency analyses. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
60
+ 1. `transcribe` - by placing only an audio or video file (`.mp3/.mp4/.wav`) in the input directory, this function performs ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generates word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
61
+ 2. `align` - by placing both an audio or video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, this function recovers utterance-level time alignments (if they are not already annotated) and generates word-level alignments. The @Languages header in the CHAT file tells the program which language is in the transcript.
62
+ 3. `morphotag` - by placing a CHAT file in the input directory, this function uses Stanford NLP Stanza to generate morphological and dependency analyses. The @Languages header in the CHAT file tells the program which language is in the transcript. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
63
63
  <!-- 4. `bulletize` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *unlinked* CHAT file in the input directory, generate utterance-level alignments through ASR -->
64
64
 
65
65
  You can get a CHAT transcript to experiment with [at the TalkBank website](https://talkbank.org/), under any of the "Banks" that are available. You can also generate and parse a CHAT transcript via [the Python program](https://github.com/TalkBank/batchalign2?tab=readme-ov-file#chat).
@@ -208,6 +208,7 @@ class Utterance(BaseModel):
208
208
  # t = re.sub(r"^[^\w\d\s<]+", "", t.strip()).strip()
209
209
  t = re.sub(r",", " , ", t.strip()).strip()
210
210
  t = re.sub(r" +", " ", t.strip()).strip()
211
+ t = t.replace("+ ,", "+,").strip()
211
212
  return t
212
213
 
213
214
  def __repr__(self):
@@ -0,0 +1,56 @@
1
+ """
2
+ verbforms.py
3
+ Fix Japanese verb forms.
4
+ """
5
+
6
def verbform(upos, target, text):
    """Apply hand-written corrections to a Japanese token's tag and lemma.

    Rules are checked in a fixed order and the first one that matches
    decides the result; a token that matches no rule is returned unchanged.

    Args:
        upos: Part-of-speech tag of the token. The rules compare it against
            lowercase values ("noun", "part", "aux").
        target: Lemma currently assigned to the token.
        text: Surface form of the token.

    Returns:
        A ``(upos, lemma)`` tuple holding the corrected tag and lemma.
    """
    # 遣 is only rewritten when the token was tagged as a noun.
    if "遣" in text and upos == "noun":
        return "verb", "遣る"

    # Any surface form containing one of these kanji is forced to the verb
    # lemma, whatever tag it arrived with. When a form contains more than one
    # of them, the earliest entry wins. Because these are substring checks,
    # exact-match rules for 呼ん and 脱 would never be reached after them and
    # are therefore not listed separately.
    kanji_lemmas = (
        ("死", "死ぬ"),
        ("立", "立つ"),
        ("引", "引く"),
        ("出", "出す"),
        ("飲", "飲む"),
        ("呼", "呼ぶ"),
        ("脱", "脱ぐ"),
    )
    for kanji, lemma in kanji_lemmas:
        if kanji in text:
            return "verb", lemma

    # な becomes an auxiliary whether it was tagged "part" or "aux".
    if text == "な" and upos in ("part", "aux"):
        return "aux", "な"

    # Surface forms that map to a fixed (tag, lemma) pair regardless of the
    # incoming tag; だり is normalised to the lemma たり.
    surface_rules = (
        ("だり", ("aux", "たり")),
        ("たり", ("aux", "たり")),
        ("たら", ("sconj", "たら")),
        ("たっ", ("sconj", "たって")),
    )
    for surface, fixed in surface_rules:
        if text == surface:
            return fixed

    # NOTE: a rule retagging て (sconj) as aux and a rule stripping る from
    # verb lemmas are disabled and deliberately not reproduced here.

    if text == "なさい" and target == "為さる":
        return "aux", "為さい"
    if text == "よう" and upos == "aux":
        return "aux", "よう"
    if text == "ろ" and upos == "aux" and target == "為る":
        return "aux", "ろ"

    return upos, target
55
+
56
+
@@ -233,9 +233,14 @@ def handler__VERB(word, lang=None):
233
233
  tense = feats.get("Tense", "")
234
234
  polarity = feats.get("Polarity", "")
235
235
  polite = feats.get("Polite", "")
236
- return handler(word, lang)+flag+stringify_feats(aspect, mood,
237
- tense, polarity, polite,
238
- number[:1]+person)
236
+
237
+ res = handler(word, lang)
238
+ if "sconj" in res:
239
+ return res
240
+ else:
241
+ return res+flag+stringify_feats(aspect, mood,
242
+ tense, polarity, polite,
243
+ number[:1]+person)
239
244
 
240
245
  def handler__actual_PUNCT(word, lang=None):
241
246
  # actual punctuation handler
@@ -692,7 +697,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
692
697
 
693
698
  elif not any([i in ["hr", "zh", "zh-hans", "zh-hant", "ja", "ko",
694
699
  "sl", "sr", "bg", "ru", "et", "hu",
695
- "eu", "el", "he", "af", "ga", "da"] for i in lang]):
700
+ "eu", "el", "he", "af", "ga", "da", "ro"] for i in lang]):
696
701
  if "en" in lang:
697
702
  config["processors"]["mwt"] = "gum"
698
703
  else:
@@ -878,6 +883,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
878
883
  retokenized_ut = retokenized_ut.replace(" >", ">")
879
884
  retokenized_ut = retokenized_ut.replace("< ", "<")
880
885
  retokenized_ut = retokenized_ut.replace(" :", ":")
886
+ retokenized_ut = retokenized_ut.replace("+ ,", "+,")
881
887
  retokenized_ut = retokenized_ut.replace(": <", ": <")
882
888
  retokenized_ut = retokenized_ut.replace(" ↑", "↑")
883
889
  retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut)
@@ -0,0 +1,3 @@
1
+ 0.7.3-beta.14
2
+ July 6th, 2024
3
+ UD Fixes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.3b12
3
+ Version: 0.7.3b14
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -82,7 +82,7 @@ The TalkBank Project, of which Batchalign is a part, is supported by NIH grant H
82
82
 
83
83
  ## Quick Start
84
84
 
85
- The following instructions is a quick start to install Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
85
+ The following instructions provide a quick start to installing Batchalign. For most users aiming to process CHAT and audio with Batchalign, we recommend more detailed usage instructions: for [usage](https://talkbank.org/info/BA2-usage.pdf) and [human transcript cleanup](https://talkbank.org/info/BA2-cleanup.pdf). The following provides a quick start guide for the program.
86
86
 
87
87
  ### Get Python
88
88
  - We support Python versions 3.9, 3.10, and 3.11.
@@ -112,7 +112,7 @@ py -m pip3 install -U batchalign
112
112
  ```
113
113
 
114
114
  ### Rock and Roll
115
- There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or a Python LSA library.
115
+ There are two main ways of interacting with Batchalign. Batchalign can be used as a program to batch-process CHAT (hence the name), or as a Python LSA library.
116
116
 
117
117
  - to get started with the Batchalign program, [tap here](#quick-start-command-line)
118
118
  - to get started on the Batchalign Library (assumes familiarity with Python), [tap here](#quick-start-python)
@@ -121,7 +121,7 @@ There are two main ways of interacting with Batchalign. Batchalign can be used a
121
121
 
122
122
  ### Basic Usage
123
123
 
124
- Once installed, you can invoke the Batchalign CLI program via the `batchalign` command.
124
+ Once installed, you can invoke the Batchalign program by typing `batchalign` into the Terminal (macOS) or Command Prompt (Windows).
125
125
 
126
126
  It is used in the following basic way:
127
127
 
@@ -131,9 +131,9 @@ batchalign [verb] [input_dir] [output_dir]
131
131
 
132
132
  Where `verb` includes:
133
133
 
134
- 1. `transcribe` - placing only an audio of video file (`.mp3/.mp4/.wav`) in the input directory, perform ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generate word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
135
- 2. `align` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, generate word-level alignments
136
- 3. `morphotag` - placing a CHAT file in the input directory, uses Stanford NLP Stanza to generate morphological and dependency analyses. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
134
+ 1. `transcribe` - by placing only an audio or video file (`.mp3/.mp4/.wav`) in the input directory, this function performs ASR on the audio, diarizes utterances, identifies some basic conversational features like retracing and filled pauses, and generates word-level alignments. You must supply a language code flag: `--lang=[three letter ISO language code]` for the ASR system to know what language the transcript is in. You can choose the flags `--rev` to use Rev.AI, a commercial ASR service, or `--whisper`, to use a local copy of OpenAI Whisper.
135
+ 2. `align` - by placing both an audio or video file (`.mp3/.mp4/.wav`) and an *utterance-aligned* CHAT file in the input directory, this function recovers utterance-level time alignments (if they are not already annotated) and generates word-level alignments. The @Languages header in the CHAT file tells the program which language is in the transcript.
136
+ 3. `morphotag` - by placing a CHAT file in the input directory, this function uses Stanford NLP Stanza to generate morphological and dependency analyses. The @Languages header in the CHAT file tells the program which language is in the transcript. You must supply a language code flag: `--lang=[three letter ISO language code]` for the alignment system to know what language the transcript is in.
137
137
  <!-- 4. `bulletize` - placing both an audio of video file (`.mp3/.mp4/.wav`) and an *unlinked* CHAT file in the input directory, generate utterance-level alignments through ASR -->
138
138
 
139
139
  You can get a CHAT transcript to experiment with [at the TalkBank website](https://talkbank.org/), under any of the "Banks" that are available. You can also generate and parse a CHAT transcript via [the Python program](https://github.com/TalkBank/batchalign2?tab=readme-ov-file#chat).
@@ -1,34 +0,0 @@
1
- """
2
- verbforms.py
3
- Fix Japanese verb forms.
4
- """
5
-
6
def verbform(upos, target, text):
    """Apply hand-written corrections to a Japanese token's tag and lemma.

    Rules are checked in order and the first match decides the result; a
    token that matches no rule is returned unchanged.

    Args:
        upos: Part-of-speech tag of the token. The rules compare it against
            lowercase values ("part", "sconj", "noun", "aux", "verb").
        target: Lemma currently assigned to the token.
        text: Surface form of the token.

    Returns:
        A ``(upos, lemma)`` tuple holding the corrected tag and lemma.
    """
    # な tagged as a particle becomes an auxiliary with lemma うな. A second
    # branch with the same condition could never run and has been dropped.
    if text == "な" and upos == "part":
        return "aux", "うな"
    # 呼ん keeps its incoming tag; only the lemma is pinned to the surface form.
    if text == "呼ん":
        return upos, "呼ん"

    # Surface forms that map to a fixed (tag, lemma) pair regardless of tag.
    surface_rules = (
        ("たり", ("aux", "たり")),
        ("たら", ("sconj", "たら")),
        ("たっ", ("sconj", "たって")),
    )
    for surface, fixed in surface_rules:
        if text == surface:
            return fixed

    if text == "て" and upos == "sconj":
        return "aux", "て"
    if text == "なさい" and target == "為さる":
        return "aux", "為さい"
    if text == "脱" and upos == "noun":
        return "verb", "脱"
    if text == "よう" and upos == "aux":
        return "aux", "よう"
    if text == "ろ" and upos == "aux" and target == "為る":
        return "aux", "ろ"

    # Remaining verbs: every る is removed from the lemma, not just a final one.
    if upos == "verb" and "る" in target:
        return "verb", target.replace("る", "").strip()

    return upos, target
33
-
34
-
@@ -1,3 +0,0 @@
1
- 0.7.3-beta.12
2
- June 19th, 2024
3
- UD Fixes
File without changes
File without changes
File without changes
File without changes