batchalign 0.7.1b8__tar.gz → 0.7.1b10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.1b8/batchalign.egg-info → batchalign-0.7.1b10}/PKG-INFO +1 -1
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/cli/cli.py +15 -12
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/document.py +7 -3
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/utils.py +1 -1
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/morphosyntax/ud.py +3 -0
- batchalign-0.7.1b10/batchalign/version +3 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10/batchalign.egg-info}/PKG-INFO +1 -1
- batchalign-0.7.1b8/batchalign/version +0 -3
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/LICENSE +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/MANIFEST.in +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/README.md +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/__main__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/constants.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/errors.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/setup.cfg +0 -0
- {batchalign-0.7.1b8 → batchalign-0.7.1b10}/setup.py +0 -0
@@ -119,10 +119,10 @@ batchalign.add_command(train, "models")
|
|
119
119
|
def align(ctx, in_dir, out_dir, lang, num_speakers, whisper, **kwargs):
|
120
120
|
"""Align transcripts against corresponding media files."""
|
121
121
|
def loader(file):
|
122
|
-
return CHATFile(path=os.path.abspath(file)
|
122
|
+
return CHATFile(path=os.path.abspath(file)).doc
|
123
123
|
|
124
124
|
def writer(doc, output):
|
125
|
-
CHATFile(doc=doc
|
125
|
+
CHATFile(doc=doc).write(output)
|
126
126
|
|
127
127
|
_dispatch("align", lang, num_speakers,
|
128
128
|
["cha"], ctx,
|
@@ -159,11 +159,11 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
159
159
|
def writer(doc, output):
|
160
160
|
doc.content.insert(0, CustomLine(id="Comment", type=CustomLineType.INDEPENDENT,
|
161
161
|
content=f"Batchalign {VERSION_NUMBER.strip()}, ASR Engine {asr}"))
|
162
|
-
CHATFile(doc=doc
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
162
|
+
CHATFile(doc=doc).write(output
|
163
|
+
.replace(".wav", ".cha")
|
164
|
+
.replace(".mp4", ".cha")
|
165
|
+
.replace(".mp3", ".cha"),
|
166
|
+
write_wor=kwargs.get("wor", False))
|
167
167
|
|
168
168
|
if kwargs.get("diarize"):
|
169
169
|
_dispatch("transcribe_s",
|
@@ -192,7 +192,6 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
192
192
|
def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
193
193
|
"""Perform morphosyntactic analysis on transcripts."""
|
194
194
|
|
195
|
-
|
196
195
|
def loader(file):
|
197
196
|
mwt = {}
|
198
197
|
if kwargs.get("lexicon") != None and kwargs.get("lexicon", "").strip() != "":
|
@@ -202,13 +201,17 @@ def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
202
201
|
raw = [i for i in csv.reader(df)]
|
203
202
|
for i in raw:
|
204
203
|
mwt[i[0]] = tuple(i[1:])
|
204
|
+
cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
|
205
|
+
doc = cf.doc
|
206
|
+
if str(cf).count("%mor") > 0:
|
207
|
+
doc.ba_special_["special_mor_notation"] = True
|
205
208
|
return (
|
206
|
-
|
209
|
+
doc,
|
207
210
|
{"retokenize": kwargs["retokenize"], "mwt": mwt}
|
208
211
|
)
|
209
212
|
|
210
213
|
def writer(doc, output):
|
211
|
-
CHATFile(doc=doc, special_mor_=
|
214
|
+
CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)
|
212
215
|
|
213
216
|
_dispatch("morphotag", lang, num_speakers, ["cha"], ctx,
|
214
217
|
in_dir, out_dir,
|
@@ -224,10 +227,10 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
224
227
|
"""Perform morphosyntactic analysis on transcripts."""
|
225
228
|
|
226
229
|
def loader(file):
|
227
|
-
return CHATFile(path=os.path.abspath(file)
|
230
|
+
return CHATFile(path=os.path.abspath(file)).doc
|
228
231
|
|
229
232
|
def writer(doc, output):
|
230
|
-
CHATFile(doc=doc
|
233
|
+
CHATFile(doc=doc).write(output)
|
231
234
|
|
232
235
|
_dispatch("utseg", lang, num_speakers, ["cha"], ctx,
|
233
236
|
in_dir, out_dir,
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from enum import Enum, IntEnum
|
2
|
-
from typing import Optional, List, Tuple, Union
|
2
|
+
from typing import Optional, List, Tuple, Union, Any, Dict
|
3
3
|
from typing_extensions import Annotated
|
4
4
|
|
5
5
|
from pydantic import BaseModel, Field, computed_field
|
@@ -193,9 +193,12 @@ class Utterance(BaseModel):
|
|
193
193
|
|
194
194
|
def __str__(self):
|
195
195
|
if self.text != None:
|
196
|
-
|
196
|
+
t = self.text
|
197
197
|
else:
|
198
|
-
|
198
|
+
t = self._detokenize()
|
199
|
+
|
200
|
+
t = t.replace(". . .", "+...")
|
201
|
+
return t
|
199
202
|
|
200
203
|
def __repr__(self):
|
201
204
|
return str(self)
|
@@ -331,6 +334,7 @@ class Document(BaseModel):
|
|
331
334
|
langs: List[str] = Field(default=["eng"])
|
332
335
|
# persistent digital identifier
|
333
336
|
pid: Optional[str] = Field(default=None)
|
337
|
+
ba_special_: Optional[Dict] = Field(default={})
|
334
338
|
|
335
339
|
def __repr__(self):
|
336
340
|
return "\n".join(self.transcript())
|
@@ -848,7 +848,10 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
848
848
|
for j in i]
|
849
849
|
|
850
850
|
retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
|
851
|
+
retokenized_ut = retokenized_ut.replace("^", "")
|
851
852
|
retokenized_ut = re.sub(r" +", " ", retokenized_ut)
|
853
|
+
retokenized_ut = retokenized_ut.replace("+ \"", "+\"")
|
854
|
+
retokenized_ut = retokenized_ut.replace(" >", ">")
|
852
855
|
# pray to everyone that it works---this will simply crash and ignore
|
853
856
|
# the utterance if it didn't work, so we are doing this as a sanity
|
854
857
|
# check rather than needing the parsed result
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|