batchalign 0.7.6a12__tar.gz → 0.7.6a14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.6a12/batchalign.egg-info → batchalign-0.7.6a14}/PKG-INFO +1 -1
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/document.py +1 -2
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/utils.py +1 -1
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/ud.py +24 -7
- batchalign-0.7.6a14/batchalign/version +3 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14/batchalign.egg-info}/PKG-INFO +1 -1
- batchalign-0.7.6a12/batchalign/version +0 -3
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/LICENSE +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/MANIFEST.in +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/README.md +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/__main__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/constants.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/errors.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/setup.cfg +0 -0
- {batchalign-0.7.6a12 → batchalign-0.7.6a14}/setup.py +0 -0
@@ -324,8 +324,7 @@ class Utterance(BaseModel):
|
|
324
324
|
to_include.append(TokenType.RETRACE)
|
325
325
|
if include_fp:
|
326
326
|
to_include.append(TokenType.FP)
|
327
|
-
filtered = filter(lambda x:x.type in to_include,
|
328
|
-
self.content)
|
327
|
+
filtered = filter(lambda x:x.type in to_include, self.content)
|
329
328
|
# chain them together
|
330
329
|
if join_with_spaces:
|
331
330
|
return " ".join([i.text for i in filtered])
|
@@ -146,7 +146,7 @@ def annotation_clean(content, special=False):
|
|
146
146
|
cleaned_word = cleaned_word.replace("~","").replace("&~","")
|
147
147
|
cleaned_word = cleaned_word.replace(">","").replace("<","")
|
148
148
|
cleaned_word = cleaned_word.replace("〕","").replace("//","").replace(";","")
|
149
|
-
cleaned_word = re.sub(r"@[^
|
149
|
+
cleaned_word = re.sub(r"@[^abcefpoqsw]", '', cleaned_word)
|
150
150
|
cleaned_word = re.sub(r"&.", '', cleaned_word)
|
151
151
|
|
152
152
|
return cleaned_word
|
@@ -837,11 +837,17 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
837
837
|
L.debug(f"Encountered an utterance that's likely devoid of morphological information; skipping... utterance='{doc.content[indx]}'")
|
838
838
|
continue
|
839
839
|
|
840
|
+
|
840
841
|
if retokenize:
|
841
842
|
# rewrite the sentence with our desired tokenizations
|
842
843
|
ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
|
843
844
|
mor, gra,
|
844
845
|
None, None)
|
846
|
+
# fix xbxxx
|
847
|
+
for i in ut:
|
848
|
+
if i.text == "xbxxx" and len(i.morphology) > 0:
|
849
|
+
i.text = i.morphology[0].lemma
|
850
|
+
|
845
851
|
# split the text up into previous chunks
|
846
852
|
chunks = list(enumerate(doc.content[indx].text.split(" ")))
|
847
853
|
# filter out everything that could not possibly align
|
@@ -866,6 +872,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
866
872
|
for i,j in enumerate(ut):
|
867
873
|
for k in j.text:
|
868
874
|
ud_chars.append(ReferenceTarget(k, payload=i))
|
875
|
+
creaky = False
|
876
|
+
collected = ""
|
869
877
|
# brrr
|
870
878
|
aligned = align(chunks_chars, ud_chars, tqdm=False)
|
871
879
|
for i in aligned:
|
@@ -873,16 +881,23 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
873
881
|
if i.reference_payload not in chunks_backplate[i.payload]:
|
874
882
|
chunks_backplate[i.payload].append(i.reference_payload)
|
875
883
|
elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
|
876
|
-
|
877
|
-
|
884
|
+
if i.key == "*":
|
885
|
+
creaky = not creaky
|
886
|
+
chunks_backplate[i.payload].append("*"+collected+"*")
|
887
|
+
collected = ""
|
888
|
+
elif creaky:
|
889
|
+
collected += i.key
|
890
|
+
elif not creaky:
|
891
|
+
chunks_backplate[i.payload].append(i.key)
|
878
892
|
# we want to replace the morphology of forms that are not actually
|
879
893
|
# supposed to be analyzed
|
880
894
|
elif isinstance(i, Extra) and i.extra_type == ExtraType.REFERENCE:
|
881
|
-
ut[i.payload].
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
895
|
+
if ut[i.payload].text != ",":
|
896
|
+
ut[i.payload].morphology = [Morphology(
|
897
|
+
lemma = sents[0].tokens[i.payload].text if len(sents) > 0 and len(sents[0].tokens) > i.payload and sents[0].tokens[i.payload].text != "xbxxx" else ut[i.payload].text,
|
898
|
+
pos = "x",
|
899
|
+
feats = ""
|
900
|
+
)]
|
886
901
|
|
887
902
|
poses = [i.morphology[0].pos.upper() for i in ut
|
888
903
|
if i.morphology
|
@@ -908,6 +923,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
908
923
|
retokenized_ut = retokenized_ut.replace(" ↑", "↑")
|
909
924
|
retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut)
|
910
925
|
retokenized_ut = retokenized_ut.replace(" @", "@")
|
926
|
+
retokenized_ut = re.sub(r"\*[* ]*", "*", retokenized_ut)
|
927
|
+
retokenized_ut = re.sub(r"\*(.*?)\*", r"*\1* ", retokenized_ut)
|
911
928
|
# pray to everyone that it works---this will simply crash and ignore
|
912
929
|
# the utterance if it didn't work, so we are doing this as a sanity
|
913
930
|
# check rather than needing the parsed result
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/ja/verbforms.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|