batchalign 0.7.7.post1__tar.gz → 0.7.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.7.post1/batchalign.egg-info → batchalign-0.7.9}/PKG-INFO +1 -1
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/constants.py +1 -1
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/chat/generator.py +3 -3
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/chat/lexer.py +1 -1
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/chat/parser.py +7 -3
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/ud.py +1 -1
- batchalign-0.7.9/batchalign/version +3 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9/batchalign.egg-info}/PKG-INFO +1 -1
- batchalign-0.7.7.post1/batchalign/version +0 -3
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/LICENSE +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/MANIFEST.in +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/README.md +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/__main__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/document.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/errors.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/setup.cfg +0 -0
- {batchalign-0.7.7.post1 → batchalign-0.7.9}/setup.py +0 -0
@@ -1,5 +1,5 @@
|
|
1
1
|
# CHAT punctuation specifications
|
2
|
-
ENDING_PUNCT = [".", "?", "!", "+//.", "+/.", "+...", "+\"/.", "+..?", "+\".", "+//?", "+.", "+!?", "+/?", "..."]
|
2
|
+
ENDING_PUNCT = [".", "?", "!", "+//.", "+/.", "+...", "+\"/.", "+..?", "+\".", "+//?", "+.", "+!?", "+/?", "...", "(.)"]
|
3
3
|
MOR_PUNCT = ["‡", "„", ","]
|
4
4
|
CHAT_IGNORE = ["xxx", "yyy", "www"]
|
5
5
|
|
@@ -41,9 +41,9 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
|
|
41
41
|
gras.append(i.dependency)
|
42
42
|
if i.time:
|
43
43
|
has_wor = True
|
44
|
-
wor_elems.append(f"{i.text} \x15{str(i.time[0])}_{str(i.time[1])}\x15")
|
44
|
+
wor_elems.append(re.sub(r"@\w+", "", f"{i.text} \x15{str(i.time[0])}_{str(i.time[1])}\x15"))
|
45
45
|
else:
|
46
|
-
wor_elems.append(i.text)
|
46
|
+
wor_elems.append(re.sub(r"@\w+", "", i.text))
|
47
47
|
|
48
48
|
if i.coreference:
|
49
49
|
has_coref = True
|
@@ -124,7 +124,7 @@ def generate_chat_preamble(doc, birthdays=[]):
|
|
124
124
|
header = []
|
125
125
|
header.append("@Languages:\t"+", ".join(doc.langs))
|
126
126
|
header.append("@Participants:\t"+", ".join([f"{i.id} {i.name}" for i in doc.tiers]))
|
127
|
-
header.append("@Options:\tmulti")
|
127
|
+
# header.append("@Options:\tmulti")
|
128
128
|
header.append("\n".join([f"@ID:\t{i.lang}|{i.corpus}|{i.id}|{i.birthday}|{i.additional[0]}|{i.additional[1]}|{i.additional[2]}|{i.name}|{i.additional[3]}|{i.additional[4]}|" for i in doc.tiers]))
|
129
129
|
for i in birthdays:
|
130
130
|
header.append(f"@{i.id}:\t{i.content}")
|
@@ -197,7 +197,7 @@ class UtteranceLexer:
|
|
197
197
|
while True:
|
198
198
|
res = self.__pull()
|
199
199
|
try:
|
200
|
-
if res == "" or res == False or res in ENDING_PUNCT or (res[-1] in ENDING_PUNCT and re.findall("\w", res)):
|
200
|
+
if res == "" or res == False or res in ENDING_PUNCT or (res[-1] in ENDING_PUNCT and re.findall(r"\w", res)):
|
201
201
|
break
|
202
202
|
except IndexError:
|
203
203
|
raise CHATValidationException(f"Lexer failed! Utterance ended without ending punct. Utterance: {self.raw}")
|
@@ -39,13 +39,13 @@ def chat_parse_utterance(text, mor, gra, wor, additional):
|
|
39
39
|
|
40
40
|
# scan the timing
|
41
41
|
# lex the utterance
|
42
|
-
to_lex = re.compile("\x15\d+_\d+\x15").sub("", text).strip()
|
42
|
+
to_lex = re.compile(r"\x15\d+_\d+\x15").sub("", text).strip()
|
43
43
|
|
44
44
|
# if the first form has a < in it and has no words,
|
45
45
|
# its probably a beginning delimiter which we do not lex
|
46
46
|
if (len(to_lex) > 0 and
|
47
47
|
("<" in to_lex.split(" ")[0] or "+" in to_lex.split(" ")[0] )
|
48
|
-
and not re.findall("\w", to_lex.split(" ")[0])):
|
48
|
+
and not re.findall(r"\w", to_lex.split(" ")[0])):
|
49
49
|
beg = to_lex.split(" ")[0]
|
50
50
|
to_lex = to_lex.replace(beg, "", 1)
|
51
51
|
|
@@ -64,9 +64,11 @@ def chat_parse_utterance(text, mor, gra, wor, additional):
|
|
64
64
|
# fix commas for people that don't annotate commas with a space
|
65
65
|
to_lex = to_lex.replace(",", " ,")
|
66
66
|
|
67
|
-
to_lex = re.sub(r"\([\d.:]+\)", "", to_lex)
|
67
|
+
to_lex = re.sub(r"\([\d.:]+\)(?!$)", "", to_lex)
|
68
68
|
to_lex = re.sub(r"↫.*?↫", "", to_lex)
|
69
69
|
|
70
|
+
to_lex = re.sub(r"\(.\)$", r"$END_SPC$", to_lex)
|
71
|
+
|
70
72
|
# if there is a punct, move it
|
71
73
|
for end in sorted(ENDING_PUNCT, key=len, reverse=True):
|
72
74
|
if end in to_lex:
|
@@ -76,6 +78,8 @@ def chat_parse_utterance(text, mor, gra, wor, additional):
|
|
76
78
|
to_lex = to_lex.replace(" ", " ")
|
77
79
|
|
78
80
|
tokens = lex(to_lex)
|
81
|
+
if tokens[-1][0] == "END_SPC":
|
82
|
+
tokens = tokens[:-1] + [("(.)", TokenType.PUNCT)]
|
79
83
|
|
80
84
|
# correct 0 forms
|
81
85
|
res = []
|
@@ -793,7 +793,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
793
793
|
except AttributeError:
|
794
794
|
breakpoint()
|
795
795
|
|
796
|
-
if re.findall("\w", ending):
|
796
|
+
if re.findall(r"\w", ending):
|
797
797
|
ending = "."
|
798
798
|
line_cut = i.strip(join_with_spaces=True)
|
799
799
|
else:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/pipelines/morphosyntax/ja/verbforms.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.7.post1 → batchalign-0.7.9}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|