batchalign 0.7.1b12__tar.gz → 0.7.1b14__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.1b12/batchalign.egg-info → batchalign-0.7.1b14}/PKG-INFO +1 -1
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/morphosyntax/ud.py +16 -4
- batchalign-0.7.1b14/batchalign/version +3 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14/batchalign.egg-info}/PKG-INFO +1 -1
- batchalign-0.7.1b12/batchalign/version +0 -3
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/LICENSE +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/MANIFEST.in +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/README.md +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/__main__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/constants.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/document.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/errors.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/setup.cfg +0 -0
- {batchalign-0.7.1b12 → batchalign-0.7.1b14}/setup.py +0 -0
@@ -131,6 +131,9 @@ def handler(word, lang=None):
|
|
131
131
|
# fix dash
|
132
132
|
target = target.replace("-", "–")
|
133
133
|
|
134
|
+
if "“" in target:
|
135
|
+
target = word.text
|
136
|
+
|
134
137
|
return f"{'' if not unknown else '0'}{word.upos.lower()}|{target}"
|
135
138
|
|
136
139
|
# POS specific handler
|
@@ -813,14 +816,14 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
813
816
|
chunks = list(enumerate(doc.content[indx].text.split(" ")))
|
814
817
|
# filter out everything that could not possibly align
|
815
818
|
chunks_align = [(i,j) for i,j in chunks
|
816
|
-
if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"]) and (j[-1] not in ["]"])
|
819
|
+
if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15", "(", ")"]) and (j[-1] not in ["]"])
|
817
820
|
and ("@" not in j)
|
818
|
-
and j.strip() not in
|
821
|
+
and j.strip() not in MOR_PUNCT + CHAT_IGNORE + ["++"]]
|
819
822
|
# hollow out anything we are trying to align, and leave everything else
|
820
823
|
chunks_backplate = [[j]
|
821
|
-
if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"]) and (j[-1] not in ["]"])
|
824
|
+
if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15", "(", ")"]) and (j[-1] not in ["]"])
|
822
825
|
and ("@" not in j)
|
823
|
-
and j.strip() not in
|
826
|
+
and j.strip() not in MOR_PUNCT + CHAT_IGNORE + ["++"])
|
824
827
|
else
|
825
828
|
[]
|
826
829
|
for i,j in chunks]
|
@@ -842,6 +845,14 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
842
845
|
elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
|
843
846
|
# just put it back
|
844
847
|
chunks_backplate[i.payload].append(i.key)
|
848
|
+
# we want to replace the morphology of forms that are not actually
|
849
|
+
# supposed to be analyzed
|
850
|
+
elif isinstance(i, Extra) and i.extra_type == ExtraType.REFERENCE:
|
851
|
+
ut[i.payload].morphology = [Morphology(
|
852
|
+
lemma = sents[0].tokens[i.payload].text,
|
853
|
+
pos = "x",
|
854
|
+
feats = ""
|
855
|
+
)]
|
845
856
|
# resolve all the numbers and flatten
|
846
857
|
chunks_backplate = [j if isinstance(j, str) else ut[j].text
|
847
858
|
for i in chunks_backplate
|
@@ -855,6 +866,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
855
866
|
retokenized_ut = retokenized_ut.replace("< ", "<")
|
856
867
|
retokenized_ut = retokenized_ut.replace(" :", ":")
|
857
868
|
retokenized_ut = retokenized_ut.replace(": <", ": <")
|
869
|
+
retokenized_ut = retokenized_ut.replace(" ↑", "↑")
|
858
870
|
retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut)
|
859
871
|
retokenized_ut = retokenized_ut.replace(" @", "@")
|
860
872
|
# pray to everyone that it works---this will simply crash and ignore
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.1b12 → batchalign-0.7.1b14}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|