PyPI - batchalign - Versions diffs - 0.7.1b7__tar.gz → 0.7.1b9__tar.gz - Mend

batchalign 0.7.1b7.tar.gz → 0.7.1b9.tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (107)

{batchalign-0.7.1b7/batchalign.egg-info → batchalign-0.7.1b9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.1b7
+Version: 0.7.1b9
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu

{batchalign-0.7.1b7 → batchalign-0.7.1b9}/batchalign/cli/cli.py RENAMED Viewed

@@ -119,10 +119,10 @@ batchalign.add_command(train, "models")
 def align(ctx, in_dir, out_dir, lang, num_speakers, whisper, **kwargs):
     """Align transcripts against corresponding media files."""
     def loader(file):
-        return CHATFile(path=os.path.abspath(file), special_mor_=True).doc
+        return CHATFile(path=os.path.abspath(file)).doc
     def writer(doc, output):
-        CHATFile(doc=doc, special_mor_=True).write(output)
+        CHATFile(doc=doc).write(output)
     _dispatch("align", lang, num_speakers,
               ["cha"], ctx,
@@ -159,11 +159,11 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     def writer(doc, output):
         doc.content.insert(0, CustomLine(id="Comment", type=CustomLineType.INDEPENDENT,
                                          content=f"Batchalign {VERSION_NUMBER.strip()}, ASR Engine {asr}"))
-        CHATFile(doc=doc, special_mor_=True).write(output
-                                                   .replace(".wav", ".cha")
-                                                   .replace(".mp4", ".cha")
-                                                   .replace(".mp3", ".cha"),
-                                                   write_wor=kwargs.get("wor", False))
+        CHATFile(doc=doc).write(output
+                                .replace(".wav", ".cha")
+                                .replace(".mp4", ".cha")
+                                .replace(".mp3", ".cha"),
+                                write_wor=kwargs.get("wor", False))
     if kwargs.get("diarize"):
         _dispatch("transcribe_s",
@@ -192,7 +192,6 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
 def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     """Perform morphosyntactic analysis on transcripts."""
     def loader(file):
         mwt = {}
         if kwargs.get("lexicon") != None and kwargs.get("lexicon", "").strip() != "":
@@ -202,13 +201,17 @@ def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
                 raw = [i for i in csv.reader(df)]
             for i in raw:
                 mwt[i[0]] = tuple(i[1:])
+        cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
+        doc = cf.doc
+        if str(cf).count("%mor") > 0:
+            doc.ba_special_["special_mor_notation"] = True
         return (
-            CHATFile(path=os.path.abspath(file), special_mor_=True).doc,
+            doc,
             {"retokenize": kwargs["retokenize"], "mwt": mwt}
         )
     def writer(doc, output):
-        CHATFile(doc=doc, special_mor_=True).write(output)
+        CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)
     _dispatch("morphotag", lang, num_speakers, ["cha"], ctx,
               in_dir, out_dir,
@@ -224,10 +227,10 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     """Perform morphosyntactic analysis on transcripts."""
     def loader(file):
-        return CHATFile(path=os.path.abspath(file), special_mor_=True).doc
+        return CHATFile(path=os.path.abspath(file)).doc
     def writer(doc, output):
-        CHATFile(doc=doc, special_mor_=True).write(output)
+        CHATFile(doc=doc).write(output)
     _dispatch("utseg", lang, num_speakers, ["cha"], ctx,
               in_dir, out_dir,

{batchalign-0.7.1b7 → batchalign-0.7.1b9}/batchalign/document.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from enum import Enum, IntEnum
-from typing import Optional, List, Tuple, Union
+from typing import Optional, List, Tuple, Union, Any, Dict
 from typing_extensions import Annotated
 from pydantic import BaseModel, Field, computed_field
@@ -331,6 +331,7 @@ class Document(BaseModel):
     langs: List[str] = Field(default=["eng"])
     # persistent digital identifier
     pid: Optional[str] = Field(default=None)
+    ba_special_: Optional[Dict] = Field(default={})
     def __repr__(self):
         return "\n".join(self.transcript())

{batchalign-0.7.1b7 → batchalign-0.7.1b9}/batchalign/pipelines/morphosyntax/ud.py RENAMED Viewed

@@ -37,6 +37,7 @@ repath_file = lambda file_path, new_dir: os.path.join(new_dir, pathlib.Path(file
 from batchalign.document import *
+from batchalign.constants import *
 from batchalign.pipelines.base import *
 from batchalign.formats.chat.parser import chat_parse_utterance
@@ -808,27 +809,52 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
                 ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
                                           mor, gra,
                                           None, None)
-                # JANK add a space after every form being analyzed
-                text_fixed = []
-                text_orig = i.text
-                # we do this to force one replacement of the token
-                # for every input token
-                for i in sents[0].tokens:
-                    try:
-                        before, after = text_orig.split(i.text, 1)
-                        text_fixed.append(before.strip())
-                        text_fixed.append(i.text.strip())
-                        text_orig = after
-                    except ValueError:
-                        # we give up on that token; likely not found
-                        # because there we tokenization issues (i.e.
-                        # existing tokenization)
-                        continue
-                text_fixed.append(text_orig.strip())
-                text_fixed = " ".join(text_fixed).strip()
-                text_fixed = re.sub(r" +", " ", text_fixed)
+                # split the text up into previous chunks
+                chunks = list(enumerate(doc.content[indx].text.split(" ")))
+                # filter out everything that could not possibly align
+                chunks_align = [(i,j) for i,j in chunks
+                                if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
+                                                   and (len(j) <= 2 or (j[-2] not in "@"))
+                                and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"]]
+                # hollow out anything we are trying to align, and leave everything else
+                chunks_backplate = [[j]
+                                    if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
+                                    and (len(j) <= 2 or (j[-2] not in "@"))
+                                            and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"])
+                                    else
+                                    []
+                                    for i,j in chunks]
+                # render each into a list
+                chunks_chars = []
+                for i,j in chunks_align:
+                    for k in j:
+                        chunks_chars.append(PayloadTarget(k, payload=i))
+                ud_chars = []
+                for i,j in enumerate(ut):
+                    for k in j.text:
+                        ud_chars.append(ReferenceTarget(k, payload=i))
+                # brrr
+                aligned = align(chunks_chars, ud_chars, tqdm=False)
+                for i in aligned:
+                    if isinstance(i, Match):
+                        if i.reference_payload not in chunks_backplate[i.payload]:
+                            chunks_backplate[i.payload].append(i.reference_payload)
+                    elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
+                        # just put it back
+                        chunks_backplate[i.payload].append(i.key)
+                # resolve all the numbers and flatten
+                chunks_backplate = [j if isinstance(j, str) else ut[j].text
+                                    for i in chunks_backplate
+                                    for j in i]
+                retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
+                retokenized_ut = re.sub(r" +", " ", retokenized_ut)
+                # pray to everyone that it works---this will simply crash and ignore
+                # the utterance if it didn't work, so we are doing this as a sanity
+                # check rather than needing the parsed result
+                _1, _2 = chat_parse_utterance(retokenized_ut, mor, gra, None, None)
                 doc.content[indx] = Utterance(content=ut,
-                                              text=text_fixed,
+                                              text=retokenized_ut,
                                               tier=doc.content[indx].tier,
                                               time=doc.content[indx].time,
                                               custom_dependencies=doc.content[indx].custom_dependencies)

batchalign-0.7.1b9/batchalign/version ADDED Viewed

@@ -0,0 +1,3 @@
+0.7.1-beta.9
+May 21st, 2024
+minute %umor implementation changes

{batchalign-0.7.1b7 → batchalign-0.7.1b9/batchalign.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.1b7
+Version: 0.7.1b9
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu