PyPI - batchalign - Versions diffs - 0.7.1b8__tar.gz → 0.7.1b10__tar.gz - Mend

@@ -119,10 +119,10 @@ batchalign.add_command(train, "models")
 def align(ctx, in_dir, out_dir, lang, num_speakers, whisper, **kwargs):
     """Align transcripts against corresponding media files."""
     def loader(file):
-        return CHATFile(path=os.path.abspath(file), special_mor_=True).doc
+        return CHATFile(path=os.path.abspath(file)).doc
     def writer(doc, output):
-        CHATFile(doc=doc, special_mor_=True).write(output)
+        CHATFile(doc=doc).write(output)
     _dispatch("align", lang, num_speakers,
               ["cha"], ctx,
@@ -159,11 +159,11 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     def writer(doc, output):
         doc.content.insert(0, CustomLine(id="Comment", type=CustomLineType.INDEPENDENT,
                                          content=f"Batchalign {VERSION_NUMBER.strip()}, ASR Engine {asr}"))
-        CHATFile(doc=doc, special_mor_=True).write(output
-                                                   .replace(".wav", ".cha")
-                                                   .replace(".mp4", ".cha")
-                                                   .replace(".mp3", ".cha"),
-                                                   write_wor=kwargs.get("wor", False))
+        CHATFile(doc=doc).write(output
+                                .replace(".wav", ".cha")
+                                .replace(".mp4", ".cha")
+                                .replace(".mp3", ".cha"),
+                                write_wor=kwargs.get("wor", False))
     if kwargs.get("diarize"):
         _dispatch("transcribe_s",
@@ -192,7 +192,6 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
 def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     """Perform morphosyntactic analysis on transcripts."""
     def loader(file):
         mwt = {}
         if kwargs.get("lexicon") != None and kwargs.get("lexicon", "").strip() != "":
@@ -202,13 +201,17 @@ def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
                 raw = [i for i in csv.reader(df)]
             for i in raw:
                 mwt[i[0]] = tuple(i[1:])
+        cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
+        doc = cf.doc
+        if str(cf).count("%mor") > 0:
+            doc.ba_special_["special_mor_notation"] = True
         return (
-            CHATFile(path=os.path.abspath(file), special_mor_=True).doc,
+            doc,
             {"retokenize": kwargs["retokenize"], "mwt": mwt}
         )
     def writer(doc, output):
-        CHATFile(doc=doc, special_mor_=True).write(output)
+        CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)
     _dispatch("morphotag", lang, num_speakers, ["cha"], ctx,
               in_dir, out_dir,
@@ -224,10 +227,10 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     """Perform morphosyntactic analysis on transcripts."""
     def loader(file):
-        return CHATFile(path=os.path.abspath(file), special_mor_=True).doc
+        return CHATFile(path=os.path.abspath(file)).doc
     def writer(doc, output):
-        CHATFile(doc=doc, special_mor_=True).write(output)
+        CHATFile(doc=doc).write(output)
     _dispatch("utseg", lang, num_speakers, ["cha"], ctx,
               in_dir, out_dir,

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.1b8
+Version: 0.7.1b10
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu

@@ -1,5 +1,5 @@
 from enum import Enum, IntEnum
-from typing import Optional, List, Tuple, Union
+from typing import Optional, List, Tuple, Union, Any, Dict
 from typing_extensions import Annotated
 from pydantic import BaseModel, Field, computed_field
@@ -193,9 +193,12 @@ class Utterance(BaseModel):
     def __str__(self):
         if self.text != None:
-            return self.text
+            t = self.text
         else:
-            return self._detokenize()
+            t = self._detokenize()
+        t = t.replace(". . .", "+...")
+        return t
     def __repr__(self):
         return str(self)
@@ -331,6 +334,7 @@ class Document(BaseModel):
     langs: List[str] = Field(default=["eng"])
     # persistent digital identifier
     pid: Optional[str] = Field(default=None)
+    ba_special_: Optional[Dict] = Field(default={})
     def __repr__(self):
         return "\n".join(self.transcript())

@@ -188,7 +188,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
         final_utterances.append(Utterance(
             tier=participant,
-            content = words
+            content=words
         ))
     doc.content = final_utterances

@@ -848,7 +848,10 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
                                     for j in i]
                 retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
+                retokenized_ut = retokenized_ut.replace("^", "")
                 retokenized_ut = re.sub(r" +", " ", retokenized_ut)
+                retokenized_ut = retokenized_ut.replace("+ \"", "+\"")
+                retokenized_ut = retokenized_ut.replace(" >", ">")
                 # pray to everyone that it works---this will simply crash and ignore
                 # the utterance if it didn't work, so we are doing this as a sanity
                 # check rather than needing the parsed result

@@ -0,0 +1,3 @@
+0.7.1-beta.10
+May 21st, 2024
+patch minor ud bugs

@@ -1,3 +0,0 @@
-0.7.1-beta.8
-May 21st, 2024
-better retokenize algorithm

batchalign 0.7.1b8__tar.gz → 0.7.1b10__tar.gz

batchalign 0.7.1b8__tar.gz → 0.7.1b10__tar.gz