PyPI - batchalign - Versions diffs - 0.7.1b6__tar.gz → 0.7.1b7__tar.gz - Mend

batchalign 0.7.1b6.tar.gz → 0.7.1b7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

{batchalign-0.7.1b6/batchalign.egg-info → batchalign-0.7.1b7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.1b6
+Version: 0.7.1b7
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu

{batchalign-0.7.1b6 → batchalign-0.7.1b7}/batchalign/cli/cli.py RENAMED Viewed

@@ -149,19 +149,22 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
     def loader(file):
         return file
+    asr = "rev"
+    if kwargs["whisper"]:
+        asr = "whisper"
+    if kwargs["whisperx"]:
+        asr = "whisperx"
     def writer(doc, output):
+        doc.content.insert(0, CustomLine(id="Comment", type=CustomLineType.INDEPENDENT,
+                                         content=f"Batchalign {VERSION_NUMBER.strip()}, ASR Engine {asr}"))
         CHATFile(doc=doc, special_mor_=True).write(output
                                                    .replace(".wav", ".cha")
                                                    .replace(".mp4", ".cha")
                                                    .replace(".mp3", ".cha"),
                                                    write_wor=kwargs.get("wor", False))
-    asr = "rev"
-    if kwargs["whisper"]:
-        asr = "whisper"
-    if kwargs["whisperx"]:
-        asr = "whisperx"
     if kwargs.get("diarize"):
         _dispatch("transcribe_s",
                   lang, num_speakers, ["mp3", "mp4", "wav"], ctx,

{batchalign-0.7.1b6 → batchalign-0.7.1b7}/batchalign/formats/chat/parser.py RENAMED Viewed

@@ -222,7 +222,7 @@ def chat_parse_doc(lines, special_mor=False):
                 continue
             # we split because there are multiple languages possible
             elif "@Languages" in line.strip():
-                results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().split(",")]
+                results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().replace(" ", ",").strip().split(",") if i.strip() != ""]
                 if len(results["langs"]) > 0 and results["langs"][0] == "eng" and special_mor:
                     use_special_mor = True
             # parse participants; the number of | delinates the metedata field

{batchalign-0.7.1b6 → batchalign-0.7.1b7}/batchalign/pipelines/morphosyntax/ud.py RENAMED Viewed

@@ -808,7 +808,27 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
                 ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
                                           mor, gra,
                                           None, None)
+                # JANK add a space after every form being analyzed
+                text_fixed = []
+                text_orig = i.text
+                # we do this to force one replacement of the token
+                # for every input token
+                for i in sents[0].tokens:
+                    try:
+                        before, after = text_orig.split(i.text, 1)
+                        text_fixed.append(before.strip())
+                        text_fixed.append(i.text.strip())
+                        text_orig = after
+                    except ValueError:
+                        # we give up on that token; likely not found
+                        # because there we tokenization issues (i.e.
+                        # existing tokenization)
+                        continue
+                text_fixed.append(text_orig.strip())
+                text_fixed = " ".join(text_fixed).strip()
+                text_fixed = re.sub(r" +", " ", text_fixed)
                 doc.content[indx] = Utterance(content=ut,
+                                              text=text_fixed,
                                               tier=doc.content[indx].tier,
                                               time=doc.content[indx].time,
                                               custom_dependencies=doc.content[indx].custom_dependencies)

batchalign-0.7.1b7/batchalign/version ADDED Viewed

@@ -0,0 +1,3 @@
+0.7.1-beta.7
+May 21st, 2024
+insert debug info to transcribe file

{batchalign-0.7.1b6 → batchalign-0.7.1b7/batchalign.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.1b6
+Version: 0.7.1b7
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu