PyPI - batchalign - Versions diffs - 0.7.13.post1__tar.gz → 0.7.15__tar.gz - Mend

batchalign 0.7.13.post1.tar.gz → 0.7.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (120) hide show

{batchalign-0.7.13.post1/batchalign.egg-info → batchalign-0.7.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.13.post1
+Version: 0.7.15
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -33,6 +33,9 @@ Requires-Dist: soundfile~=0.12.0
 Requires-Dist: rich-click>=1.7.0
 Requires-Dist: typing-extensions
 Requires-Dist: num2words
+Requires-Dist: tiktoken
+Requires-Dist: blobfile
+Requires-Dist: sentencepiece
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Provides-Extra: train

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/cli.py RENAMED Viewed

@@ -196,6 +196,28 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
                   loader, writer, C,
                   asr=asr, **kwargs)
+#################### TRANSLATE ################################
+# CLI subcommand `translate`: registered on the `batchalign` command group; it
+# builds a per-file loader and writer and hands them to _dispatch under the
+# command name "translate".
+@batchalign.command()
+@common_options
+@click.pass_context
+def translate(ctx, in_dir, out_dir, **kwargs):
+    """Translate the transcript to English."""
+    # loader: read one input file through CHATFile (path made absolute,
+    # special_mor_=True always passed) and return the parsed document.
+    def loader(file):
+        cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
+        doc = cf.doc
+        # if str(cf).count("%mor") > 0:
+        #     doc.ba_special_["special_mor_notation"] = True
+        return doc
+    # writer: wrap the processed document in a CHATFile and write it to `output`.
+    def writer(doc, output):
+        CHATFile(doc=doc).write(output)
+    # "eng", 1 and ["cha"] are hard-coded rather than taken from CLI options;
+    # presumably language, speaker count and accepted input extensions --
+    # TODO confirm against _dispatch's signature, which is not visible here.
+    # Note that **kwargs is accepted above but not forwarded to _dispatch.
+    _dispatch("translate", "eng", 1, ["cha"], ctx,
+              in_dir, out_dir,
+              loader, writer, C)
 #################### MORPHOTAG ################################
 @batchalign.command()

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/dispatch.py RENAMED Viewed

@@ -48,6 +48,7 @@ Cmd2Task = {
     "benchmark": "asr,eval",
     "utseg": "utterance",
     "coref": "coref",
+    "translate": "translate",
 }
 # this is the main runner used by all functions

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/document.py RENAMED Viewed

@@ -31,6 +31,7 @@ class Task(IntEnum):
     MORPHOSYNTAX = 11
     COREF = 12
     WER = 13
+    TRANSLATE = 14
     DEBUG__G = 0
@@ -54,6 +55,7 @@ TypeMap = {
     Task.DISFLUENCY_ANALYSIS: TaskType.PROCESSING,
     Task.COREF: TaskType.PROCESSING,
     Task.WER: TaskType.ANALYSIS,
+    Task.TRANSLATE: TaskType.PROCESSING,
     Task.DEBUG__G: TaskType.GENERATION,
     Task.DEBUG__P: TaskType.PROCESSING,
@@ -73,6 +75,7 @@ TaskFriendlyName = {
     Task.DISFLUENCY_ANALYSIS:  "Disfluncy Analysis",
     Task.COREF:  "Coreference Resolution",
     Task.WER:  "Word Error Rate",
+    Task.TRANSLATE:  "Translation",
     Task.DEBUG__G:  "TEST_GENERATION",
     Task.DEBUG__P:  "TEST_PROCESSING",
     Task.DEBUG__A:   "TEST_ANALYSIS",
@@ -150,6 +153,7 @@ class Utterance(BaseModel):
     tier: Tier = Field(default=Tier())
     content: Sentence
     text: Optional[str] = Field(default=None)
+    translation: Optional[str] = Field(default=None)
     time: Optional[Tuple[int,int]] = Field(default=None)
     custom_dependencies: List[CustomLine]  = Field(default=[])

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/generator.py RENAMED Viewed

@@ -95,7 +95,8 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
         result.append("%wor:\t"+" ".join(wor_elems))
     if has_coref:
         result.append("%coref:\t"+" ".join(coref_elems))
+    if utterance.translation != None:
+        result.append("%xtra:\t"+utterance.translation)
     #### EXTRA LINE GENERATION ####

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/parser.py RENAMED Viewed

@@ -280,6 +280,7 @@ def chat_parse_doc(lines, special_mor=False):
                 mor = None
                 gra = None
                 wor = None
+                translation = None
                 additional = []
                 while raw[0][0] == "%":
@@ -291,6 +292,8 @@ def chat_parse_doc(lines, special_mor=False):
                         gra = line
                     elif beg.strip() == "wor" or beg.strip() == "xwor":
                         wor = line
+                    elif beg.strip() == "xtra":
+                        translation = line
                     else:
                         additional.append(CustomLine(id=beg.strip(),
                                                         type=CustomLineType.DEPENDENT,
@@ -309,7 +312,8 @@ def chat_parse_doc(lines, special_mor=False):
                     "content": parsed,
                     "text": text,
                     "delim": delim,
-                    "custom_dependencies": additional
+                    "custom_dependencies": additional,
+                    "translation": translation
                 })
                 timing = re.findall(rf"\x15(\d+)_(\d+)\x15", text)

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from .utterance import BertUtteranceModel
+from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
 from .whisper import WhisperASRModel, WhisperFAModel
 from .speaker import NemoSpeakerModel
 from .utils import ASRAudioFile

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/resolve.py RENAMED Viewed

@@ -8,7 +8,7 @@ resolver = {
     "utterance": {
         'eng': "talkbank/CHATUtterance-en",
         "zho": "talkbank/CHATUtterance-zh_CN",
-        "yue": "talkbank/CHATUtterance-zh_CN",
+        "yue": "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation",
     },
     "whisper": {
         'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),

batchalign-0.7.15/batchalign/models/utterance/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .infer import BertUtteranceModel
+from .cantonese_infer import BertCantoneseUtteranceModel

batchalign-0.7.15/batchalign/models/utterance/cantonese_infer.py ADDED Viewed

@@ -0,0 +1,164 @@
+import re
+import string
+import random
+# tokenization utilities
+import nltk
+from nltk import word_tokenize, sent_tokenize
+# torch
+import torch
+from torch.utils.data import dataset
+from torch.utils.data.dataloader import DataLoader
+from torch.optim import AdamW
+# import huggingface utils
+from transformers import AutoTokenizer, BertForTokenClassification
+from transformers import DataCollatorForTokenClassification
+# tqdm
+from tqdm import tqdm
+# seed device and tokens
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+# seed model
+# Callable wrapper around a BERT token-classification checkpoint used to split
+# a passage into utterances: the passage is lowercased and stripped of
+# punctuation, pre-chunked on a fixed keyword list, each chunk is labelled
+# character by character, and the re-punctuated text is split with NLTK's
+# sent_tokenize.
+class BertCantoneseUtteranceModel(object):
+    # model: identifier handed to from_pretrained for both the tokenizer and
+    # the classifier.
+    def __init__(self, model):
+        # seed tokenizers and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
+        # passed to the tokenizer as max_length in __call__
+        self.max_length = 512
+        # NOTE(review): overlap is stored and printed below but is not read
+        # anywhere else in this class.
+        self.overlap = 20
+        # eval mode
+        self.model.eval()
+        print(f"Model and tokenizer initialized on device: {DEVICE}")
+        print(f"Max length set to {self.max_length} with overlap of {self.overlap}")
+    # passage: raw text (str). Returns the list produced by sent_tokenize over
+    # the re-punctuated text, or an empty list if model inference raises.
+    # Progress and chunk previews are printed to stdout along the way.
+    def __call__(self, passage):
+        # Step 1: Clean up passage
+        # (lowercase, then delete the listed ASCII and fullwidth punctuation
+        # marks as well as the character 'ｌ')
+        passage = passage.lower()
+        passage = passage.replace('.','')
+        passage = passage.replace(',','')
+        passage = passage.replace('!','')
+        passage = passage.replace('！','')
+        passage = passage.replace('？','')
+        passage = passage.replace('。','')
+        passage = passage.replace('，','')
+        passage = passage.replace('?','')
+        passage = passage.replace('（','')
+        passage = passage.replace('）','')
+        passage = passage.replace('：','')
+        passage = passage.replace('＊','')
+        passage = passage.replace('ｌ','')
+        # Step 2: Define keywords and split the passage based on them
+        # Each chunk ends immediately after the earliest keyword occurrence at
+        # or after `start`, so the keyword stays at the end of its chunk.
+        keywords = ['呀', '啦', '喎', '嘞', '㗎喇', '囉', '㗎', '啊', '嗯']  # Replace with your desired keywords
+        chunks = []
+        start = 0
+        while start < len(passage):
+            # Find the position of each keyword in the passage starting from the current `start`
+            keyword_positions = [(keyword, passage.find(keyword, start)) for keyword in keywords]
+            # Filter out keywords that are not found (find() returns -1 if not found)
+            keyword_positions = [kp for kp in keyword_positions if kp[1] != -1]
+            if keyword_positions:
+                # Find the keyword that appears first in the passage from current start
+                # (min keeps the first minimum, so when two keywords match at the
+                # same position the one listed earlier wins: '㗎喇' before '㗎')
+                first_keyword, keyword_pos = min(keyword_positions, key=lambda x: x[1])
+                chunk = passage[start:keyword_pos + len(first_keyword)]
+                chunks.append(chunk)
+                start = keyword_pos + len(first_keyword)
+            else:
+                # No more keywords found, add the rest of the passage as the last chunk
+                chunks.append(passage[start:])
+                break
+        # Debugging: Print number of chunks and their content
+        print(f"Created {len(chunks)} chunks based on keywords.")
+        for i, chunk in enumerate(chunks):
+            print(f"Chunk {i + 1}: {chunk[:100]}...")  # Print the first 100 characters of each chunk
+        # Step 3: Process each chunk and restore punctuation
+        final_passage = []
+        for chunk_index, chunk in enumerate(chunks):
+            print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
+            # Step 3.1: Split chunk by characters (Chinese tokenization)
+            tokenized_chunk = list(chunk)  # Simply split by characters for Chinese text
+            # Step 3.2: Pass chunk through the tokenizer and model
+            # NOTE(review): with truncation=True a chunk that tokenizes to more
+            # than max_length is presumably cut, and the characters past the
+            # cut would never reach res_toks -- confirm whether that is intended.
+            tokd = self.tokenizer.batch_encode_plus([tokenized_chunk],
+                                  return_tensors='pt',
+                                  truncation=True,
+                                  padding=True,
+                                  max_length=self.max_length,
+                                  is_split_into_words=True).to(DEVICE)
+            try:
+                # Pass it through the model
+                res = self.model(**tokd).logits
+            except Exception as e:
+                # any inference failure ends the whole call with an empty list;
+                # chunks already collected in final_passage are discarded
+                print(f"Error during model inference: {e}")
+                return []
+            # Argmax for classification
+            classified_targets = torch.argmax(res, dim=2).cpu()
+            # Initialize result tokens list for the current chunk
+            res_toks = []
+            prev_word_idx = None
+            # Iterate over tokenized words
+            wids = tokd.word_ids(0)
+            for indx, elem in enumerate(wids):
+                # skip positions with no word id and positions that repeat the
+                # previous word id, so each input character is handled once
+                if elem is None or elem == prev_word_idx:
+                    continue
+                prev_word_idx = elem
+                # predicted class id at this token position
+                action = classified_targets[0][indx]
+                # Get the word corresponding to the token
+                w = tokenized_chunk[elem]  # Use tokenized chunk here
+                # Fix one word hanging issue (if needed)
+                # Look-ahead: when the next token position has a non-zero
+                # prediction, this word is left unedited. The look-ahead is
+                # not applied at the last two positions of wids.
+                will_action = False
+                if indx < len(wids) - 2 and classified_targets[0][indx + 1] > 0:
+                    will_action = True
+                if not will_action:
+                    # Perform the edits based on model predictions
+                    if action == 1:  # First capital letter
+                        w = w[0].upper() + w[1:]
+                    elif action == 2:  # Add period
+                        w = w + '.'
+                    elif action == 3:  # Add question mark
+                        w = w + '?'
+                    elif action == 4:  # Add exclamation mark
+                        w = w + '!'
+                    elif action == 5:  # Add comma
+                        w = w + ','
+                # Append modified word to result list
+                res_toks.append(w)
+            # Convert list of tokens back to string and append to final_passage
+            final_passage.append(self.tokenizer.convert_tokens_to_string(res_toks))
+        # Step 4: Join processed chunks together into the final passage
+        final_text = ' '.join(final_passage)
+        print("Text processing completed. Generating final output...")
+        # Optionally, tokenize the final text into sentences based on punctuation
+        # (if the NLTK 'punkt' resource is missing, download it and retry once)
+        try:
+            split_passage = sent_tokenize(final_text)
+        except LookupError:
+            nltk.download('punkt')
+            split_passage = sent_tokenize(final_text)
+        return split_passage

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/infer_asr.py RENAMED Viewed

@@ -33,6 +33,7 @@ import pycountry
 import logging
 L = logging.getLogger("batchalign")
+# DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
 # PYTORCH_ENABLE_MPS_FALLBACK=1

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/__init__.py RENAMED Viewed

@@ -12,3 +12,4 @@ from .utr import WhisperUTREngine, RevUTREngine
 from .analysis import EvaluationEngine
 from .utterance import StanzaUtteranceEngine
+from .translate import SeamlessTranslationModel

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/rev.py RENAMED Viewed

@@ -10,7 +10,7 @@ from batchalign.utils.config import config_read
 from batchalign.errors import *
-from batchalign.models import BertUtteranceModel, resolve
+from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve
 import time
 import pathlib
@@ -49,7 +49,11 @@ class RevEngine(BatchalignEngine):
         self.__client = apiclient.RevAiAPIClient(key)
         if resolve("utterance", lang) != None:
             L.debug("Initializing utterance model...")
-            self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            if lang != "yue":
+                self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            else:
+                # we have special inference procedure for cantonese
+                self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
             L.debug("Done.")
         else:
             self.__engine = None

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/utils.py RENAMED Viewed

@@ -94,7 +94,10 @@ def retokenize_with_engine(intermediate_output, engine):
             tmp = []
             for s in new_ut:
-                tmp.append((s, utterance.pop(0)[1]))
+                try:
+                    tmp.append((s, utterance.pop(0)[1]))
+                except IndexError:
+                    continue
             final_outputs.append((speaker, tmp+[[delim, [None, None]]]))
@@ -159,7 +162,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
                 final_words.append([part.strip(), [cur, cur+div]])
                 cur += div
-        lang_2 = pycountry.languages.get(alpha_3=lang).alpha_2
+        lang_2 = "yue" if lang == "yue" else pycountry.languages.get(alpha_3=lang).alpha_2
         def catched_num2words(i):
             if not i.isdigit():
                 return i

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/whisper.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from batchalign.document import *
 from batchalign.pipelines.base import *
 from batchalign.pipelines.asr.utils import *
-from batchalign.models import WhisperASRModel, BertUtteranceModel
+from batchalign.models import WhisperASRModel, BertUtteranceModel, BertCantoneseUtteranceModel
 import pycountry
@@ -44,7 +44,11 @@ class WhisperEngine(BatchalignEngine):
         if resolve("utterance", self.__lang) != None:
             L.debug("Initializing utterance model...")
-            self.__engine = BertUtteranceModel(resolve("utterance", self.__lang))
+            if lang != "yue":
+                self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            else:
+                # we have special inference procedure for cantonese
+                self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
             L.debug("Done.")
         else:
             self.__engine = None

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/dispatch.py RENAMED Viewed

@@ -6,7 +6,7 @@ Tabulate default packages and options.
 from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
                         NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
                         RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
-                        StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine)
+                        StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine, SeamlessTranslationModel)
 from batchalign import BatchalignPipeline
 from batchalign.models import resolve
@@ -28,6 +28,7 @@ DEFAULT_PACKAGES = {
     "eval": "evaluation",
     "utterance": "stanza_utt",
     "coref": "stanza_coref",
+    "translate": "seamless_translate",
 }
 LANGUAGE_OVERRIDE_PACKAGES = {
@@ -129,6 +130,8 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
             engines.append(CorefEngine())
         elif engine == "wav2vec_fa":
             engines.append(Wave2VecFAEngine())
+        elif engine == "seamless_translate":
+            engines.append(SeamlessTranslationModel())
     L.debug(f"Done initalizing packages.")
     return BatchalignPipeline(*engines)

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/wave2vec_fa.py RENAMED Viewed

@@ -154,9 +154,9 @@ class Wave2VecFAEngine(BatchalignEngine):
                 if '\x15' not in ut.text:
                     ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
                 else:
-                    ut.text = re.sub("\x15\d+_\d+\x15",
+                    ut.text = re.sub(r"\x15\d+_\d+\x15",
                                      f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
             elif ut.text != None:
-                ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
+                ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
         return doc

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/whisper_fa.py RENAMED Viewed

@@ -179,9 +179,9 @@ class WhisperFAEngine(BatchalignEngine):
                 if '\x15' not in ut.text:
                     ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
                 else:
-                    ut.text = re.sub("\x15\d+_\d+\x15",
+                    ut.text = re.sub(r"\x15\d+_\d+\x15",
                                      f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
             elif ut.text != None:
-                ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
+                ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
         return doc

batchalign-0.7.15/batchalign/pipelines/translate/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .seamless import SeamlessTranslationModel

batchalign-0.7.15/batchalign/pipelines/translate/seamless.py ADDED Viewed

@@ -0,0 +1,53 @@
+from batchalign.models import WhisperFAModel
+from batchalign.document import *
+from batchalign.pipelines.base import *
+from batchalign.utils import *
+from batchalign.utils.dp import *
+from batchalign.constants import *
+from transformers import AutoProcessor, SeamlessM4TModel
+import logging
+L = logging.getLogger("batchalign")
+import re
+# !uv pip install sentencepiece
+import pycountry
+import warnings
+# Pipeline engine for Task.TRANSLATE: fills in `translation` on each Utterance
+# of a document with English text generated by the
+# "facebook/hf-seamless-m4t-medium" checkpoint.
+class SeamlessTranslationModel(BatchalignEngine):
+    tasks = [ Task.TRANSLATE ]
+    # Store the progress callback; process() calls it as
+    # (position in doc.content, len(doc.content)).
+    def _hook_status(self, status_hook):
+        self.status_hook = status_hook
+    def __init__(self):
+        self.status_hook = None
+        # processor and model come from the same checkpoint name; no explicit
+        # device placement (.to(...)) is done in this class
+        self.processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
+        self.model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
+    # doc: Document whose Utterance entries are translated in place.
+    # Returns the same doc object. **kwargs is accepted but not used here.
+    def process(self, doc:Document, **kwargs):
+        for indx, i in enumerate(doc.content):
+            # entries that are not Utterance instances are left untouched
+            if not isinstance(i, Utterance):
+                continue
+            # an utterance that already has a non-empty translation is kept as is
+            if i.translation:
+                continue
+            # presumably the utterance's plain text form -- Utterance.strip is
+            # not visible here; verify what these flags include
+            text = i.strip(join_with_spaces=False, include_retrace=True, include_fp=True)
+            # source language is the document's first language code, with "zho"
+            # remapped to "cmn"; only doc.langs[0] is consulted
+            text_inputs = self.processor(text=text, src_lang=doc.langs[0] if doc.langs[0] != "zho" else "cmn", return_tensors="pt")
+            # text-only generation, target language fixed to English
+            output_tokens = self.model.generate(**text_inputs, tgt_lang="eng", generate_speech=False)
+            # decode the first sequence of the first returned element
+            translated_text_from_text = self.processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+            i.translation = translated_text_from_text
+            # insert a space in front of every MOR_PUNCT / ENDING_PUNCT symbol
+            # found in the translation
+            for j in MOR_PUNCT + ENDING_PUNCT:
+                i.translation = i.translation.replace(j, " "+j)
+            # progress is reported only for utterances translated in this pass;
+            # skipped entries above never reach this call
+            if self.status_hook != None:
+                self.status_hook(indx+1, len(doc.content))
+        return doc

batchalign-0.7.15/batchalign/version ADDED Viewed

@@ -0,0 +1,3 @@
+0.7.15
+Feburary 23rd, 2025
+Whisper ASR with Cantonese and tokenization!

{batchalign-0.7.13.post1 → batchalign-0.7.15/batchalign.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.13.post1
+Version: 0.7.15
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -33,6 +33,9 @@ Requires-Dist: soundfile~=0.12.0
 Requires-Dist: rich-click>=1.7.0
 Requires-Dist: typing-extensions
 Requires-Dist: num2words
+Requires-Dist: tiktoken
+Requires-Dist: blobfile
+Requires-Dist: sentencepiece
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Provides-Extra: train

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/SOURCES.txt RENAMED Viewed

@@ -40,6 +40,7 @@ batchalign/models/training/__init__.py
 batchalign/models/training/run.py
 batchalign/models/training/utils.py
 batchalign/models/utterance/__init__.py
+batchalign/models/utterance/cantonese_infer.py
 batchalign/models/utterance/dataset.py
 batchalign/models/utterance/execute.py
 batchalign/models/utterance/infer.py
@@ -83,6 +84,8 @@ batchalign/pipelines/morphosyntax/fr/case.py
 batchalign/pipelines/morphosyntax/ja/verbforms.py
 batchalign/pipelines/speaker/__init__.py
 batchalign/pipelines/speaker/nemo_speaker.py
+batchalign/pipelines/translate/__init__.py
+batchalign/pipelines/translate/seamless.py
 batchalign/pipelines/utr/__init__.py
 batchalign/pipelines/utr/rev_utr.py
 batchalign/pipelines/utr/utils.py

{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/requires.txt RENAMED Viewed

@@ -23,6 +23,9 @@ soundfile~=0.12.0
 rich-click>=1.7.0
 typing-extensions
 num2words
+tiktoken
+blobfile
+sentencepiece
 [dev]
 pytest

{batchalign-0.7.13.post1 → batchalign-0.7.15}/setup.py RENAMED Viewed

@@ -52,6 +52,9 @@ setup(
         "rich-click>=1.7.0",
         "typing-extensions",
         "num2words",
+        "tiktoken",
+        "blobfile",
+        "sentencepiece"
     ],
     extras_require={
         'dev': [