batchalign 0.8.0.post4.tar.gz → 0.8.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of batchalign might be problematic.
- {batchalign-0.8.0.post4/batchalign.egg-info → batchalign-0.8.1}/PKG-INFO +3 -1
- batchalign-0.8.1/batchalign/__init__.py +48 -0
- batchalign-0.8.1/batchalign/cli/cache.py +263 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/cli.py +5 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/dispatch.py +6 -3
- batchalign-0.8.1/batchalign/formats/__init__.py +11 -0
- batchalign-0.8.1/batchalign/models/__init__.py +33 -0
- batchalign-0.8.1/batchalign/models/speaker/__init__.py +7 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utils.py +31 -0
- batchalign-0.8.1/batchalign/models/utterance/__init__.py +13 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/cantonese_infer.py +28 -40
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/infer.py +13 -23
- batchalign-0.8.1/batchalign/models/wave2vec/__init__.py +7 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/wave2vec/infer_fa.py +16 -31
- batchalign-0.8.1/batchalign/models/whisper/__init__.py +11 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/whisper/infer_asr.py +16 -30
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/whisper/infer_fa.py +21 -17
- batchalign-0.8.1/batchalign/pipelines/__init__.py +37 -0
- batchalign-0.8.1/batchalign/pipelines/analysis/__init__.py +15 -0
- batchalign-0.8.1/batchalign/pipelines/asr/__init__.py +24 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/rev.py +6 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/whisperx.py +9 -17
- batchalign-0.8.1/batchalign/pipelines/avqi/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/avqi/engine.py +6 -5
- batchalign-0.8.1/batchalign/pipelines/cache.py +735 -0
- batchalign-0.8.1/batchalign/pipelines/cleanup/__init__.py +18 -0
- batchalign-0.8.1/batchalign/pipelines/diarization/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/diarization/pyannote.py +5 -17
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/dispatch.py +19 -6
- batchalign-0.8.1/batchalign/pipelines/fa/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/fa/wave2vec_fa.py +49 -10
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/fa/whisper_fa.py +52 -10
- batchalign-0.8.1/batchalign/pipelines/morphosyntax/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/coref.py +1 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/ud.py +147 -21
- batchalign-0.8.1/batchalign/pipelines/opensmile/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/opensmile/engine.py +22 -12
- batchalign-0.8.1/batchalign/pipelines/speaker/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/speaker/nemo_speaker.py +4 -2
- batchalign-0.8.1/batchalign/pipelines/translate/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/gtrans.py +2 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/seamless.py +2 -1
- batchalign-0.8.1/batchalign/pipelines/utr/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/rev_utr.py +8 -2
- batchalign-0.8.1/batchalign/pipelines/utterance/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utterance/ud_utterance.py +95 -41
- batchalign-0.8.1/batchalign/tests/pipelines/cache/__init__.py +1 -0
- batchalign-0.8.1/batchalign/tests/pipelines/cache/test_cache.py +407 -0
- batchalign-0.8.1/batchalign/version +3 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1/batchalign.egg-info}/PKG-INFO +3 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/SOURCES.txt +4 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/requires.txt +2 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/setup.py +2 -0
- batchalign-0.8.0.post4/batchalign/__init__.py +0 -19
- batchalign-0.8.0.post4/batchalign/formats/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/models/__init__.py +0 -6
- batchalign-0.8.0.post4/batchalign/models/speaker/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/models/utterance/__init__.py +0 -4
- batchalign-0.8.0.post4/batchalign/models/wave2vec/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/models/whisper/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/__init__.py +0 -19
- batchalign-0.8.0.post4/batchalign/pipelines/analysis/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/pipelines/asr/__init__.py +0 -4
- batchalign-0.8.0.post4/batchalign/pipelines/avqi/__init__.py +0 -8
- batchalign-0.8.0.post4/batchalign/pipelines/cleanup/__init__.py +0 -3
- batchalign-0.8.0.post4/batchalign/pipelines/diarization/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/pipelines/fa/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/morphosyntax/__init__.py +0 -3
- batchalign-0.8.0.post4/batchalign/pipelines/opensmile/__init__.py +0 -7
- batchalign-0.8.0.post4/batchalign/pipelines/speaker/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/pipelines/translate/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/utr/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/utterance/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/version +0 -3
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/LICENSE +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/MANIFEST.in +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/README.md +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/__main__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/constants.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/document.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/errors.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/base.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/resolve.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/run.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/por.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/oai_whisper.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/abbrev.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/compounds.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/config.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/dp.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/names.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/setup.cfg +0 -0
{batchalign-0.8.0.post4/batchalign.egg-info → batchalign-0.8.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.8.0.post4
+Version: 0.8.1
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic>=2.4
+Requires-Dist: platformdirs>=4.3.0
+Requires-Dist: filelock>=3.0.0
 Requires-Dist: nltk>=3.8
 Requires-Dist: praatio<6.1.0,>=6.0.0
 Requires-Dist: torch>=2.6.0
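The two new dependencies line up with the cache work in this release: platformdirs is the usual way to pick a per-user cache directory, and filelock guards it against concurrent workers. A minimal sketch of that combination (the directory layout and lock-file name here are assumptions for illustration, not batchalign's actual code):

# Sketch: per-user cache dir + cross-process lock, the pattern these deps enable.
from pathlib import Path

from filelock import FileLock
from platformdirs import user_cache_dir

cache_root = Path(user_cache_dir("batchalign"))   # e.g. ~/.cache/batchalign on Linux
cache_root.mkdir(parents=True, exist_ok=True)

with FileLock(str(cache_root / "cache.lock")):    # serialize writers across processes
    (cache_root / "entries.db").touch()           # placeholder for the real store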
batchalign-0.8.1/batchalign/__init__.py

@@ -0,0 +1,48 @@
+import os
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = str(1)
+
+import logging
+
+# clear all of nemo's loggers
+logging.getLogger().handlers.clear()
+logging.getLogger('nemo_logger').handlers.clear()
+logging.getLogger().setLevel(logging.CRITICAL)
+logging.getLogger('nemo_logger').disabled = True
+
+from .document import *
+from .constants import *
+from .errors import *
+
+# Defer slow imports
+# from .formats import *
+# from .pipelines import *
+# from .models import *
+# from .cli import batchalign as cli
+
+def __getattr__(name):
+    if name == 'cli':
+        from .cli import batchalign
+        return batchalign
+    if name == 'BatchalignPipeline':
+        from .pipelines import BatchalignPipeline
+        return BatchalignPipeline
+    if name == 'CHATFile':
+        from .formats.chat import CHATFile
+        return CHATFile
+    # Add other common engines if needed for dispatch.py
+    if name in ['WhisperEngine', 'WhisperFAEngine', 'StanzaEngine', 'RevEngine',
+                'NgramRetraceEngine', 'DisfluencyReplacementEngine', 'WhisperUTREngine',
+                'RevUTREngine', 'EvaluationEngine', 'WhisperXEngine', 'NemoSpeakerEngine',
+                'StanzaUtteranceEngine', 'CorefEngine', 'Wave2VecFAEngine', 'SeamlessTranslationModel',
+                'GoogleTranslateEngine', 'OAIWhisperEngine', 'PyannoteEngine']:
+        from .pipelines import dispatch
+        # This is a bit recursive, let's just let dispatch import them locally
+        # which it already does now.
+        import importlib
+        # We need to find which subpackage it's in.
+        # Actually, if we use local imports in dispatch.py, we don't need these here.
+        pass
+
+    raise AttributeError(f"module {__name__} has no attribute {name}")
+
+logging.getLogger('nemo_logger').disabled = False
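This new top-level __init__.py, like the formats/, models/, and utterance __init__.py files further down, defers its heavy imports through a module-level __getattr__ (PEP 562): `import batchalign` stays cheap, and torch/transformers/stanza only load when a name is first accessed. A stripped-down sketch of the pattern, with a hypothetical `heavy` submodule standing in:

# pkg/__init__.py: minimal PEP 562 lazy-import sketch.
# `HeavyThing` and `.heavy` are placeholder names, not batchalign's.

def __getattr__(name):
    # Invoked only when normal module attribute lookup fails, so the
    # costly import is paid on first access, not at import time.
    if name == 'HeavyThing':
        from .heavy import HeavyThing
        return HeavyThing
    raise AttributeError(f"module {__name__} has no attribute {name}")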
batchalign-0.8.1/batchalign/cli/cache.py

@@ -0,0 +1,263 @@
+"""
+cache.py
+CLI subcommand for managing the Batchalign cache.
+
+Provides commands to:
+- Show cache statistics (--stats)
+- Clear all cached data (--clear)
+- Prepopulate cache from existing CHAT files (--warm)
+"""
+
+import os
+from pathlib import Path
+
+import rich_click as click
+from rich.console import Console
+
+C = Console()
+
+
+def _format_bytes(count: int | None, precision: int = 2) -> str:
+    """Format byte count as human-readable string."""
+    if count is None:
+        return "unknown"
+    units = ["B", "KB", "MB", "GB", "TB"]
+    idx = 0
+    size = float(count)
+    while size >= 1024 and idx < len(units) - 1:
+        size /= 1024
+        idx += 1
+    if idx == 0:
+        return f"{int(size)} {units[idx]}"
+    return f"{size:.{precision}f} {units[idx]}"
+
+
+@click.group(invoke_without_command=True)
+@click.option("--stats", is_flag=True, help="Show cache statistics.")
+@click.option(
+    "--clear",
+    is_flag=True,
+    help="Clear all cached data (requires confirmation)."
+)
+@click.pass_context
+def cache(ctx, stats, clear):
+    """Manage the Batchalign cache.
+
+    The cache stores per-utterance analysis results to avoid redundant
+    computation when re-processing unchanged content.
+
+    Examples:
+        batchalign cache --stats
+        batchalign cache --clear
+        batchalign cache warm INPUT_DIR --lang eng
+    """
+    # Handle --stats flag
+    if stats:
+        ctx.invoke(show_stats)
+        return
+
+    # Handle --clear flag
+    if clear:
+        ctx.invoke(clear_cache)
+        return
+
+    # If no flags and no subcommand, show help
+    if ctx.invoked_subcommand is None:
+        click.echo(ctx.get_help())
+
+
+@cache.command("stats")
+def show_stats():
+    """Show cache statistics."""
+    from batchalign.pipelines.cache import CacheManager
+
+    manager = CacheManager()
+    stats = manager.stats()
+
+    C.print()
+    C.print("[bold]Batchalign Cache Statistics[/bold]")
+    C.print("-" * 35)
+    C.print(f"[cyan]Location:[/cyan] {stats['location']}")
+    C.print(f"[cyan]Size:[/cyan] {_format_bytes(stats['size_bytes'])}")
+    C.print(f"[cyan]Entries:[/cyan] {stats['total_entries']:,}")
+    C.print()
+
+    # Show breakdown by task
+    if stats["by_task"]:
+        C.print("[bold]By task:[/bold]")
+        for task, count in sorted(stats["by_task"].items()):
+            C.print(f"  {task}: {count:,} entries")
+        C.print()
+
+    # Show breakdown by engine version
+    if stats["by_engine_version"]:
+        # Get current stanza version to mark outdated entries
+        try:
+            import stanza
+            current_stanza = stanza.__version__
+        except ImportError:
+            current_stanza = None
+
+        C.print("[bold]Engine versions:[/bold]")
+        for key, count in sorted(stats["by_engine_version"].items()):
+            # Check if this version is outdated
+            outdated = ""
+            if current_stanza and "morphosyntax" in key:
+                version_part = key.split()[-1] if " " in key else ""
+                if version_part and version_part != current_stanza:
+                    outdated = " [dim](outdated)[/dim]"
+            C.print(f"  {key}: {count:,} entries{outdated}")
+        C.print()
+
+
+@cache.command("clear")
+@click.confirmation_option(
+    prompt="Are you sure you want to clear all cached data?"
+)
+def clear_cache():
+    """Clear all cached data."""
+    from batchalign.pipelines.cache import CacheManager
+
+    manager = CacheManager()
+    stats = manager.stats()
+    entries_before = stats["total_entries"]
+
+    bytes_freed = manager.clear()
+
+    C.print()
+    C.print(f"[bold green]Cache cleared.[/bold green]")
+    C.print(f"  Entries removed: {entries_before:,}")
+    C.print(f"  Space freed: {_format_bytes(bytes_freed)}")
+    C.print()
+
+
+@cache.command("warm")
+@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
+@click.option(
+    "--lang",
+    default="eng",
+    help="Language code (3-letter ISO). Default: eng"
+)
+@click.option(
+    "--retokenize/--keeptokens",
+    default=False,
+    help="Whether files were processed with retokenization."
+)
+def warm_cache(input_dir, lang, retokenize):
+    """Prepopulate cache from existing CHAT files with %mor/%gra tiers.
+
+    Reads CHAT files that already have morphosyntactic analysis (%mor and %gra
+    tiers) and populates the cache with their content. This allows subsequent
+    processing of identical utterances to use cached results.
+
+    IMPORTANT: The command trusts the input files. It does not validate that
+    the %mor/%gra content is correct.
+    """
+    from batchalign.pipelines.cache import (
+        CacheManager, MorphotagCacheKey, _get_batchalign_version
+    )
+    from batchalign.formats.chat import CHATFile
+    from batchalign.document import Utterance
+
+    # Get engine version
+    try:
+        import stanza
+        engine_version = stanza.__version__
+    except ImportError:
+        C.print("[bold red]Error:[/bold red] stanza is not installed. Cannot warm cache.")
+        return
+
+    manager = CacheManager()
+    key_gen = MorphotagCacheKey()
+    ba_version = _get_batchalign_version()
+
+    # Collect all .cha files
+    cha_files = []
+    for root, dirs, files in os.walk(input_dir):
+        for f in files:
+            if f.lower().endswith(".cha"):
+                cha_files.append(os.path.join(root, f))
+
+    if not cha_files:
+        C.print(f"[bold yellow]No .cha files found in {input_dir}[/bold yellow]")
+        return
+
+    C.print(f"\nWarming cache from {len(cha_files)} CHAT file(s)...")
+    C.print(f"  Language: {lang}")
+    C.print(f"  Retokenize: {retokenize}")
+    C.print(f"  Stanza version: {engine_version}")
+    C.print()
+
+    entries_added = 0
+    entries_skipped = 0
+    files_processed = 0
+
+    for cha_path in cha_files:
+        try:
+            cf = CHATFile(path=cha_path, special_mor_=True)
+            doc = cf.doc
+
+            # Map for batching within a file
+            utterances_to_check = []
+            idx_to_key = {}
+
+            for idx, item in enumerate(doc.content):
+                if not isinstance(item, Utterance):
+                    continue
+
+                # Check if utterance has morphology/dependency
+                has_morphology = any(
+                    form.morphology and len(form.morphology) > 0
+                    for form in item.content
+                )
+                has_dependency = any(
+                    form.dependency and len(form.dependency) > 0
+                    for form in item.content
+                )
+
+                if not (has_morphology or has_dependency):
+                    continue
+
+                # Generate cache key
+                key = key_gen.generate_key(
+                    item,
+                    lang=lang,
+                    retokenize=retokenize,
+                    mwt={}
+                )
+                utterances_to_check.append((idx, key))
+                idx_to_key[idx] = key
+
+            if not utterances_to_check:
+                files_processed += 1
+                continue
+
+            # Batch check
+            keys = [k for _, k in utterances_to_check]
+            cached_results = manager.get_batch(keys, "morphosyntax", engine_version)
+
+            entries_skipped += len(cached_results)
+
+            # Filter out already cached ones and prepare for batch put
+            to_put = []
+            for idx, key in utterances_to_check:
+                if key not in cached_results:
+                    item = doc.content[idx]
+                    data = key_gen.serialize_output(item)
+                    to_put.append((key, data))
+
+            if to_put:
+                manager.put_batch(to_put, "morphosyntax", engine_version, ba_version)
+                entries_added += len(to_put)
+
+            files_processed += 1
+
+        except Exception as e:
+            C.print(f"[yellow]Warning:[/yellow] Could not process {cha_path}: {e}")
+            continue
+
+    C.print(f"[bold green]Cache warming complete.[/bold green]")
+    C.print(f"  Files processed: {files_processed}")
+    C.print(f"  Entries added: {entries_added}")
+    C.print(f"  Entries skipped (already cached): {entries_skipped}")
+    C.print()
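Because `cache` is a plain Click group, the new subcommands can be exercised in-process rather than through a shell, which is presumably how batchalign/tests/pipelines/cache/test_cache.py drives them. A hedged sketch using Click's test runner:

# Sketch: invoking the new cache CLI in-process.
from click.testing import CliRunner

from batchalign.cli.cache import cache

runner = CliRunner()
result = runner.invoke(cache, ["--stats"])   # same as `batchalign cache --stats`
print(result.exit_code, result.output)

# `clear` asks for confirmation; input="y\n" answers the prompt.
result = runner.invoke(cache, ["clear"], input="y\n")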
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/cli.py

@@ -92,6 +92,9 @@ def batchalign(ctx, verbose, workers):
 
 batchalign.add_command(train, "models")
 
+from batchalign.cli.cache import cache
+batchalign.add_command(cache, "cache")
+
 #################### ALIGN ################################
 
 @batchalign.command()
@@ -230,6 +233,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
               type=click.Path(exists=True,
                               file_okay=True, dir_okay=False),
              help="Comma seperated manual lexicon override")
+@click.option("--override-cache/--use-cache",
+              default=False, help="Bypass cache and recompute all utterances.")
 @click.pass_context
 def morphotag(ctx, in_dir, out_dir, **kwargs):
     """Perform morphosyntactic analysis on transcripts."""
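The `--override-cache/--use-cache` option uses Click's paired-flag syntax: both spellings feed a single boolean parameter, with the `--use-cache` side matching the `default=False`. A self-contained sketch of the behavior (the `demo` command is made up; the option itself mirrors the diff):

# Sketch of Click's on/off flag pair, as added to morphotag above.
import rich_click as click  # drop-in compatible with plain click

@click.command()
@click.option("--override-cache/--use-cache", default=False,
              help="Bypass cache and recompute all utterances.")
def demo(override_cache):
    # `demo --override-cache` -> True; `demo --use-cache` or no flag -> False.
    click.echo(f"override_cache={override_cache}")

if __name__ == "__main__":
    demo()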
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/dispatch.py

@@ -94,8 +94,9 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
     else:
         baL.setLevel(logging.DEBUG)
 
-    # Always capture output to avoid interleaving with progress rendering
-
+    # Always capture output to avoid interleaving with progress rendering,
+    # unless high verbosity is requested for debugging.
+    should_capture = verbose < 2
 
     if should_capture:
         # Use a temporary file to capture ALL output at the FD level
@@ -128,6 +129,7 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
     mwt = kwargs.pop("mwt", {})
     retokenize = kwargs.pop("retokenize", False)
     skipmultilang = kwargs.pop("skipmultilang", False)
+    override_cache = kwargs.pop("override_cache", False)
 
     cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
     doc = cf.doc
@@ -138,7 +140,8 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
     pipeline_kwargs = {
         "retokenize": retokenize,
        "skipmultilang": skipmultilang,
-        "mwt": mwt
+        "mwt": mwt,
+        "override_cache": override_cache
     }
     # Add any remaining kwargs
     pipeline_kwargs.update(kwargs)
batchalign-0.8.1/batchalign/formats/__init__.py

@@ -0,0 +1,11 @@
+# from .chat import CHATFile
+# from .textgrid import TextGridFile
+
+def __getattr__(name):
+    if name == 'CHATFile':
+        from .chat import CHATFile
+        return CHATFile
+    if name == 'TextGridFile':
+        from .textgrid import TextGridFile
+        return TextGridFile
+    raise AttributeError(f"module {__name__} has no attribute {name}")
batchalign-0.8.1/batchalign/models/__init__.py

@@ -0,0 +1,33 @@
+# from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
+# from .whisper import WhisperASRModel, WhisperFAModel
+# from .speaker import NemoSpeakerModel
+# from .utils import ASRAudioFile
+# from .resolve import resolve
+# from .wave2vec import Wave2VecFAModel
+
+def __getattr__(name):
+    if name == 'BertUtteranceModel':
+        from .utterance import BertUtteranceModel
+        return BertUtteranceModel
+    if name == 'BertCantoneseUtteranceModel':
+        from .utterance import BertCantoneseUtteranceModel
+        return BertCantoneseUtteranceModel
+    if name == 'WhisperASRModel':
+        from .whisper import WhisperASRModel
+        return WhisperASRModel
+    if name == 'WhisperFAModel':
+        from .whisper import WhisperFAModel
+        return WhisperFAModel
+    if name == 'NemoSpeakerModel':
+        from .speaker import NemoSpeakerModel
+        return NemoSpeakerModel
+    if name == 'ASRAudioFile':
+        from .utils import ASRAudioFile
+        return ASRAudioFile
+    if name == 'resolve':
+        from .resolve import resolve
+        return resolve
+    if name == 'Wave2VecFAModel':
+        from .wave2vec import Wave2VecFAModel
+        return Wave2VecFAModel
+    raise AttributeError(f"module {__name__} has no attribute {name}")
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utils.py

@@ -187,6 +187,37 @@ class ASRAudioFile:
 
         return data
 
+    def hash_chunk(self, begin_ms, end_ms):
+        """Generate a tiny SHA256 hash of a chunk of audio for caching."""
+        import hashlib
+        data = self.chunk(begin_ms, end_ms)
+        num_samples = data.numel()
+
+        # Tiny fingerprint: 100 samples from the middle + total length
+        if num_samples > 100:
+            mid = num_samples // 2
+            samples = data[mid-50:mid+50]
+        else:
+            samples = data
+
+        # Include length to catch simple duration changes
+        header = f"{num_samples}|".encode()
+        return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
+
+    def hash_all(self):
+        """Generate a tiny SHA256 hash of the entire audio file."""
+        import hashlib
+        num_samples = self.tensor.numel()
+
+        if num_samples > 100:
+            mid = num_samples // 2
+            samples = self.tensor[mid-50:mid+50]
+        else:
+            samples = self.tensor
+
+        header = f"{num_samples}|".encode()
+        return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
+
     def all(self):
         """Get the audio in its entirety
 
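hash_chunk and hash_all deliberately fingerprint only about 100 samples from the middle of the signal plus the total sample count, trading collision resistance for speed on long recordings. The same idea over a bare numpy array (a stand-in for the torch tensor, not batchalign code):

# Standalone sketch of the tiny-fingerprint scheme used above.
import hashlib
import numpy as np

def tiny_fingerprint(samples: np.ndarray) -> str:
    n = samples.size
    window = samples[n // 2 - 50 : n // 2 + 50] if n > 100 else samples
    # The length header catches trims/extensions that leave the middle intact.
    return hashlib.sha256(f"{n}|".encode() + window.tobytes()).hexdigest()

audio = np.sin(np.linspace(0, 440, 16000)).astype(np.float32)
assert tiny_fingerprint(audio) == tiny_fingerprint(audio.copy())  # deterministic
assert tiny_fingerprint(audio) != tiny_fingerprint(audio[:8000])  # length change detected

The accepted blind spot: two clips of equal length whose middle 100 samples match will collide, which is presumably tolerable for a cache key derived alongside the utterance text.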
batchalign-0.8.1/batchalign/models/utterance/__init__.py

@@ -0,0 +1,13 @@
+# from .infer import BertUtteranceModel
+# from .cantonese_infer import BertCantoneseUtteranceModel
+
+def __getattr__(name):
+    if name == 'BertUtteranceModel':
+        from .infer import BertUtteranceModel
+        return BertUtteranceModel
+    if name == 'BertCantoneseUtteranceModel':
+        from .cantonese_infer import BertCantoneseUtteranceModel
+        return BertCantoneseUtteranceModel
+    raise AttributeError(f"module {__name__} has no attribute {name}")
+
+
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/cantonese_infer.py

@@ -1,43 +1,35 @@
 import re
 import string
 import random
+import logging
 
-
-import nltk
-from nltk import word_tokenize, sent_tokenize
+L = logging.getLogger("batchalign")
 
-#
-import torch
-from torch.utils.data import dataset
-from torch.utils.data.dataloader import DataLoader
-from torch.optim import AdamW
-
-# import huggingface utils
-from transformers import AutoTokenizer, BertForTokenClassification
-from transformers import DataCollatorForTokenClassification
-
-# tqdm
-from tqdm import tqdm
-
-# seed device and tokens
-DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+# heavy imports moved to local scope
 
 # seed model
 class BertCantoneseUtteranceModel(object):
 
     def __init__(self, model):
+        import torch
+        from transformers import AutoTokenizer, BertForTokenClassification
+
+        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
         # seed tokenizers and model
         self.tokenizer = AutoTokenizer.from_pretrained(model)
-        self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
+        self.model = BertForTokenClassification.from_pretrained(model).to(device)
+        self.device = device
         self.max_length = 512
         self.overlap = 20
 
         # eval mode
         self.model.eval()
-
-
+        L.debug(f"Model and tokenizer initialized on device: {device}")
+        L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
 
     def __call__(self, passage):
+        import torch
         # Step 1: Clean up passage
         passage = passage.lower()
         passage = passage.replace('.','')
@@ -78,15 +70,14 @@
                 chunks.append(passage[start:])
                 break
 
-
-        print(f"Created {len(chunks)} chunks based on keywords.")
+        L.debug(f"Created {len(chunks)} chunks based on keywords.")
         for i, chunk in enumerate(chunks):
-
-
+            L.debug(f"Chunk {i + 1}: {chunk[:100]}...")
+
         # Step 3: Process each chunk and restore punctuation
         final_passage = []
         for chunk_index, chunk in enumerate(chunks):
-
+            L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
 
             # Step 3.1: Split chunk by characters (Chinese tokenization)
             tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
@@ -97,13 +88,13 @@
                 truncation=True,
                 padding=True,
                 max_length=self.max_length,
-                is_split_into_words=True).to(DEVICE)
+                is_split_into_words=True).to(self.device)
 
             try:
                 # Pass it through the model
                 res = self.model(**tokd).logits
             except Exception as e:
-
+                L.error(f"Error during model inference: {e}")
                 return []
 
             # Argmax for classification
@@ -152,7 +143,7 @@
         # Step 4: Join processed chunks together into the final passage
         final_passage = ' '.join(final_passage)
 
-
+        L.debug("Text processing completed. Generating final output...")
 
         # Optionally, tokenize the final text into sentences based on punctuation
         def custom_sent_tokenize(text):
@@ -163,32 +154,29 @@
            # Split the passage based on punctuation marks and keep them
            parts = re.split(sentence_endings, text)
 
-
-            print(f"Parts after splitting: {parts}")
+            L.debug(f"Parts after splitting: {parts}")
 
            # Combine parts and punctuation together
            for i in range(0, len(parts) - 1, 2):
                sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
-
-
+                L.debug(f"Sentence formed: {sentence}")
+
                if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
                    split_passage.append(sentence)
 
            # If the last part doesn't have punctuation, we handle it here
            if len(parts) % 2 != 0: # If there's no punctuation at the end
                last_part = parts[-1].strip()
-
-
+                L.debug(f"Last part without punctuation: {last_part}")
+
                if last_part: # Only add non-empty sentences
                    split_passage.append(last_part)
-
-
-            print(f"Final split passage: {split_passage}")
+
+            L.debug(f"Final split passage: {split_passage}")
            return split_passage
 
        split_passage = custom_sent_tokenize(final_passage)
 
-
-        print(f"Final sentences: {split_passage}")
+        L.debug(f"Final sentences: {split_passage}")
 
        return split_passage
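All of the print statements removed in these hunks become DEBUG records on the "batchalign" logger, and the new top-level __init__.py sets the root logger to CRITICAL, so they are silent by default. One way a caller could surface them again (the handler setup here is an assumption; the diff itself configures none):

# Sketch: re-enable the chunking/punctuation debug output shown above.
import logging

L = logging.getLogger("batchalign")
L.setLevel(logging.DEBUG)
L.addHandler(logging.StreamHandler())  # emit records to stderr

# BertCantoneseUtteranceModel.__call__ will now log its
# "Created ... chunks", "Processing chunk ..." messages.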