PyPI - easytranscriber - Versions diffs - 0.2.1__tar.gz → 0.2.2__tar.gz - Mend

easytranscriber 0.2.1.tar.gz → 0.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{easytranscriber-0.2.1/src/easytranscriber.egg-info → easytranscriber-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.1
+Version: 0.2.2
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber

{easytranscriber-0.2.1 → easytranscriber-0.2.2}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
 build-backend = "setuptools.build_meta"
 [project]
-version = "0.2.1"
+version = "0.2.2"
 name = "easytranscriber"
 requires-python = ">= 3.10"
 description = "Speech recognition with accurate word-level timestamps."

easytranscriber-0.2.2/src/easytranscriber/asr/cohere.py ADDED Viewed

@@ -0,0 +1,136 @@
+import logging
+from pathlib import Path
+import torch
+from easyaligner.utils import save_metadata_json
+from tqdm import tqdm
+from easytranscriber.data.collators import cohere_transcribe_collate_fn
+logger = logging.getLogger(__name__)
+# ISO 639-1 codes of the 14 languages Cohere Transcribe was trained on.
+# transcribe() rejects any language that is not a member of this set.
+# Model card: https://huggingface.co/CohereLabs/cohere-transcribe-03-2026
+COHERE_SUPPORTED_LANGUAGES = frozenset(
+    ("ar", "de", "el", "en", "es", "fr", "it", "ja", "ko", "nl", "pl", "pt", "vi", "zh")
+)
+def _require_transformers():
+    """
+    Return ``CohereAsrForConditionalGeneration`` from ``transformers``.
+    Returns
+    -------
+    type
+        The ``CohereAsrForConditionalGeneration`` class.
+    Raises
+    ------
+    ImportError
+        If the class cannot be imported from the installed ``transformers``.
+        The error carries an upgrade hint and chains the original exception.
+    """
+    upgrade_hint = (
+        "The 'cohere' ASR backend requires transformers>=5.4.0 "
+        "(CohereAsrForConditionalGeneration is not available in the installed version). "
+        "Upgrade with: pip install --upgrade 'transformers>=5.4.0'"
+    )
+    try:
+        from transformers import CohereAsrForConditionalGeneration as cohere_asr_cls
+    except ImportError as import_error:
+        raise ImportError(upgrade_hint) from import_error
+    else:
+        return cohere_asr_cls
+def transcribe(
+    model,
+    processor,
+    file_dataloader: torch.utils.data.DataLoader,
+    language: str,
+    batch_size: int = 4,
+    max_new_tokens: int = 256,
+    punctuation: bool = True,
+    sample_rate: int = 16000,
+    num_workers: int = 2,
+    prefetch_factor: int = 2,
+    output_dir: str = "output/transcriptions",
+    generate_kwargs: dict | None = None,
+) -> None:
+    """
+    Transcribe audio files using the Cohere Transcribe model.
+    For every file yielded by ``file_dataloader`` the chunks are batched, run
+    through ``processor`` and ``model.generate()``, decoded, written back to
+    ``chunk.text`` on the file's metadata, and the metadata is saved as JSON.
+    Parameters
+    ----------
+    model : transformers.CohereAsrForConditionalGeneration
+        Cohere ASR model.
+    processor : transformers.AutoProcessor
+        Cohere ASR processor.
+    file_dataloader : torch.utils.data.DataLoader
+        DataLoader yielding audio file datasets. The underlying
+        ``StreamingAudioFileDataset`` must be constructed with
+        ``return_raw_audio=True`` so the processor can be called on whole
+        batches (per-sample calls return variable-length features).
+    language : str
+        ISO 639-1 language code (e.g. 'en', 'ja'). Required — Cohere has
+        no built-in language detection.
+    batch_size : int, optional
+        Batch size for inference. Default is 4.
+    max_new_tokens : int, optional
+        Maximum number of tokens to generate per chunk. Default is 256.
+        A ``max_new_tokens`` entry in ``generate_kwargs`` takes precedence.
+    punctuation : bool, optional
+        Emit punctuation in transcriptions. Default is True.
+    sample_rate : int, optional
+        Sample rate of audio passed to the processor. Default is 16000.
+    num_workers : int, optional
+        Number of workers for the feature dataloader. Default is 2.
+    prefetch_factor : int, optional
+        Prefetch factor for the feature dataloader. Only used when
+        ``num_workers > 0``. Default is 2.
+    output_dir : str, optional
+        Directory to save transcription JSON files.
+    generate_kwargs : dict, optional
+        Extra keyword arguments forwarded to ``model.generate()`` (e.g.
+        ``num_beams``, ``length_penalty``).
+    Raises
+    ------
+    ImportError
+        If ``CohereAsrForConditionalGeneration`` cannot be imported.
+    ValueError
+        If ``language`` is None or not in ``COHERE_SUPPORTED_LANGUAGES``.
+    IndexError
+        If fewer transcriptions were produced than there are chunks.
+    """
+    _require_transformers()
+    if language is None:
+        raise ValueError(
+            "The 'cohere' backend requires an explicit `language` — "
+            "CohereAsrForConditionalGeneration does not perform language detection."
+        )
+    if language not in COHERE_SUPPORTED_LANGUAGES:
+        raise ValueError(
+            f"Language {language!r} is not supported by Cohere Transcribe. "
+            f"Supported: {sorted(COHERE_SUPPORTED_LANGUAGES)}."
+        )
+    # Merge instead of passing both as keywords: a caller-supplied
+    # `max_new_tokens` in generate_kwargs would otherwise raise a
+    # duplicate-keyword TypeError in model.generate().
+    generation_args = {"max_new_tokens": max_new_tokens, **(generate_kwargs or {})}
+    loader_args = {
+        "batch_size": batch_size,
+        "num_workers": num_workers,
+        "collate_fn": cohere_transcribe_collate_fn,
+    }
+    # DataLoader rejects an explicit prefetch_factor when num_workers == 0.
+    if num_workers > 0:
+        loader_args["prefetch_factor"] = prefetch_factor
+    for features in tqdm(file_dataloader, desc="Transcribing audio files"):
+        slice_dataset = features[0]["dataset"]
+        metadata = slice_dataset.metadata
+        transcription_texts = []
+        feature_dataloader = torch.utils.data.DataLoader(slice_dataset, **loader_args)
+        logger.info("Transcribing %s ...", metadata.audio_path)
+        for batch in feature_dataloader:
+            if not batch["audio"]:
+                # Every sample in this batch was dropped by the collate function.
+                continue
+            inputs = processor(
+                batch["audio"],
+                sampling_rate=sample_rate,
+                return_tensors="pt",
+                language=language,
+                punctuation=punctuation,
+            )
+            inputs = inputs.to(model.device, dtype=model.dtype)
+            with torch.inference_mode():
+                outputs = model.generate(**inputs, **generation_args)
+                transcription = processor.batch_decode(outputs, skip_special_tokens=True)
+                transcription_texts.extend(transcription)
+        # `transcription_texts` is one flat list for the whole file, so the index
+        # must keep running across speeches rather than restart at 0 per speech.
+        # NOTE(review): assumes the slice dataset yields chunks in speech order,
+        # then chunk order — confirm against StreamingAudioFileDataset.
+        text_index = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.text = transcription_texts[text_index].strip()
+                text_index += 1
+        if text_index != len(transcription_texts):
+            logger.warning(
+                "Got %d transcriptions for %d chunks in %s; chunk texts may be misaligned.",
+                len(transcription_texts),
+                text_index,
+                metadata.audio_path,
+            )
+        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        save_metadata_json(metadata, output_dir=output_dir)

easytranscriber-0.2.2/src/easytranscriber/data/collators.py ADDED Viewed

@@ -0,0 +1,57 @@
+import torch
+def transcribe_collate_fn(batch: list[dict]) -> dict:
+    """
+    Collate dataset samples into one batch for transcription.
+    ``None`` entries in ``batch`` are discarded before collation.
+    Parameters
+    ----------
+    batch : list of dict
+        Samples from the dataset, each with ``"feature"``,
+        ``"start_time_global"`` and ``"speech_id"``; entries may be ``None``.
+    Returns
+    -------
+    dict
+        ``'features'`` (per-sample features concatenated along dim 0),
+        ``'start_times'`` and ``'speech_ids'`` (lists in sample order).
+    """
+    samples = [sample for sample in batch if sample is not None]
+    speech_ids = [sample["speech_id"] for sample in samples]
+    start_times = [sample["start_time_global"] for sample in samples]
+    # Each feature already carries a leading batch dimension, so concatenate
+    # along it rather than stacking.
+    features = torch.cat([sample["feature"] for sample in samples], dim=0)
+    return {
+        "features": features,
+        "start_times": start_times,
+        "speech_ids": speech_ids,
+    }
+def cohere_transcribe_collate_fn(batch: list[dict]) -> dict:
+    """
+    Collate raw-audio samples for Cohere ASR transcription.
+    The waveforms are kept as a plain list (no stacking) so the caller can
+    run Cohere's processor on the whole batch; per-sample processor calls
+    return variable-length features that cannot be stacked. ``None``
+    entries in ``batch`` are discarded.
+    Parameters
+    ----------
+    batch : list of dict
+        Samples from the dataset, each with ``"audio"`` (raw waveform),
+        ``"start_time_global"`` and ``"speech_id"``; entries may be ``None``.
+    Returns
+    -------
+    dict
+        ``'audio'`` (list of waveforms), ``'start_times'`` and
+        ``'speech_ids'``, all in sample order.
+    """
+    audio, start_times, speech_ids = [], [], []
+    for sample in batch:
+        if sample is None:
+            continue
+        audio.append(sample["audio"])
+        start_times.append(sample["start_time_global"])
+        speech_ids.append(sample["speech_id"])
+    return {
+        "audio": audio,
+        "start_times": start_times,
+        "speech_ids": speech_ids,
+    }

{easytranscriber-0.2.1 → easytranscriber-0.2.2}/src/easytranscriber/data/dataset.py RENAMED Viewed

@@ -48,12 +48,14 @@ class StreamingAudioSliceDataset(Dataset):
         processor: Wav2Vec2Processor | WhisperProcessor,
         sample_rate: int = 16000,
         metadata: AudioMetadata | None = None,
+        return_raw_audio: bool = False,
     ):
         self.audio_path = str(audio_path)
         self.chunk_specs = chunk_specs
         self.processor = processor
         self.sample_rate = sample_rate
         self.metadata = metadata
+        self.return_raw_audio = return_raw_audio
         self.processor_attribute = (
             "input_values" if isinstance(processor, Wav2Vec2Processor) else "input_features"
         )
@@ -75,6 +77,14 @@ class StreamingAudioSliceDataset(Dataset):
             sample_rate=self.sample_rate,
         )
+        if self.return_raw_audio:
+            # Caller (e.g. cohere backend) preprocesses with the batch to handle padding.
+            return {
+                "audio": audio,
+                "start_time_global": start_sec,
+                "speech_id": spec["speech_id"],
+            }
         # Convert to tensor and add batch dimension for processor
         if isinstance(self.processor, Wav2Vec2Processor):
             audio = torch.tensor(audio).unsqueeze(0)
@@ -165,6 +175,7 @@ class StreamingAudioFileDataset(Dataset):
         sample_rate: int = 16000,
         chunk_size: int = 30,
         alignment_strategy: str = "chunk",
+        return_raw_audio: bool = False,
     ):
         if isinstance(metadata, AudioMetadata):
             self.metadata = [metadata]
@@ -176,6 +187,7 @@ class StreamingAudioFileDataset(Dataset):
         self.chunk_size = chunk_size
         self.processor = processor
         self.alignment_strategy = alignment_strategy
+        self.return_raw_audio = return_raw_audio
     def _get_speech_chunk_specs(self, metadata: AudioMetadata) -> list[dict]:
         """
@@ -281,6 +293,7 @@ class StreamingAudioFileDataset(Dataset):
             processor=self.processor,
             sample_rate=self.sr,
             metadata=metadata,
+            return_raw_audio=self.return_raw_audio,
         )
         return {

{easytranscriber-0.2.1 → easytranscriber-0.2.2}/src/easytranscriber/pipelines.py RENAMED Viewed

@@ -5,11 +5,7 @@ import ctranslate2
 import torch
 from easyaligner.data.collators import audiofile_collate_fn, metadata_collate_fn
 from easyaligner.data.datamodel import SpeechSegment
-from easyaligner.data.dataset import (
-    AudioFileDataset,
-    JSONMetadataDataset,
-    StreamingAudioFileDataset,
-)
+from easyaligner.data.dataset import AudioFileDataset, JSONMetadataDataset
 from easyaligner.pipelines import alignment_pipeline, emissions_pipeline, vad_pipeline
 from easyaligner.vad.pyannote import load_vad_model as load_pyannote_vad_model
 from easyaligner.vad.silero import load_vad_model as load_silero_vad_model
@@ -22,15 +18,30 @@ from transformers import (
 from easytranscriber.asr.ct2 import transcribe as ct2_transcribe
 from easytranscriber.asr.hf import transcribe as hf_transcribe
+from easytranscriber.data.dataset import StreamingAudioFileDataset
 from easytranscriber.text.normalization import text_normalizer
 from easytranscriber.utils import hf_to_ct2_converter
 logger = logging.getLogger(__name__)
+def _load_cohere_transcribe():
+    """Import the cohere backend on demand and return its ``transcribe`` function.
+    The import runs when this loader is called rather than when this module is
+    imported. The intent is that users on older transformers (the cohere
+    backend requires transformers>=5.4.0) who only use the ct2/hf backends
+    can still import this module.
+    """
+    from easytranscriber.asr import cohere as cohere_backend
+    return cohere_backend.transcribe
 # dispatch mapping
+# NOTE: "ct2" and "hf" map directly to transcribe functions, whereas "cohere" maps
+# to a zero-argument loader that must be called to obtain the transcribe function.
 TRANSCRIBE_BACKENDS = {
     "ct2": ct2_transcribe,
     "hf": hf_transcribe,
+    "cohere": _load_cohere_transcribe,
 }
 VAD_BACKENDS = {
@@ -56,10 +67,13 @@ def pipeline(
     task: str = "transcribe",
     beam_size: int = 5,
     max_length: int = 250,
+    max_new_tokens: int = 256,
     repetition_penalty: float = 1.0,
     length_penalty: float = 1.0,
     patience: float = 1.0,
     no_repeat_ngram_size: int = 0,
+    punctuation: bool = True,
+    generate_kwargs: dict | None = None,
     start_wildcard: bool = False,
     end_wildcard: bool = False,
     blank_id: int | None = None,
@@ -103,7 +117,9 @@ def pipeline(
     speeches : list[list[SpeechSegment]], optional
         Existing speech segments for alignment.
     backend : str, optional
-        Backend to use for the transcription model: "ct2" or "hf". Default is "ct2".
+        Backend to use for the transcription model: "ct2", "hf", or "cohere". Default is "ct2".
+        The "cohere" backend requires `transformers>=5.4.0`, `streaming=True`, and an explicit
+        `language` (Cohere has no language detection).
     sample_rate : int, optional
         Sample rate.
     chunk_size : int, optional
@@ -127,7 +143,14 @@ def pipeline(
     repetition_penalty : float, optional
         See HF [source code](https://github.com/huggingface/transformers/blob/v4.57.5/src/transformers/generation/configuration_utils.py#L188-L190) for details.
     max_length : int, optional
-        Maximum length of generated text.
+        Maximum length of generated text. Applies to Whisper backends (ct2, hf).
+    max_new_tokens : int, optional
+        Maximum number of new tokens to generate per chunk. Applies to the cohere backend.
+    punctuation : bool, optional
+        Emit punctuation in Cohere transcriptions. Applies to the cohere backend only.
+    generate_kwargs : dict, optional
+        Extra kwargs forwarded to ``model.generate()`` for the cohere backend
+        (e.g. ``num_beams``, ``length_penalty``).
     start_wildcard : bool, optional
         Add start wildcard to forced alignment.
     end_wildcard : bool, optional
@@ -244,32 +267,68 @@ def pipeline(
     )
     # Step 2: Run Transcription
-    transcription_args = {
-        "language": language,
-        "task": task,
-        "beam_size": beam_size,
-        "max_length": max_length,
-        "repetition_penalty": repetition_penalty,
-        "length_penalty": length_penalty,
-    }
-    if backend == "ct2":
-        model_path = hf_to_ct2_converter(transcription_model, cache_dir=cache_dir)
-        logger.info(f"Loading CTranslate2 model from {model_path}...")
-        model = ctranslate2.models.Whisper(model_path.as_posix(), device=device)
-        transcription_args.update(
-            {
-                "patience": patience,
-                "no_repeat_ngram_size": no_repeat_ngram_size,
-            }
+    dataset_kwargs: dict = {}
+    if backend == "cohere":
+        if language is None:
+            raise ValueError(
+                "The 'cohere' backend requires an explicit `language` — "
+                "CohereAsrForConditionalGeneration does not perform language detection."
+            )
+        if not streaming:
+            raise ValueError(
+                "The 'cohere' backend requires `streaming=True` "
+                "(the non-streaming AudioFileDataset does not support return_raw_audio)."
+            )
+        transcription_args = {
+            "language": language,
+            "max_new_tokens": max_new_tokens,
+            "punctuation": punctuation,
+            "sample_rate": sample_rate,
+            "generate_kwargs": generate_kwargs,
+        }
+        from transformers import AutoProcessor, CohereAsrForConditionalGeneration
+        logger.info(f"Loading Cohere ASR model from {transcription_model}...")
+        model = (
+            CohereAsrForConditionalGeneration.from_pretrained(
+                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+            )
+            .to(device)
+            .eval()
         )
+        processor = AutoProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
+        dataset_kwargs = {"return_raw_audio": True}
     else:
-        logger.info(f"Loading Hugging Face model from {transcription_model}...")
-        model = WhisperForConditionalGeneration.from_pretrained(
-            transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
-        ).to(device)
+        transcription_args = {
+            "language": language,
+            "task": task,
+            "beam_size": beam_size,
+            "max_length": max_length,
+            "repetition_penalty": repetition_penalty,
+            "length_penalty": length_penalty,
+        }
+        if backend == "ct2":
+            model_path = hf_to_ct2_converter(transcription_model, cache_dir=cache_dir)
+            logger.info(f"Loading CTranslate2 model from {model_path}...")
+            model = ctranslate2.models.Whisper(model_path.as_posix(), device=device)
+            transcription_args.update(
+                {
+                    "patience": patience,
+                    "no_repeat_ngram_size": no_repeat_ngram_size,
+                }
+            )
+        else:
+            logger.info(f"Loading Hugging Face model from {transcription_model}...")
+            model = WhisperForConditionalGeneration.from_pretrained(
+                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+            ).to(device)
+        processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
-    processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
     json_dataset = JSONMetadataDataset(
         json_paths=[str(Path(output_vad_dir) / p) for p in json_paths]
     )
@@ -281,6 +340,7 @@ def pipeline(
         sample_rate=sample_rate,
         chunk_size=chunk_size,
         alignment_strategy="chunk",
+        **dataset_kwargs,
     )
     file_dataloader = torch.utils.data.DataLoader(
@@ -293,6 +353,9 @@ def pipeline(
     )
     transcribe = TRANSCRIBE_BACKENDS[backend]
+    if backend == "cohere":
+        transcribe = transcribe()  # lazy-load to avoid importing on older transformers
     transcribe(
         model=model,
         processor=processor,

{easytranscriber-0.2.1 → easytranscriber-0.2.2/src/easytranscriber.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.1
+Version: 0.2.2
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber

{easytranscriber-0.2.1 → easytranscriber-0.2.2}/src/easytranscriber.egg-info/SOURCES.txt RENAMED Viewed

@@ -10,6 +10,7 @@ src/easytranscriber.egg-info/dependency_links.txt
 src/easytranscriber.egg-info/entry_points.txt
 src/easytranscriber.egg-info/requires.txt
 src/easytranscriber.egg-info/top_level.txt
+src/easytranscriber/asr/cohere.py
 src/easytranscriber/asr/ct2.py
 src/easytranscriber/asr/hf.py
 src/easytranscriber/data/__init__.py

easytranscriber-0.2.1/src/easytranscriber/data/collators.py DELETED Viewed

@@ -1,30 +0,0 @@
-import torch
-def transcribe_collate_fn(batch: list[dict]) -> dict:
-    """
-    Collate function for transcription.
-    Parameters
-    ----------
-    batch : list of dict
-        List of samples from the dataset.
-    Returns
-    -------
-    dict
-        Collated batch with 'features', 'start_times', and 'speech_ids'.
-    """
-    # Remove None values
-    speech_ids = [b["speech_id"] for b in batch if b is not None]
-    start_times = [b["start_time_global"] for b in batch if b is not None]
-    batch = [b["feature"] for b in batch if b is not None]
-    # Concat, keep batch dimension
-    batch = torch.cat(batch, dim=0)
-    return {
-        "features": batch,
-        "start_times": start_times,
-        "speech_ids": speech_ids,
-    }