PyPI - easytranscriber - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

easytranscriber 0.2.0tar.gz → 0.2.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{easytranscriber-0.2.0/src/easytranscriber.egg-info → easytranscriber-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.0
+Version: 0.2.2
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber
@@ -19,7 +19,7 @@ Requires-Dist: ctranslate2>=4.4.0
 Requires-Dist: msgspec
 Requires-Dist: easyaligner==0.*
 Provides-Extra: search
-Requires-Dist: fastapi>=0.104.0; extra == "search"
+Requires-Dist: fastapi>=0.109.0; extra == "search"
 Requires-Dist: uvicorn[standard]>=0.24.0; extra == "search"
 Requires-Dist: jinja2>=3.1.0; extra == "search"
 Dynamic: license-file
@@ -127,6 +127,9 @@ The documentation is available at [kb-labb.github.io/easytranscriber/](https://k
 * [Text normalization tutorial](https://kb-labb.github.io/easytranscriber/get-started/text-processing.html).
 * [API reference](https://kb-labb.github.io/easytranscriber/reference/).
+> [!TIP]
+> Check out the [`easyaligner`](https://kb-labb.github.io/easyaligner/) library for a user-friendly pipeline for forced alignment of text and audio.
 ## Acknowledgements
 `easytranscriber` draws heavy inspiration from [`WhisperX`](https://github.com/m-bain/whisperX) [(Bain et al., 2023)](https://www.isca-archive.org/interspeech_2023/bain23_interspeech.pdf).
@@ -134,3 +137,16 @@ The documentation is available at [kb-labb.github.io/easytranscriber/](https://k
 The forced alignment component of `easytranscriber` is based on PyTorch's forced alignment API, which implements a GPU-accelerated version of the Viterbi algorithm as described in [Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8).
 LibriVox for public domain audiobooks used as tutorial examples.
+## Citation
+```
+@online{rekathati2026,
+  author = {Rekathati, Faton},
+  title = {Easytranscriber: {Speech} Recognition with Precise
+    Timestamps},
+  date = {2026-02-26},
+  url = {https://kb-labb.github.io/posts/2026-02-26-easytranscriber/},
+  langid = {en}
+}
+```

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/README.md RENAMED Viewed

@@ -101,10 +101,26 @@ The documentation is available at [kb-labb.github.io/easytranscriber/](https://k
 * [Text normalization tutorial](https://kb-labb.github.io/easytranscriber/get-started/text-processing.html).
 * [API reference](https://kb-labb.github.io/easytranscriber/reference/).
+> [!TIP]
+> Check out the [`easyaligner`](https://kb-labb.github.io/easyaligner/) library for a user-friendly pipeline for forced alignment of text and audio.
 ## Acknowledgements
 `easytranscriber` draws heavy inspiration from [`WhisperX`](https://github.com/m-bain/whisperX) [(Bain et al., 2023)](https://www.isca-archive.org/interspeech_2023/bain23_interspeech.pdf).
 The forced alignment component of `easytranscriber` is based on PyTorch's forced alignment API, which implements a GPU-accelerated version of the Viterbi algorithm as described in [Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8).
-LibriVox for public domain audiobooks used as tutorial examples.
+LibriVox for public domain audiobooks used as tutorial examples.
+## Citation
+```
+@online{rekathati2026,
+  author = {Rekathati, Faton},
+  title = {Easytranscriber: {Speech} Recognition with Precise
+    Timestamps},
+  date = {2026-02-26},
+  url = {https://kb-labb.github.io/posts/2026-02-26-easytranscriber/},
+  langid = {en}
+}
+```

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
 build-backend = "setuptools.build_meta"
 [project]
-version = "0.2.0"
+version = "0.2.2"
 name = "easytranscriber"
 requires-python = ">= 3.10"
 description = "Speech recognition with accurate word-level timestamps."
@@ -26,7 +26,7 @@ dependencies = [
 [project.optional-dependencies]
 search = [
-  "fastapi>=0.104.0",
+  "fastapi>=0.109.0",
   "uvicorn[standard]>=0.24.0",
   "jinja2>=3.1.0",
 ]

easytranscriber-0.2.2/src/easytranscriber/asr/cohere.py ADDED Viewed

@@ -0,0 +1,136 @@
+import logging
+from pathlib import Path
+import torch
+from easyaligner.utils import save_metadata_json
+from tqdm import tqdm
+from easytranscriber.data.collators import cohere_transcribe_collate_fn
+logger = logging.getLogger(__name__)
+# The 14 languages Cohere Transcribe was trained on.
+# https://huggingface.co/CohereLabs/cohere-transcribe-03-2026
+COHERE_SUPPORTED_LANGUAGES = frozenset(
+    {"ar", "de", "el", "en", "es", "fr", "it", "ja", "ko", "nl", "pl", "pt", "vi", "zh"}
+)
+def _require_transformers():
+    """
+    Import ``CohereAsrForConditionalGeneration`` from ``transformers`` and return it.
+    Returns
+    -------
+    type
+        The ``CohereAsrForConditionalGeneration`` class.
+    Raises
+    ------
+    ImportError
+        If the import fails. The message carries upgrade instructions and the
+        original error is chained as the cause.
+    """
+    try:
+        from transformers import CohereAsrForConditionalGeneration as cohere_asr_cls
+    except ImportError as import_err:
+        upgrade_hint = (
+            "The 'cohere' ASR backend requires transformers>=5.4.0 "
+            "(CohereAsrForConditionalGeneration is not available in the installed version). "
+            "Upgrade with: pip install --upgrade 'transformers>=5.4.0'"
+        )
+        raise ImportError(upgrade_hint) from import_err
+    else:
+        return cohere_asr_cls
+def transcribe(
+    model,
+    processor,
+    file_dataloader: torch.utils.data.DataLoader,
+    language: str,
+    batch_size: int = 4,
+    max_new_tokens: int = 256,
+    punctuation: bool = True,
+    sample_rate: int = 16000,
+    num_workers: int = 2,
+    prefetch_factor: int = 2,
+    output_dir: str = "output/transcriptions",
+    generate_kwargs: dict | None = None,
+):
+    """
+    Transcribe audio files using the Cohere Transcribe model.
+    For every file yielded by `file_dataloader`, the generated text of each chunk
+    is written to ``chunk.text`` on the file's metadata, which is then saved with
+    ``save_metadata_json``.
+    Parameters
+    ----------
+    model : transformers.CohereAsrForConditionalGeneration
+        Cohere ASR model.
+    processor : transformers.AutoProcessor
+        Cohere ASR processor.
+    file_dataloader : torch.utils.data.DataLoader
+        DataLoader yielding audio file datasets. The underlying
+        ``StreamingAudioFileDataset`` must be constructed with
+        ``return_raw_audio=True`` so the processor can be called on whole
+        batches (per-sample calls return variable-length features).
+    language : str
+        ISO 639-1 language code (e.g. 'en', 'ja'). Required — Cohere has
+        no built-in language detection.
+    batch_size : int, optional
+        Batch size for inference.
+    max_new_tokens : int, optional
+        Maximum number of tokens to generate per chunk. Default is 256.
+    punctuation : bool, optional
+        Emit punctuation in transcriptions. Default is True.
+    sample_rate : int, optional
+        Sample rate of audio passed to the processor. Default is 16000.
+    num_workers : int, optional
+        Number of workers for the feature dataloader.
+    prefetch_factor : int, optional
+        Prefetch factor for the feature dataloader.
+    output_dir : str, optional
+        Directory to save transcription JSON files.
+    generate_kwargs : dict, optional
+        Extra keyword arguments forwarded to ``model.generate()`` (e.g.
+        ``num_beams``, ``length_penalty``).
+    Raises
+    ------
+    ImportError
+        If ``CohereAsrForConditionalGeneration`` cannot be imported from ``transformers``.
+    ValueError
+        If `language` is None or is not in ``COHERE_SUPPORTED_LANGUAGES``.
+    IndexError
+        If fewer transcriptions were produced than there are chunks in the metadata.
+    """
+    _require_transformers()
+    if language is None:
+        raise ValueError(
+            "The 'cohere' backend requires an explicit `language` — "
+            "CohereAsrForConditionalGeneration does not perform language detection."
+        )
+    if language not in COHERE_SUPPORTED_LANGUAGES:
+        raise ValueError(
+            f"Language {language!r} is not supported by Cohere Transcribe. "
+            f"Supported: {sorted(COHERE_SUPPORTED_LANGUAGES)}."
+        )
+    generate_kwargs = generate_kwargs or {}
+    for features in tqdm(file_dataloader, desc="Transcribing audio files"):
+        # Each item wraps a per-file slice dataset that also carries the file's metadata.
+        slice_dataset = features[0]["dataset"]
+        metadata = slice_dataset.metadata
+        transcription_texts = []
+        feature_dataloader = torch.utils.data.DataLoader(
+            slice_dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            prefetch_factor=prefetch_factor,
+            collate_fn=cohere_transcribe_collate_fn,
+        )
+        logger.info("Transcribing %s ...", metadata.audio_path)
+        for batch in feature_dataloader:
+            # The processor runs on the whole batch of raw waveforms.
+            inputs = processor(
+                batch["audio"],
+                sampling_rate=sample_rate,
+                return_tensors="pt",
+                language=language,
+                punctuation=punctuation,
+            )
+            inputs = inputs.to(model.device, dtype=model.dtype)
+            with torch.inference_mode():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    **generate_kwargs,
+                )
+                transcription = processor.batch_decode(outputs, skip_special_tokens=True)
+                transcription_texts.extend(transcription)
+        # `transcription_texts` is one flat list over all chunks of the file, so it
+        # must be consumed with a running index. Restarting the index for every speech
+        # would hand each later speech the texts of the first one.
+        # NOTE(review): assumes the slice dataset yields chunks in speech order,
+        # then chunk order within each speech — confirm against the dataset.
+        text_idx = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.text = transcription_texts[text_idx].strip()
+                text_idx += 1
+        output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        save_metadata_json(metadata, output_dir=output_dir)

easytranscriber-0.2.2/src/easytranscriber/data/collators.py ADDED Viewed

@@ -0,0 +1,57 @@
+import torch
+def transcribe_collate_fn(batch: list[dict]) -> dict:
+    """
+    Collate dataset samples into a single batch for transcription.
+    ``None`` entries in `batch` are dropped before collation.
+    Parameters
+    ----------
+    batch : list of dict
+        Samples from the dataset, each with ``"feature"``,
+        ``"start_time_global"`` and ``"speech_id"``.
+    Returns
+    -------
+    dict
+        ``'features'`` (the per-sample features concatenated along dim 0),
+        ``'start_times'`` and ``'speech_ids'`` (lists in sample order).
+    """
+    # Filter once, then read every field from the surviving samples.
+    samples = [sample for sample in batch if sample is not None]
+    # Concatenate along the existing batch dimension.
+    stacked = torch.cat([sample["feature"] for sample in samples], dim=0)
+    return {
+        "features": stacked,
+        "start_times": [sample["start_time_global"] for sample in samples],
+        "speech_ids": [sample["speech_id"] for sample in samples],
+    }
+def cohere_transcribe_collate_fn(batch: list[dict]) -> dict:
+    """
+    Collate raw-audio samples for Cohere ASR transcription.
+    The waveforms are kept as a plain list so the caller can run Cohere's
+    processor on the whole batch — per-sample processor calls return
+    variable-length features that cannot be stacked. ``None`` entries in
+    `batch` are dropped.
+    Parameters
+    ----------
+    batch : list of dict
+        Samples from the dataset, each with ``"audio"`` (raw waveform),
+        ``"start_time_global"`` and ``"speech_id"``.
+    Returns
+    -------
+    dict
+        ``'audio'`` (list of waveforms), ``'start_times'`` and ``'speech_ids'``,
+        all in sample order.
+    """
+    waveforms, start_times, speech_ids = [], [], []
+    for sample in batch:
+        if sample is None:
+            continue
+        waveforms.append(sample["audio"])
+        start_times.append(sample["start_time_global"])
+        speech_ids.append(sample["speech_id"])
+    return {
+        "audio": waveforms,
+        "start_times": start_times,
+        "speech_ids": speech_ids,
+    }

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber/data/dataset.py RENAMED Viewed

@@ -48,12 +48,14 @@ class StreamingAudioSliceDataset(Dataset):
         processor: Wav2Vec2Processor | WhisperProcessor,
         sample_rate: int = 16000,
         metadata: AudioMetadata | None = None,
+        return_raw_audio: bool = False,
     ):
         self.audio_path = str(audio_path)
         self.chunk_specs = chunk_specs
         self.processor = processor
         self.sample_rate = sample_rate
         self.metadata = metadata
+        self.return_raw_audio = return_raw_audio
         self.processor_attribute = (
             "input_values" if isinstance(processor, Wav2Vec2Processor) else "input_features"
         )
@@ -75,6 +77,14 @@ class StreamingAudioSliceDataset(Dataset):
             sample_rate=self.sample_rate,
         )
+        if self.return_raw_audio:
+            # Caller (e.g. cohere backend) preprocesses with the batch to handle padding.
+            return {
+                "audio": audio,
+                "start_time_global": start_sec,
+                "speech_id": spec["speech_id"],
+            }
         # Convert to tensor and add batch dimension for processor
         if isinstance(self.processor, Wav2Vec2Processor):
             audio = torch.tensor(audio).unsqueeze(0)
@@ -165,6 +175,7 @@ class StreamingAudioFileDataset(Dataset):
         sample_rate: int = 16000,
         chunk_size: int = 30,
         alignment_strategy: str = "chunk",
+        return_raw_audio: bool = False,
     ):
         if isinstance(metadata, AudioMetadata):
             self.metadata = [metadata]
@@ -176,6 +187,7 @@ class StreamingAudioFileDataset(Dataset):
         self.chunk_size = chunk_size
         self.processor = processor
         self.alignment_strategy = alignment_strategy
+        self.return_raw_audio = return_raw_audio
     def _get_speech_chunk_specs(self, metadata: AudioMetadata) -> list[dict]:
         """
@@ -281,6 +293,7 @@ class StreamingAudioFileDataset(Dataset):
             processor=self.processor,
             sample_rate=self.sr,
             metadata=metadata,
+            return_raw_audio=self.return_raw_audio,
         )
         return {

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber/pipelines.py RENAMED Viewed

@@ -5,11 +5,7 @@ import ctranslate2
 import torch
 from easyaligner.data.collators import audiofile_collate_fn, metadata_collate_fn
 from easyaligner.data.datamodel import SpeechSegment
-from easyaligner.data.dataset import (
-    AudioFileDataset,
-    JSONMetadataDataset,
-    StreamingAudioFileDataset,
-)
+from easyaligner.data.dataset import AudioFileDataset, JSONMetadataDataset
 from easyaligner.pipelines import alignment_pipeline, emissions_pipeline, vad_pipeline
 from easyaligner.vad.pyannote import load_vad_model as load_pyannote_vad_model
 from easyaligner.vad.silero import load_vad_model as load_silero_vad_model
@@ -22,15 +18,30 @@ from transformers import (
 from easytranscriber.asr.ct2 import transcribe as ct2_transcribe
 from easytranscriber.asr.hf import transcribe as hf_transcribe
+from easytranscriber.data.dataset import StreamingAudioFileDataset
 from easytranscriber.text.normalization import text_normalizer
 from easytranscriber.utils import hf_to_ct2_converter
 logger = logging.getLogger(__name__)
+def _load_cohere_transcribe():
+    """Import the cohere backend on demand and return its ``transcribe`` function.
+    The import runs when this loader is called rather than when this module is
+    imported, so users who only use the ct2/hf backends never load the cohere
+    backend (which needs ``CohereAsrForConditionalGeneration`` from
+    transformers>=5.4.0 to actually transcribe).
+    """
+    from easytranscriber.asr import cohere as cohere_backend
+    return cohere_backend.transcribe
 # dispatch mapping
 TRANSCRIBE_BACKENDS = {
     "ct2": ct2_transcribe,
     "hf": hf_transcribe,
+    "cohere": _load_cohere_transcribe,
 }
 VAD_BACKENDS = {
@@ -56,10 +67,13 @@ def pipeline(
     task: str = "transcribe",
     beam_size: int = 5,
     max_length: int = 250,
+    max_new_tokens: int = 256,
     repetition_penalty: float = 1.0,
     length_penalty: float = 1.0,
     patience: float = 1.0,
     no_repeat_ngram_size: int = 0,
+    punctuation: bool = True,
+    generate_kwargs: dict | None = None,
     start_wildcard: bool = False,
     end_wildcard: bool = False,
     blank_id: int | None = None,
@@ -103,7 +117,9 @@ def pipeline(
     speeches : list[list[SpeechSegment]], optional
         Existing speech segments for alignment.
     backend : str, optional
-        Backend to use for the transcription model: "ct2" or "hf". Default is "ct2".
+        Backend to use for the transcription model: "ct2", "hf", or "cohere". Default is "ct2".
+        The "cohere" backend requires `transformers>=5.4.0`, `streaming=True`, and an explicit
+        `language` (Cohere has no language detection).
     sample_rate : int, optional
         Sample rate.
     chunk_size : int, optional
@@ -127,7 +143,14 @@ def pipeline(
     repetition_penalty : float, optional
         See HF [source code](https://github.com/huggingface/transformers/blob/v4.57.5/src/transformers/generation/configuration_utils.py#L188-L190) for details.
     max_length : int, optional
-        Maximum length of generated text.
+        Maximum length of generated text. Applies to Whisper backends (ct2, hf).
+    max_new_tokens : int, optional
+        Maximum number of new tokens to generate per chunk. Applies to the cohere backend.
+    punctuation : bool, optional
+        Emit punctuation in Cohere transcriptions. Applies to the cohere backend only.
+    generate_kwargs : dict, optional
+        Extra kwargs forwarded to ``model.generate()`` for the cohere backend
+        (e.g. ``num_beams``, ``length_penalty``).
     start_wildcard : bool, optional
         Add start wildcard to forced alignment.
     end_wildcard : bool, optional
@@ -244,32 +267,68 @@ def pipeline(
     )
     # Step 2: Run Transcription
-    transcription_args = {
-        "language": language,
-        "task": task,
-        "beam_size": beam_size,
-        "max_length": max_length,
-        "repetition_penalty": repetition_penalty,
-        "length_penalty": length_penalty,
-    }
-    if backend == "ct2":
-        model_path = hf_to_ct2_converter(transcription_model, cache_dir=cache_dir)
-        logger.info(f"Loading CTranslate2 model from {model_path}...")
-        model = ctranslate2.models.Whisper(model_path.as_posix(), device=device)
-        transcription_args.update(
-            {
-                "patience": patience,
-                "no_repeat_ngram_size": no_repeat_ngram_size,
-            }
+    dataset_kwargs: dict = {}
+    if backend == "cohere":
+        if language is None:
+            raise ValueError(
+                "The 'cohere' backend requires an explicit `language` — "
+                "CohereAsrForConditionalGeneration does not perform language detection."
+            )
+        if not streaming:
+            raise ValueError(
+                "The 'cohere' backend requires `streaming=True` "
+                "(the non-streaming AudioFileDataset does not support return_raw_audio)."
+            )
+        transcription_args = {
+            "language": language,
+            "max_new_tokens": max_new_tokens,
+            "punctuation": punctuation,
+            "sample_rate": sample_rate,
+            "generate_kwargs": generate_kwargs,
+        }
+        from transformers import AutoProcessor, CohereAsrForConditionalGeneration
+        logger.info(f"Loading Cohere ASR model from {transcription_model}...")
+        model = (
+            CohereAsrForConditionalGeneration.from_pretrained(
+                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+            )
+            .to(device)
+            .eval()
         )
+        processor = AutoProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
+        dataset_kwargs = {"return_raw_audio": True}
     else:
-        logger.info(f"Loading Hugging Face model from {transcription_model}...")
-        model = WhisperForConditionalGeneration.from_pretrained(
-            transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
-        ).to(device)
+        transcription_args = {
+            "language": language,
+            "task": task,
+            "beam_size": beam_size,
+            "max_length": max_length,
+            "repetition_penalty": repetition_penalty,
+            "length_penalty": length_penalty,
+        }
+        if backend == "ct2":
+            model_path = hf_to_ct2_converter(transcription_model, cache_dir=cache_dir)
+            logger.info(f"Loading CTranslate2 model from {model_path}...")
+            model = ctranslate2.models.Whisper(model_path.as_posix(), device=device)
+            transcription_args.update(
+                {
+                    "patience": patience,
+                    "no_repeat_ngram_size": no_repeat_ngram_size,
+                }
+            )
+        else:
+            logger.info(f"Loading Hugging Face model from {transcription_model}...")
+            model = WhisperForConditionalGeneration.from_pretrained(
+                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+            ).to(device)
+        processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
-    processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
     json_dataset = JSONMetadataDataset(
         json_paths=[str(Path(output_vad_dir) / p) for p in json_paths]
     )
@@ -281,6 +340,7 @@ def pipeline(
         sample_rate=sample_rate,
         chunk_size=chunk_size,
         alignment_strategy="chunk",
+        **dataset_kwargs,
     )
     file_dataloader = torch.utils.data.DataLoader(
@@ -293,6 +353,9 @@ def pipeline(
     )
     transcribe = TRANSCRIBE_BACKENDS[backend]
+    if backend == "cohere":
+        transcribe = transcribe()  # lazy-load to avoid importing on older transformers
     transcribe(
         model=model,
         processor=processor,

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber/search/__main__.py RENAMED Viewed

@@ -51,6 +51,18 @@ def main():
     parser.add_argument(
         "--reindex", action="store_true", help="Force full re-index of all JSON files."
     )
+    parser.add_argument(
+        "--index-mode",
+        choices=["chunks", "alignments"],
+        default=None,
+        help=(
+            "How to index transcription JSON files. "
+            "'chunks' indexes VAD chunks produced by the ASR pipeline. "
+            "'alignments' indexes AlignmentSegments, for use with "
+            "easyaligner ground-truth alignment outputs where chunks carry no text. "
+            "If omitted, the mode is detected automatically per file."
+        ),
+    )
     args = parser.parse_args()
     logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
@@ -75,8 +87,14 @@ def main():
     # Initialize database and index
     conn = init_db(args.db)
-    logger.info("Indexing %s ...", args.alignments_dir)
-    indexed, skipped = index_directory(args.alignments_dir, conn, force=args.reindex)
+    logger.info(
+        "Indexing %s (mode: %s) ...",
+        args.alignments_dir,
+        args.index_mode or "auto",
+    )
+    indexed, skipped = index_directory(
+        args.alignments_dir, conn, force=args.reindex, index_mode=args.index_mode
+    )
     logger.info("Indexed %d file(s), skipped %d unchanged.", indexed, skipped)
     # Create and run the app

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber/search/app.py RENAMED Viewed

@@ -73,9 +73,9 @@ def create_app(
         total_pages = max(1, math.ceil(total / per_page))
         return templates.TemplateResponse(
+            request,
             "search.html",
             {
-                "request": request,
                 "query": q,
                 "results": results,
                 "total": total,
@@ -94,9 +94,9 @@ def create_app(
         total_pages = max(1, math.ceil(total / per_page))
         return templates.TemplateResponse(
+            request,
             "documents.html",
             {
-                "request": request,
                 "results": results,
                 "total": total,
                 "page": page,
@@ -116,9 +116,9 @@ def create_app(
             raise HTTPException(status_code=404, detail="Document not found")
         return templates.TemplateResponse(
+            request,
             "document.html",
             {
-                "request": request,
                 "document": doc,
                 "seek_time": t,
                 "query": q,

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber/search/db.py RENAMED Viewed

@@ -10,6 +10,7 @@ CREATE TABLE IF NOT EXISTS documents (
     sample_rate     INTEGER NOT NULL,
     num_speeches    INTEGER NOT NULL,
     num_chunks      INTEGER NOT NULL,
+    index_mode      TEXT NOT NULL DEFAULT 'chunks',
     mtime           REAL NOT NULL,
     indexed_at      TEXT NOT NULL DEFAULT (datetime('now'))
 );

easytranscriber-0.2.2/src/easytranscriber/search/indexer.py ADDED Viewed

@@ -0,0 +1,180 @@
+import logging
+import sqlite3
+from pathlib import Path
+import msgspec
+from easytranscriber.data.datamodel import AudioMetadata
+logger = logging.getLogger(__name__)
+def _detect_index_mode(metadata: "AudioMetadata") -> str:
+    """Decide whether a file should be indexed by chunks or by alignments.
+    Only the first chunk of the first speech is inspected. If its text is
+    ``None`` the result is ``"alignments"`` (the file is taken to be easyaligner
+    ground-truth alignment output, whose chunks carry no text). In every other
+    case, including no speeches or no chunks, the result is ``"chunks"``.
+    """
+    speeches = metadata.speeches
+    if not speeches:
+        return "chunks"
+    leading_chunks = speeches[0].chunks
+    if not leading_chunks:
+        return "chunks"
+    return "alignments" if leading_chunks[0].text is None else "chunks"
+def index_file(conn: sqlite3.Connection, json_path: Path, index_mode: str | None = None) -> bool:
+    """Index one alignment JSON file into the database.
+    Parameters
+    ----------
+    conn : sqlite3.Connection
+        Open database connection. Rows are read by column name, so the
+        connection needs a row factory that supports that.
+    json_path : Path
+        JSON file to index; decoded as ``AudioMetadata``.
+    index_mode : str or None
+        ``"chunks"`` indexes VAD chunks produced by ASR pipelines (chunks without
+        text are skipped). ``"alignments"`` indexes sentence-level
+        AlignmentSegments, as produced by ``easyaligner`` when ground-truth text
+        is aligned to audio (chunks have no text). If ``None`` (default), the
+        mode is detected automatically from the file contents.
+    Returns
+    -------
+    bool
+        True if the file was (re)indexed, False if an entry with the same mtime
+        and the same mode already exists.
+    """
+    mtime = json_path.stat().st_mtime
+    # The file is decoded before the freshness check because auto-detection
+    # needs its contents.
+    metadata = msgspec.json.decode(json_path.read_bytes(), type=AudioMetadata)
+    resolved_mode = _detect_index_mode(metadata) if index_mode is None else index_mode
+    existing = conn.execute(
+        "SELECT id, mtime, index_mode FROM documents WHERE json_path = ?", (str(json_path),)
+    ).fetchone()
+    if existing:
+        unchanged = existing["mtime"] == mtime and existing["index_mode"] == resolved_mode
+        if unchanged:
+            return False
+        # The stored entry is stale (mtime or mode differs): drop it before re-inserting.
+        conn.execute("DELETE FROM documents WHERE id = ?", (existing["id"],))
+    speeches = metadata.speeches or []
+    use_alignments = resolved_mode == "alignments"
+    num_segments = sum(
+        len(speech.alignments) if use_alignments else len(speech.chunks) for speech in speeches
+    )
+    cur = conn.execute(
+        """INSERT INTO documents (audio_path, json_path, duration, sample_rate,
+                                  num_speeches, num_chunks, index_mode, mtime)
+           VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
+        (
+            metadata.audio_path,
+            str(json_path),
+            metadata.duration,
+            metadata.sample_rate,
+            len(speeches),
+            num_segments,
+            resolved_mode,
+            mtime,
+        ),
+    )
+    doc_id = cur.lastrowid
+    if speeches:
+        rows = []
+        for speech_idx, speech in enumerate(speeches):
+            if use_alignments:
+                rows.extend(
+                    (doc_id, speech_idx, seg_idx, seg.text, seg.start, seg.end, seg.duration)
+                    for seg_idx, seg in enumerate(speech.alignments)
+                )
+            else:
+                # Chunks without text are skipped but still consume a chunk index.
+                rows.extend(
+                    (doc_id, speech_idx, chunk_idx, chunk.text, chunk.start, chunk.end, chunk.duration)
+                    for chunk_idx, chunk in enumerate(speech.chunks)
+                    if chunk.text
+                )
+        conn.executemany(
+            """INSERT INTO chunks
+               (document_id, speech_idx, chunk_idx, text, start_time, end_time, duration)
+               VALUES (?, ?, ?, ?, ?, ?, ?)""",
+            rows,
+        )
+    return True
+def index_directory(
+    alignments_dir: Path,
+    conn: sqlite3.Connection,
+    force: bool = False,
+    index_mode: str | None = None,
+) -> tuple[int, int]:
+    """
+    Index all JSON files in the alignments directory.
+    Files that fail to index are logged and counted neither as indexed nor as
+    skipped. After indexing, documents whose JSON file is no longer in the
+    directory are removed. If the directory has no JSON files the function
+    returns early and removes nothing.
+    Parameters
+    ----------
+    alignments_dir : Path
+        Directory searched (non-recursively) for ``*.json`` files.
+    conn : sqlite3.Connection
+        Open database connection. Rows are read by column name, so the
+        connection needs a row factory that supports that.
+    force : bool
+        If True, delete all chunks and documents and rebuild the FTS index
+        before indexing.
+    index_mode : str or None
+        ``"chunks"``, ``"alignments"``, or ``None`` to auto-detect per file.
+        See :func:`index_file`.
+    Returns
+    -------
+    tuple of (int, int)
+        ``(indexed_count, skipped_count)``.
+    """
+    if force:
+        conn.execute("DELETE FROM chunks")
+        conn.execute("DELETE FROM documents")
+        # Rebuild FTS index
+        conn.execute("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')")
+        conn.commit()
+    json_files = sorted(alignments_dir.glob("*.json"))
+    total_files = len(json_files)
+    if not json_files:
+        logger.warning("No JSON files found in %s", alignments_dir)
+        return 0, 0
+    indexed = 0
+    skipped = 0
+    for file_num, json_path in enumerate(json_files, 1):
+        try:
+            was_indexed = index_file(conn, json_path, index_mode=index_mode)
+            if was_indexed:
+                indexed += 1
+            else:
+                skipped += 1
+            status = "indexed" if was_indexed else "skipped (unchanged)"
+            logger.info("[%d/%d] %s — %s", file_num, total_files, json_path.name, status)
+        except Exception:
+            # Best effort: one bad file must not abort the whole directory.
+            logger.exception("[%d/%d] Failed to index %s", file_num, total_files, json_path)
+    # Remove documents whose JSON files no longer exist
+    existing_paths = {str(p) for p in json_files}
+    all_db_paths = conn.execute("SELECT id, json_path FROM documents").fetchall()
+    stale_ids = [r["id"] for r in all_db_paths if r["json_path"] not in existing_paths]
+    if stale_ids:
+        # One single-variable statement per id: a single `IN (?, ?, ...)` clause
+        # would exceed SQLite's per-statement variable limit for large stale sets.
+        conn.executemany(
+            "DELETE FROM documents WHERE id = ?", [(stale_id,) for stale_id in stale_ids]
+        )
+        logger.info("Removed %d stale documents from index", len(stale_ids))
+    conn.commit()
+    return indexed, skipped

{easytranscriber-0.2.0 → easytranscriber-0.2.2/src/easytranscriber.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.0
+Version: 0.2.2
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber
@@ -19,7 +19,7 @@ Requires-Dist: ctranslate2>=4.4.0
 Requires-Dist: msgspec
 Requires-Dist: easyaligner==0.*
 Provides-Extra: search
-Requires-Dist: fastapi>=0.104.0; extra == "search"
+Requires-Dist: fastapi>=0.109.0; extra == "search"
 Requires-Dist: uvicorn[standard]>=0.24.0; extra == "search"
 Requires-Dist: jinja2>=3.1.0; extra == "search"
 Dynamic: license-file
@@ -127,6 +127,9 @@ The documentation is available at [kb-labb.github.io/easytranscriber/](https://k
 * [Text normalization tutorial](https://kb-labb.github.io/easytranscriber/get-started/text-processing.html).
 * [API reference](https://kb-labb.github.io/easytranscriber/reference/).
+> [!TIP]
+> Check out the [`easyaligner`](https://kb-labb.github.io/easyaligner/) library for a user friendly pipeline for forced alignment of text and audio.
 ## Acknowledgements
 `easytranscriber` draws heavy inspiration from [`WhisperX`](https://github.com/m-bain/whisperX) [(Bain et al., 2023)](https://www.isca-archive.org/interspeech_2023/bain23_interspeech.pdf).
@@ -134,3 +137,16 @@ The documentation is available at [kb-labb.github.io/easytranscriber/](https://k
 The forced alignment component of `easytranscriber` is based on Pytorch's forced alignment API, which implements a GPU-accelerated version of the Viterbi algorithm as described in [Pratap et al., 2024](https://jmlr.org/papers/volume25/23-1318/23-1318.pdf#page=8).
 LibriVox for public domain audiobooks used as tutorial examples.
+## Citation
+```
+@online{rekathati2026,
+  author = {Rekathati, Faton},
+  title = {Easytranscriber: {Speech} Recognition with Precise
+    Timestamps},
+  date = {2026-02-26},
+  url = {https://kb-labb.github.io/posts/2026-02-26-easytranscriber/},
+  langid = {en}
+}
+```

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber.egg-info/SOURCES.txt RENAMED Viewed

@@ -10,6 +10,7 @@ src/easytranscriber.egg-info/dependency_links.txt
 src/easytranscriber.egg-info/entry_points.txt
 src/easytranscriber.egg-info/requires.txt
 src/easytranscriber.egg-info/top_level.txt
+src/easytranscriber/asr/cohere.py
 src/easytranscriber/asr/ct2.py
 src/easytranscriber/asr/hf.py
 src/easytranscriber/data/__init__.py

{easytranscriber-0.2.0 → easytranscriber-0.2.2}/src/easytranscriber.egg-info/requires.txt RENAMED Viewed

@@ -11,6 +11,6 @@ msgspec
 easyaligner==0.*
 [search]
-fastapi>=0.104.0
+fastapi>=0.109.0
 uvicorn[standard]>=0.24.0
 jinja2>=3.1.0

easytranscriber-0.2.0/src/easytranscriber/data/collators.py DELETED Viewed

@@ -1,30 +0,0 @@
-import torch
-def transcribe_collate_fn(batch: list[dict]) -> dict:
-    """
-    Collate function for transcription.
-    Parameters
-    ----------
-    batch : list of dict
-        List of samples from the dataset.
-    Returns
-    -------
-    dict
-        Collated batch with 'features', 'start_times', and 'speech_ids'.
-    """
-    # Remove None values
-    speech_ids = [b["speech_id"] for b in batch if b is not None]
-    start_times = [b["start_time_global"] for b in batch if b is not None]
-    batch = [b["feature"] for b in batch if b is not None]
-    # Concat, keep batch dimension
-    batch = torch.cat(batch, dim=0)
-    return {
-        "features": batch,
-        "start_times": start_times,
-        "speech_ids": speech_ids,
-    }

easytranscriber-0.2.0/src/easytranscriber/search/indexer.py DELETED Viewed

@@ -1,128 +0,0 @@
-import logging
-import sqlite3
-from pathlib import Path
-import msgspec
-from easytranscriber.data.datamodel import AudioMetadata
-logger = logging.getLogger(__name__)
-def index_file(conn: sqlite3.Connection, json_path: Path) -> bool:
-    """Index a single alignment JSON file. Returns True if the file was (re)indexed."""
-    mtime = json_path.stat().st_mtime
-    # Check if already indexed with same mtime
-    existing = conn.execute(
-        "SELECT id, mtime FROM documents WHERE json_path = ?", (str(json_path),)
-    ).fetchone()
-    if existing and existing["mtime"] == mtime:
-        return False
-    # Remove stale entry if mtime changed
-    if existing:
-        conn.execute("DELETE FROM documents WHERE id = ?", (existing["id"],))
-    # Parse JSON using the project's own data model
-    raw = json_path.read_bytes()
-    metadata = msgspec.json.decode(raw, type=AudioMetadata)
-    num_speeches = len(metadata.speeches) if metadata.speeches else 0
-    num_chunks = 0
-    if metadata.speeches:
-        for speech in metadata.speeches:
-            num_chunks += len(speech.chunks)
-    # Insert document
-    cur = conn.execute(
-        """INSERT INTO documents (audio_path, json_path, duration, sample_rate,
-                                  num_speeches, num_chunks, mtime)
-           VALUES (?, ?, ?, ?, ?, ?, ?)""",
-        (
-            metadata.audio_path,
-            str(json_path),
-            metadata.duration,
-            metadata.sample_rate,
-            num_speeches,
-            num_chunks,
-            mtime,
-        ),
-    )
-    doc_id = cur.lastrowid
-    # Insert chunks
-    if metadata.speeches:
-        rows = []
-        for speech_idx, speech in enumerate(metadata.speeches):
-            for chunk_idx, chunk in enumerate(speech.chunks):
-                if not chunk.text:
-                    continue
-                rows.append(
-                    (
-                        doc_id,
-                        speech_idx,
-                        chunk_idx,
-                        chunk.text,
-                        chunk.start,
-                        chunk.end,
-                        chunk.duration,
-                    )
-                )
-        conn.executemany(
-            """INSERT INTO chunks
-               (document_id, speech_idx, chunk_idx, text, start_time, end_time, duration)
-               VALUES (?, ?, ?, ?, ?, ?, ?)""",
-            rows,
-        )
-    return True
-def index_directory(
-    alignments_dir: Path, conn: sqlite3.Connection, force: bool = False
-) -> tuple[int, int]:
-    """
-    Index all JSON files in the alignments directory.
-    Returns (indexed_count, skipped_count).
-    """
-    if force:
-        conn.execute("DELETE FROM chunks")
-        conn.execute("DELETE FROM documents")
-        # Rebuild FTS index
-        conn.execute("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')")
-        conn.commit()
-    json_files = sorted(alignments_dir.glob("*.json"))
-    total_files = len(json_files)
-    if not json_files:
-        logger.warning("No JSON files found in %s", alignments_dir)
-        return 0, 0
-    indexed = 0
-    skipped = 0
-    for file_num, json_path in enumerate(json_files, 1):
-        try:
-            was_indexed = index_file(conn, json_path)
-            if was_indexed:
-                indexed += 1
-            else:
-                skipped += 1
-            status = "indexed" if was_indexed else "skipped (unchanged)"
-            logger.info("[%d/%d] %s — %s", file_num, total_files, json_path.name, status)
-        except Exception:
-            logger.exception("[%d/%d] Failed to index %s", file_num, total_files, json_path)
-    # Remove documents whose JSON files no longer exist
-    existing_paths = {str(p) for p in json_files}
-    all_db_paths = conn.execute("SELECT id, json_path FROM documents").fetchall()
-    stale_ids = [r["id"] for r in all_db_paths if r["json_path"] not in existing_paths]
-    if stale_ids:
-        placeholders = ",".join("?" * len(stale_ids))
-        conn.execute(f"DELETE FROM documents WHERE id IN ({placeholders})", stale_ids)
-        logger.info("Removed %d stale documents from index", len(stale_ids))
-    conn.commit()
-    return indexed, skipped