PyPI - easytranscriber - Versions diffs - 0.2.2__tar.gz → 0.2.4__tar.gz - Mend

easytranscriber 0.2.2 (tar.gz) → 0.2.4 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{easytranscriber-0.2.2/src/easytranscriber.egg-info → easytranscriber-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.2
+Version: 0.2.4
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber

{easytranscriber-0.2.2 → easytranscriber-0.2.4}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
 build-backend = "setuptools.build_meta"
 [project]
-version = "0.2.2"
+version = "0.2.4"
 name = "easytranscriber"
 requires-python = ">= 3.10"
 description = "Speech recognition with accurate word-level timestamps."

{easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/cohere.py RENAMED Viewed

@@ -127,9 +127,11 @@ def transcribe(
                 transcription = processor.batch_decode(outputs, skip_special_tokens=True)
                 transcription_texts.extend(transcription)
-        for i, speech in enumerate(metadata.speeches):
-            for j, chunk in enumerate(speech.chunks):
-                chunk.text = transcription_texts[j].strip()
+        global_chunk_idx = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.text = transcription_texts[global_chunk_idx].strip()
+                global_chunk_idx += 1
         output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
         output_path.parent.mkdir(parents=True, exist_ok=True)

{easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/ct2.py RENAMED Viewed

@@ -9,13 +9,13 @@ import logging
 from pathlib import Path
 import ctranslate2
-import numpy as np
 import torch
-from easyaligner.utils import save_metadata_json, save_metadata_msgpack
-from easytranscriber.data.collators import transcribe_collate_fn
+from easyaligner.utils import save_metadata_json
 from tqdm import tqdm
 from transformers import WhisperProcessor
+from easytranscriber.data.collators import transcribe_collate_fn
 logger = logging.getLogger(__name__)
@@ -91,7 +91,7 @@ def transcribe(
             batch_size=batch_size,
             num_workers=num_workers,
             prefetch_factor=prefetch_factor,
-            pin_memory=True,
+            pin_memory=False,
             collate_fn=transcribe_collate_fn,
         )
@@ -148,12 +148,14 @@ def transcribe(
             transcription_texts.extend(transcription)
         # Update metadata with transcriptions
-        for i, speech in enumerate(metadata.speeches):
-            for j, chunk in enumerate(speech.chunks):
-                chunk.text = transcription_texts[j].strip()
+        global_chunk_idx = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.text = transcription_texts[global_chunk_idx].strip()
                 if len(language_detections) > 0:
-                    chunk.language = language_detections[j]["language"]
-                    chunk.language_prob = language_detections[j]["probability"]
+                    chunk.language = language_detections[global_chunk_idx]["language"]
+                    chunk.language_prob = language_detections[global_chunk_idx]["probability"]
+                global_chunk_idx += 1
         # Save transcription to file
         output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
@@ -197,7 +199,7 @@ def lang_detect_only(
             batch_size=batch_size,
             num_workers=num_workers,
             prefetch_factor=prefetch_factor,
-            pin_memory=True,
+            pin_memory=False,
             collate_fn=transcribe_collate_fn,
         )
@@ -205,12 +207,14 @@ def lang_detect_only(
             features_ct2 = batch["features"].numpy()
             features_ct2 = ctranslate2.StorageView.from_array(features_ct2)
             languages = detect_language(model, features_ct2)
-            language_detections.append(languages)
-        for i, speech in enumerate(metadata.speeches):
-            for j, chunk in enumerate(speech.chunks):
-                chunk.language = language_detections[j]["language"]
-                chunk.language_probability = language_detections[j]["probability"]
+            language_detections.extend(languages)
+        global_chunk_idx = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.language = language_detections[global_chunk_idx]["language"]
+                chunk.language_probability = language_detections[global_chunk_idx]["probability"]
+                global_chunk_idx += 1
         # Save transcription to file
         output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")

{easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/hf.py RENAMED Viewed

@@ -76,7 +76,7 @@ def transcribe(
         for batch in feature_dataloader:
             with torch.inference_mode():
-                batch = batch["features"].to(device).half()
+                batch = batch["features"].to(device=device, dtype=model.dtype)
                 predicted_ids = model.generate(
                     batch,
                     return_dict_in_generate=True,
@@ -96,9 +96,11 @@ def transcribe(
                 transcription_texts.extend(transcription)
-        for i, speech in enumerate(metadata.speeches):
-            for j, chunk in enumerate(speech.chunks):
-                chunk.text = transcription_texts[j].strip()
+        global_chunk_idx = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.text = transcription_texts[global_chunk_idx].strip()
+                global_chunk_idx += 1
         # Write final transcription to file with msgspec serialization
         output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")

{easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/pipelines.py RENAMED Viewed

@@ -80,7 +80,6 @@ def pipeline(
     word_boundary: str | None = None,
     indent: int = 2,
     ndigits: int = 5,
-    batch_size_files: int = 1,
     num_workers_files: int = 2,
     prefetch_factor_files: int = 2,
     batch_size_features: int = 8,
@@ -163,8 +162,6 @@ def pipeline(
         JSON indentation.
     ndigits : int, optional
         Number of digits for rounding.
-    batch_size_files : int, optional
-        Batch size for files. Recommended to set to 1.
     num_workers_files : int, optional
         Number of workers for file loading.
     prefetch_factor_files : int, optional
@@ -233,6 +230,9 @@ def pipeline(
     # TODO: Support msgpack throughout the pipeline
     json_paths = [Path(p).with_suffix(".json") for p in audio_paths]
+    # fp16 is a GPU optimization; most fp16 ops are unimplemented or slow on CPU.
+    torch_dtype = torch.float16 if "cuda" in str(device) else torch.float32
     if streaming:
         DatasetClass = StreamingAudioFileDataset
     else:
@@ -257,7 +257,6 @@ def pipeline(
         speeches=speeches,
         chunk_size=chunk_size,
         sample_rate=sample_rate,
-        batch_size=batch_size_files,
         num_workers=num_workers_files,
         prefetch_factor=prefetch_factor_files,
         save_json=save_json,
@@ -294,7 +293,7 @@ def pipeline(
         logger.info(f"Loading Cohere ASR model from {transcription_model}...")
         model = (
             CohereAsrForConditionalGeneration.from_pretrained(
-                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+                transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
             )
             .to(device)
             .eval()
@@ -324,7 +323,7 @@ def pipeline(
         else:
             logger.info(f"Loading Hugging Face model from {transcription_model}...")
             model = WhisperForConditionalGeneration.from_pretrained(
-                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+                transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
             ).to(device)
         processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
@@ -345,7 +344,7 @@ def pipeline(
     file_dataloader = torch.utils.data.DataLoader(
         file_dataset,
-        batch_size=batch_size_files,
+        batch_size=1,
         shuffle=False,
         collate_fn=audiofile_collate_fn,
         num_workers=num_workers_files,
@@ -372,7 +371,11 @@ def pipeline(
         json_paths=[str(Path(output_transcriptions_dir) / p) for p in json_paths]
     )
-    model = AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir).to("cuda").half()
+    model = (
+        AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir)
+        .to(device)
+        .to(torch_dtype)
+    )
     processor = Wav2Vec2Processor.from_pretrained(emissions_model, cache_dir=cache_dir)
     if blank_id is None:
@@ -388,7 +391,6 @@ def pipeline(
         sample_rate=sample_rate,
         chunk_size=chunk_size,
         alignment_strategy=alignment_strategy,
-        batch_size_files=batch_size_files,
         num_workers_files=num_workers_files,
         prefetch_factor_files=prefetch_factor_files,
         batch_size_features=batch_size_features,
@@ -399,6 +401,7 @@ def pipeline(
         save_emissions=save_emissions,
         return_emissions=False,
         output_dir=output_emissions_dir,
+        device=device,
     )
     # Step 4: Perform Alignment
@@ -407,7 +410,7 @@ def pipeline(
     )
     json_dataloader = torch.utils.data.DataLoader(
         json_dataset,
-        batch_size=batch_size_files,
+        batch_size=1,
         shuffle=False,
         collate_fn=metadata_collate_fn,
         num_workers=num_workers_files,

{easytranscriber-0.2.2 → easytranscriber-0.2.4/src/easytranscriber.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.2
+Version: 0.2.4
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber