PyPI - easytranscriber - Versions diffs - 0.2.3__tar.gz → 0.2.4__tar.gz - Mend

easytranscriber 0.2.3.tar.gz → 0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{easytranscriber-0.2.3/src/easytranscriber.egg-info → easytranscriber-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.3
+Version: 0.2.4
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber

{easytranscriber-0.2.3 → easytranscriber-0.2.4}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
 build-backend = "setuptools.build_meta"
 [project]
-version = "0.2.3"
+version = "0.2.4"
 name = "easytranscriber"
 requires-python = ">= 3.10"
 description = "Speech recognition with accurate word-level timestamps."

{easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/asr/cohere.py RENAMED Viewed

@@ -127,9 +127,11 @@ def transcribe(
                 transcription = processor.batch_decode(outputs, skip_special_tokens=True)
                 transcription_texts.extend(transcription)
-        for i, speech in enumerate(metadata.speeches):
-            for j, chunk in enumerate(speech.chunks):
-                chunk.text = transcription_texts[j].strip()
+        global_chunk_idx = 0
+        for speech in metadata.speeches:
+            for chunk in speech.chunks:
+                chunk.text = transcription_texts[global_chunk_idx].strip()
+                global_chunk_idx += 1
         output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
         output_path.parent.mkdir(parents=True, exist_ok=True)

{easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/asr/ct2.py RENAMED Viewed

@@ -11,10 +11,11 @@ from pathlib import Path
 import ctranslate2
 import torch
 from easyaligner.utils import save_metadata_json
-from easytranscriber.data.collators import transcribe_collate_fn
 from tqdm import tqdm
 from transformers import WhisperProcessor
+from easytranscriber.data.collators import transcribe_collate_fn
 logger = logging.getLogger(__name__)
@@ -90,7 +91,7 @@ def transcribe(
             batch_size=batch_size,
             num_workers=num_workers,
             prefetch_factor=prefetch_factor,
-            pin_memory=True,
+            pin_memory=False,
             collate_fn=transcribe_collate_fn,
         )
@@ -198,7 +199,7 @@ def lang_detect_only(
             batch_size=batch_size,
             num_workers=num_workers,
             prefetch_factor=prefetch_factor,
-            pin_memory=True,
+            pin_memory=False,
             collate_fn=transcribe_collate_fn,
         )

{easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/asr/hf.py RENAMED Viewed

@@ -76,7 +76,7 @@ def transcribe(
         for batch in feature_dataloader:
             with torch.inference_mode():
-                batch = batch["features"].to(device).half()
+                batch = batch["features"].to(device=device, dtype=model.dtype)
                 predicted_ids = model.generate(
                     batch,
                     return_dict_in_generate=True,

{easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/pipelines.py RENAMED Viewed

@@ -230,6 +230,9 @@ def pipeline(
     # TODO: Support msgpack throughout the pipeline
     json_paths = [Path(p).with_suffix(".json") for p in audio_paths]
+    # fp16 is a GPU optimization; most fp16 ops are unimplemented or slow on CPU.
+    torch_dtype = torch.float16 if "cuda" in str(device) else torch.float32
     if streaming:
         DatasetClass = StreamingAudioFileDataset
     else:
@@ -290,7 +293,7 @@ def pipeline(
         logger.info(f"Loading Cohere ASR model from {transcription_model}...")
         model = (
             CohereAsrForConditionalGeneration.from_pretrained(
-                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+                transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
             )
             .to(device)
             .eval()
@@ -320,7 +323,7 @@ def pipeline(
         else:
             logger.info(f"Loading Hugging Face model from {transcription_model}...")
             model = WhisperForConditionalGeneration.from_pretrained(
-                transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
+                transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
             ).to(device)
         processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
@@ -368,7 +371,11 @@ def pipeline(
         json_paths=[str(Path(output_transcriptions_dir) / p) for p in json_paths]
     )
-    model = AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir).to("cuda").half()
+    model = (
+        AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir)
+        .to(device)
+        .to(torch_dtype)
+    )
     processor = Wav2Vec2Processor.from_pretrained(emissions_model, cache_dir=cache_dir)
     if blank_id is None:
@@ -394,6 +401,7 @@ def pipeline(
         save_emissions=save_emissions,
         return_emissions=False,
         output_dir=output_emissions_dir,
+        device=device,
     )
     # Step 4: Perform Alignment

{easytranscriber-0.2.3 → easytranscriber-0.2.4/src/easytranscriber.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easytranscriber
-Version: 0.2.3
+Version: 0.2.4
 Summary: Speech recognition with accurate word-level timestamps.
 Author: Faton Rekathati
 Project-URL: Repository, https://github.com/kb-labb/easytranscriber