easytranscriber 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {easytranscriber-0.2.3/src/easytranscriber.egg-info → easytranscriber-0.2.4}/PKG-INFO +1 -1
  2. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/pyproject.toml +1 -1
  3. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/asr/cohere.py +5 -3
  4. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/asr/ct2.py +4 -3
  5. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/asr/hf.py +1 -1
  6. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/pipelines.py +11 -3
  7. {easytranscriber-0.2.3 → easytranscriber-0.2.4/src/easytranscriber.egg-info}/PKG-INFO +1 -1
  8. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/LICENSE +0 -0
  9. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/README.md +0 -0
  10. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/setup.cfg +0 -0
  11. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/audio.py +0 -0
  12. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/data/__init__.py +0 -0
  13. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/data/collators.py +0 -0
  14. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/data/datamodel.py +0 -0
  15. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/data/dataset.py +0 -0
  16. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/search/__init__.py +0 -0
  17. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/search/__main__.py +0 -0
  18. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/search/app.py +0 -0
  19. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/search/db.py +0 -0
  20. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/search/indexer.py +0 -0
  21. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/text/normalization.py +0 -0
  22. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber/utils.py +0 -0
  23. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/SOURCES.txt +0 -0
  24. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/dependency_links.txt +0 -0
  25. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/entry_points.txt +0 -0
  26. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/requires.txt +0 -0
  27. {easytranscriber-0.2.3 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easytranscriber
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Speech recognition with accurate word-level timestamps.
5
5
  Author: Faton Rekathati
6
6
  Project-URL: Repository, https://github.com/kb-labb/easytranscriber
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
- version = "0.2.3"
6
+ version = "0.2.4"
7
7
  name = "easytranscriber"
8
8
  requires-python = ">= 3.10"
9
9
  description = "Speech recognition with accurate word-level timestamps."
@@ -127,9 +127,11 @@ def transcribe(
127
127
  transcription = processor.batch_decode(outputs, skip_special_tokens=True)
128
128
  transcription_texts.extend(transcription)
129
129
 
130
- for i, speech in enumerate(metadata.speeches):
131
- for j, chunk in enumerate(speech.chunks):
132
- chunk.text = transcription_texts[j].strip()
130
+ global_chunk_idx = 0
131
+ for speech in metadata.speeches:
132
+ for chunk in speech.chunks:
133
+ chunk.text = transcription_texts[global_chunk_idx].strip()
134
+ global_chunk_idx += 1
133
135
 
134
136
  output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
135
137
  output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -11,10 +11,11 @@ from pathlib import Path
11
11
  import ctranslate2
12
12
  import torch
13
13
  from easyaligner.utils import save_metadata_json
14
- from easytranscriber.data.collators import transcribe_collate_fn
15
14
  from tqdm import tqdm
16
15
  from transformers import WhisperProcessor
17
16
 
17
+ from easytranscriber.data.collators import transcribe_collate_fn
18
+
18
19
  logger = logging.getLogger(__name__)
19
20
 
20
21
 
@@ -90,7 +91,7 @@ def transcribe(
90
91
  batch_size=batch_size,
91
92
  num_workers=num_workers,
92
93
  prefetch_factor=prefetch_factor,
93
- pin_memory=True,
94
+ pin_memory=False,
94
95
  collate_fn=transcribe_collate_fn,
95
96
  )
96
97
 
@@ -198,7 +199,7 @@ def lang_detect_only(
198
199
  batch_size=batch_size,
199
200
  num_workers=num_workers,
200
201
  prefetch_factor=prefetch_factor,
201
- pin_memory=True,
202
+ pin_memory=False,
202
203
  collate_fn=transcribe_collate_fn,
203
204
  )
204
205
 
@@ -76,7 +76,7 @@ def transcribe(
76
76
 
77
77
  for batch in feature_dataloader:
78
78
  with torch.inference_mode():
79
- batch = batch["features"].to(device).half()
79
+ batch = batch["features"].to(device=device, dtype=model.dtype)
80
80
  predicted_ids = model.generate(
81
81
  batch,
82
82
  return_dict_in_generate=True,
@@ -230,6 +230,9 @@ def pipeline(
230
230
  # TODO: Support msgpack throughout the pipeline
231
231
  json_paths = [Path(p).with_suffix(".json") for p in audio_paths]
232
232
 
233
+ # fp16 is a GPU optimization; most fp16 ops are unimplemented or slow on CPU.
234
+ torch_dtype = torch.float16 if "cuda" in str(device) else torch.float32
235
+
233
236
  if streaming:
234
237
  DatasetClass = StreamingAudioFileDataset
235
238
  else:
@@ -290,7 +293,7 @@ def pipeline(
290
293
  logger.info(f"Loading Cohere ASR model from {transcription_model}...")
291
294
  model = (
292
295
  CohereAsrForConditionalGeneration.from_pretrained(
293
- transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
296
+ transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
294
297
  )
295
298
  .to(device)
296
299
  .eval()
@@ -320,7 +323,7 @@ def pipeline(
320
323
  else:
321
324
  logger.info(f"Loading Hugging Face model from {transcription_model}...")
322
325
  model = WhisperForConditionalGeneration.from_pretrained(
323
- transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
326
+ transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
324
327
  ).to(device)
325
328
 
326
329
  processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
@@ -368,7 +371,11 @@ def pipeline(
368
371
  json_paths=[str(Path(output_transcriptions_dir) / p) for p in json_paths]
369
372
  )
370
373
 
371
- model = AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir).to("cuda").half()
374
+ model = (
375
+ AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir)
376
+ .to(device)
377
+ .to(torch_dtype)
378
+ )
372
379
  processor = Wav2Vec2Processor.from_pretrained(emissions_model, cache_dir=cache_dir)
373
380
 
374
381
  if blank_id is None:
@@ -394,6 +401,7 @@ def pipeline(
394
401
  save_emissions=save_emissions,
395
402
  return_emissions=False,
396
403
  output_dir=output_emissions_dir,
404
+ device=device,
397
405
  )
398
406
 
399
407
  # Step 4: Perform Alignment
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easytranscriber
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Speech recognition with accurate word-level timestamps.
5
5
  Author: Faton Rekathati
6
6
  Project-URL: Repository, https://github.com/kb-labb/easytranscriber
File without changes