easytranscriber 0.2.2__tar.gz → 0.2.4__tar.gz

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (27)
  1. {easytranscriber-0.2.2/src/easytranscriber.egg-info → easytranscriber-0.2.4}/PKG-INFO +1 -1
  2. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/pyproject.toml +1 -1
  3. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/cohere.py +5 -3
  4. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/ct2.py +20 -16
  5. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/hf.py +6 -4
  6. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/pipelines.py +13 -10
  7. {easytranscriber-0.2.2 → easytranscriber-0.2.4/src/easytranscriber.egg-info}/PKG-INFO +1 -1
  8. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/LICENSE +0 -0
  9. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/README.md +0 -0
  10. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/setup.cfg +0 -0
  11. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/audio.py +0 -0
  12. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/__init__.py +0 -0
  13. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/collators.py +0 -0
  14. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/datamodel.py +0 -0
  15. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/dataset.py +0 -0
  16. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/__init__.py +0 -0
  17. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/__main__.py +0 -0
  18. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/app.py +0 -0
  19. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/db.py +0 -0
  20. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/indexer.py +0 -0
  21. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/text/normalization.py +0 -0
  22. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/utils.py +0 -0
  23. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/SOURCES.txt +0 -0
  24. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/dependency_links.txt +0 -0
  25. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/entry_points.txt +0 -0
  26. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/requires.txt +0 -0
  27. {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easytranscriber
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Speech recognition with accurate word-level timestamps.
5
5
  Author: Faton Rekathati
6
6
  Project-URL: Repository, https://github.com/kb-labb/easytranscriber
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
- version = "0.2.2"
6
+ version = "0.2.4"
7
7
  name = "easytranscriber"
8
8
  requires-python = ">= 3.10"
9
9
  description = "Speech recognition with accurate word-level timestamps."
@@ -127,9 +127,11 @@ def transcribe(
127
127
  transcription = processor.batch_decode(outputs, skip_special_tokens=True)
128
128
  transcription_texts.extend(transcription)
129
129
 
130
- for i, speech in enumerate(metadata.speeches):
131
- for j, chunk in enumerate(speech.chunks):
132
- chunk.text = transcription_texts[j].strip()
130
+ global_chunk_idx = 0
131
+ for speech in metadata.speeches:
132
+ for chunk in speech.chunks:
133
+ chunk.text = transcription_texts[global_chunk_idx].strip()
134
+ global_chunk_idx += 1
133
135
 
134
136
  output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
135
137
  output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -9,13 +9,13 @@ import logging
9
9
  from pathlib import Path
10
10
 
11
11
  import ctranslate2
12
- import numpy as np
13
12
  import torch
14
- from easyaligner.utils import save_metadata_json, save_metadata_msgpack
15
- from easytranscriber.data.collators import transcribe_collate_fn
13
+ from easyaligner.utils import save_metadata_json
16
14
  from tqdm import tqdm
17
15
  from transformers import WhisperProcessor
18
16
 
17
+ from easytranscriber.data.collators import transcribe_collate_fn
18
+
19
19
  logger = logging.getLogger(__name__)
20
20
 
21
21
 
@@ -91,7 +91,7 @@ def transcribe(
91
91
  batch_size=batch_size,
92
92
  num_workers=num_workers,
93
93
  prefetch_factor=prefetch_factor,
94
- pin_memory=True,
94
+ pin_memory=False,
95
95
  collate_fn=transcribe_collate_fn,
96
96
  )
97
97
 
@@ -148,12 +148,14 @@ def transcribe(
148
148
  transcription_texts.extend(transcription)
149
149
 
150
150
  # Update metadata with transcriptions
151
- for i, speech in enumerate(metadata.speeches):
152
- for j, chunk in enumerate(speech.chunks):
153
- chunk.text = transcription_texts[j].strip()
151
+ global_chunk_idx = 0
152
+ for speech in metadata.speeches:
153
+ for chunk in speech.chunks:
154
+ chunk.text = transcription_texts[global_chunk_idx].strip()
154
155
  if len(language_detections) > 0:
155
- chunk.language = language_detections[j]["language"]
156
- chunk.language_prob = language_detections[j]["probability"]
156
+ chunk.language = language_detections[global_chunk_idx]["language"]
157
+ chunk.language_prob = language_detections[global_chunk_idx]["probability"]
158
+ global_chunk_idx += 1
157
159
 
158
160
  # Save transcription to file
159
161
  output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
@@ -197,7 +199,7 @@ def lang_detect_only(
197
199
  batch_size=batch_size,
198
200
  num_workers=num_workers,
199
201
  prefetch_factor=prefetch_factor,
200
- pin_memory=True,
202
+ pin_memory=False,
201
203
  collate_fn=transcribe_collate_fn,
202
204
  )
203
205
 
@@ -205,12 +207,14 @@ def lang_detect_only(
205
207
  features_ct2 = batch["features"].numpy()
206
208
  features_ct2 = ctranslate2.StorageView.from_array(features_ct2)
207
209
  languages = detect_language(model, features_ct2)
208
- language_detections.append(languages)
209
-
210
- for i, speech in enumerate(metadata.speeches):
211
- for j, chunk in enumerate(speech.chunks):
212
- chunk.language = language_detections[j]["language"]
213
- chunk.language_probability = language_detections[j]["probability"]
210
+ language_detections.extend(languages)
211
+
212
+ global_chunk_idx = 0
213
+ for speech in metadata.speeches:
214
+ for chunk in speech.chunks:
215
+ chunk.language = language_detections[global_chunk_idx]["language"]
216
+ chunk.language_probability = language_detections[global_chunk_idx]["probability"]
217
+ global_chunk_idx += 1
214
218
 
215
219
  # Save transcription to file
216
220
  output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
@@ -76,7 +76,7 @@ def transcribe(
76
76
 
77
77
  for batch in feature_dataloader:
78
78
  with torch.inference_mode():
79
- batch = batch["features"].to(device).half()
79
+ batch = batch["features"].to(device=device, dtype=model.dtype)
80
80
  predicted_ids = model.generate(
81
81
  batch,
82
82
  return_dict_in_generate=True,
@@ -96,9 +96,11 @@ def transcribe(
96
96
 
97
97
  transcription_texts.extend(transcription)
98
98
 
99
- for i, speech in enumerate(metadata.speeches):
100
- for j, chunk in enumerate(speech.chunks):
101
- chunk.text = transcription_texts[j].strip()
99
+ global_chunk_idx = 0
100
+ for speech in metadata.speeches:
101
+ for chunk in speech.chunks:
102
+ chunk.text = transcription_texts[global_chunk_idx].strip()
103
+ global_chunk_idx += 1
102
104
 
103
105
  # Write final transcription to file with msgspec serialization
104
106
  output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
@@ -80,7 +80,6 @@ def pipeline(
80
80
  word_boundary: str | None = None,
81
81
  indent: int = 2,
82
82
  ndigits: int = 5,
83
- batch_size_files: int = 1,
84
83
  num_workers_files: int = 2,
85
84
  prefetch_factor_files: int = 2,
86
85
  batch_size_features: int = 8,
@@ -163,8 +162,6 @@ def pipeline(
163
162
  JSON indentation.
164
163
  ndigits : int, optional
165
164
  Number of digits for rounding.
166
- batch_size_files : int, optional
167
- Batch size for files. Recommended to set to 1.
168
165
  num_workers_files : int, optional
169
166
  Number of workers for file loading.
170
167
  prefetch_factor_files : int, optional
@@ -233,6 +230,9 @@ def pipeline(
233
230
  # TODO: Support msgpack throughout the pipeline
234
231
  json_paths = [Path(p).with_suffix(".json") for p in audio_paths]
235
232
 
233
+ # fp16 is a GPU optimization; most fp16 ops are unimplemented or slow on CPU.
234
+ torch_dtype = torch.float16 if "cuda" in str(device) else torch.float32
235
+
236
236
  if streaming:
237
237
  DatasetClass = StreamingAudioFileDataset
238
238
  else:
@@ -257,7 +257,6 @@ def pipeline(
257
257
  speeches=speeches,
258
258
  chunk_size=chunk_size,
259
259
  sample_rate=sample_rate,
260
- batch_size=batch_size_files,
261
260
  num_workers=num_workers_files,
262
261
  prefetch_factor=prefetch_factor_files,
263
262
  save_json=save_json,
@@ -294,7 +293,7 @@ def pipeline(
294
293
  logger.info(f"Loading Cohere ASR model from {transcription_model}...")
295
294
  model = (
296
295
  CohereAsrForConditionalGeneration.from_pretrained(
297
- transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
296
+ transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
298
297
  )
299
298
  .to(device)
300
299
  .eval()
@@ -324,7 +323,7 @@ def pipeline(
324
323
  else:
325
324
  logger.info(f"Loading Hugging Face model from {transcription_model}...")
326
325
  model = WhisperForConditionalGeneration.from_pretrained(
327
- transcription_model, torch_dtype=torch.float16, cache_dir=cache_dir
326
+ transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
328
327
  ).to(device)
329
328
 
330
329
  processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
@@ -345,7 +344,7 @@ def pipeline(
345
344
 
346
345
  file_dataloader = torch.utils.data.DataLoader(
347
346
  file_dataset,
348
- batch_size=batch_size_files,
347
+ batch_size=1,
349
348
  shuffle=False,
350
349
  collate_fn=audiofile_collate_fn,
351
350
  num_workers=num_workers_files,
@@ -372,7 +371,11 @@ def pipeline(
372
371
  json_paths=[str(Path(output_transcriptions_dir) / p) for p in json_paths]
373
372
  )
374
373
 
375
- model = AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir).to("cuda").half()
374
+ model = (
375
+ AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir)
376
+ .to(device)
377
+ .to(torch_dtype)
378
+ )
376
379
  processor = Wav2Vec2Processor.from_pretrained(emissions_model, cache_dir=cache_dir)
377
380
 
378
381
  if blank_id is None:
@@ -388,7 +391,6 @@ def pipeline(
388
391
  sample_rate=sample_rate,
389
392
  chunk_size=chunk_size,
390
393
  alignment_strategy=alignment_strategy,
391
- batch_size_files=batch_size_files,
392
394
  num_workers_files=num_workers_files,
393
395
  prefetch_factor_files=prefetch_factor_files,
394
396
  batch_size_features=batch_size_features,
@@ -399,6 +401,7 @@ def pipeline(
399
401
  save_emissions=save_emissions,
400
402
  return_emissions=False,
401
403
  output_dir=output_emissions_dir,
404
+ device=device,
402
405
  )
403
406
 
404
407
  # Step 4: Perform Alignment
@@ -407,7 +410,7 @@ def pipeline(
407
410
  )
408
411
  json_dataloader = torch.utils.data.DataLoader(
409
412
  json_dataset,
410
- batch_size=batch_size_files,
413
+ batch_size=1,
411
414
  shuffle=False,
412
415
  collate_fn=metadata_collate_fn,
413
416
  num_workers=num_workers_files,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easytranscriber
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Speech recognition with accurate word-level timestamps.
5
5
  Author: Faton Rekathati
6
6
  Project-URL: Repository, https://github.com/kb-labb/easytranscriber
File without changes