easytranscriber 0.2.2__tar.gz → 0.2.4__tar.gz
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {easytranscriber-0.2.2/src/easytranscriber.egg-info → easytranscriber-0.2.4}/PKG-INFO +1 -1
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/pyproject.toml +1 -1
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/cohere.py +5 -3
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/ct2.py +20 -16
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/asr/hf.py +6 -4
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/pipelines.py +13 -10
- {easytranscriber-0.2.2 → easytranscriber-0.2.4/src/easytranscriber.egg-info}/PKG-INFO +1 -1
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/LICENSE +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/README.md +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/setup.cfg +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/audio.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/__init__.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/collators.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/datamodel.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/data/dataset.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/__init__.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/__main__.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/app.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/db.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/search/indexer.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/text/normalization.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber/utils.py +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/SOURCES.txt +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/dependency_links.txt +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/entry_points.txt +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/requires.txt +0 -0
- {easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/top_level.txt +0 -0
|
@@ -127,9 +127,11 @@ def transcribe(
|
|
|
127
127
|
transcription = processor.batch_decode(outputs, skip_special_tokens=True)
|
|
128
128
|
transcription_texts.extend(transcription)
|
|
129
129
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
130
|
+
global_chunk_idx = 0
|
|
131
|
+
for speech in metadata.speeches:
|
|
132
|
+
for chunk in speech.chunks:
|
|
133
|
+
chunk.text = transcription_texts[global_chunk_idx].strip()
|
|
134
|
+
global_chunk_idx += 1
|
|
133
135
|
|
|
134
136
|
output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
|
|
135
137
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -9,13 +9,13 @@ import logging
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
|
|
11
11
|
import ctranslate2
|
|
12
|
-
import numpy as np
|
|
13
12
|
import torch
|
|
14
|
-
from easyaligner.utils import save_metadata_json
|
|
15
|
-
from easytranscriber.data.collators import transcribe_collate_fn
|
|
13
|
+
from easyaligner.utils import save_metadata_json
|
|
16
14
|
from tqdm import tqdm
|
|
17
15
|
from transformers import WhisperProcessor
|
|
18
16
|
|
|
17
|
+
from easytranscriber.data.collators import transcribe_collate_fn
|
|
18
|
+
|
|
19
19
|
logger = logging.getLogger(__name__)
|
|
20
20
|
|
|
21
21
|
|
|
@@ -91,7 +91,7 @@ def transcribe(
|
|
|
91
91
|
batch_size=batch_size,
|
|
92
92
|
num_workers=num_workers,
|
|
93
93
|
prefetch_factor=prefetch_factor,
|
|
94
|
-
pin_memory=
|
|
94
|
+
pin_memory=False,
|
|
95
95
|
collate_fn=transcribe_collate_fn,
|
|
96
96
|
)
|
|
97
97
|
|
|
@@ -148,12 +148,14 @@ def transcribe(
|
|
|
148
148
|
transcription_texts.extend(transcription)
|
|
149
149
|
|
|
150
150
|
# Update metadata with transcriptions
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
151
|
+
global_chunk_idx = 0
|
|
152
|
+
for speech in metadata.speeches:
|
|
153
|
+
for chunk in speech.chunks:
|
|
154
|
+
chunk.text = transcription_texts[global_chunk_idx].strip()
|
|
154
155
|
if len(language_detections) > 0:
|
|
155
|
-
chunk.language = language_detections[
|
|
156
|
-
chunk.language_prob = language_detections[
|
|
156
|
+
chunk.language = language_detections[global_chunk_idx]["language"]
|
|
157
|
+
chunk.language_prob = language_detections[global_chunk_idx]["probability"]
|
|
158
|
+
global_chunk_idx += 1
|
|
157
159
|
|
|
158
160
|
# Save transcription to file
|
|
159
161
|
output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
|
|
@@ -197,7 +199,7 @@ def lang_detect_only(
|
|
|
197
199
|
batch_size=batch_size,
|
|
198
200
|
num_workers=num_workers,
|
|
199
201
|
prefetch_factor=prefetch_factor,
|
|
200
|
-
pin_memory=
|
|
202
|
+
pin_memory=False,
|
|
201
203
|
collate_fn=transcribe_collate_fn,
|
|
202
204
|
)
|
|
203
205
|
|
|
@@ -205,12 +207,14 @@ def lang_detect_only(
|
|
|
205
207
|
features_ct2 = batch["features"].numpy()
|
|
206
208
|
features_ct2 = ctranslate2.StorageView.from_array(features_ct2)
|
|
207
209
|
languages = detect_language(model, features_ct2)
|
|
208
|
-
language_detections.
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
chunk.
|
|
210
|
+
language_detections.extend(languages)
|
|
211
|
+
|
|
212
|
+
global_chunk_idx = 0
|
|
213
|
+
for speech in metadata.speeches:
|
|
214
|
+
for chunk in speech.chunks:
|
|
215
|
+
chunk.language = language_detections[global_chunk_idx]["language"]
|
|
216
|
+
chunk.language_probability = language_detections[global_chunk_idx]["probability"]
|
|
217
|
+
global_chunk_idx += 1
|
|
214
218
|
|
|
215
219
|
# Save transcription to file
|
|
216
220
|
output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
|
|
@@ -76,7 +76,7 @@ def transcribe(
|
|
|
76
76
|
|
|
77
77
|
for batch in feature_dataloader:
|
|
78
78
|
with torch.inference_mode():
|
|
79
|
-
batch = batch["features"].to(device
|
|
79
|
+
batch = batch["features"].to(device=device, dtype=model.dtype)
|
|
80
80
|
predicted_ids = model.generate(
|
|
81
81
|
batch,
|
|
82
82
|
return_dict_in_generate=True,
|
|
@@ -96,9 +96,11 @@ def transcribe(
|
|
|
96
96
|
|
|
97
97
|
transcription_texts.extend(transcription)
|
|
98
98
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
99
|
+
global_chunk_idx = 0
|
|
100
|
+
for speech in metadata.speeches:
|
|
101
|
+
for chunk in speech.chunks:
|
|
102
|
+
chunk.text = transcription_texts[global_chunk_idx].strip()
|
|
103
|
+
global_chunk_idx += 1
|
|
102
104
|
|
|
103
105
|
# Write final transcription to file with msgspec serialization
|
|
104
106
|
output_path = Path(output_dir) / Path(metadata.audio_path).with_suffix(".json")
|
|
@@ -80,7 +80,6 @@ def pipeline(
|
|
|
80
80
|
word_boundary: str | None = None,
|
|
81
81
|
indent: int = 2,
|
|
82
82
|
ndigits: int = 5,
|
|
83
|
-
batch_size_files: int = 1,
|
|
84
83
|
num_workers_files: int = 2,
|
|
85
84
|
prefetch_factor_files: int = 2,
|
|
86
85
|
batch_size_features: int = 8,
|
|
@@ -163,8 +162,6 @@ def pipeline(
|
|
|
163
162
|
JSON indentation.
|
|
164
163
|
ndigits : int, optional
|
|
165
164
|
Number of digits for rounding.
|
|
166
|
-
batch_size_files : int, optional
|
|
167
|
-
Batch size for files. Recommended to set to 1.
|
|
168
165
|
num_workers_files : int, optional
|
|
169
166
|
Number of workers for file loading.
|
|
170
167
|
prefetch_factor_files : int, optional
|
|
@@ -233,6 +230,9 @@ def pipeline(
|
|
|
233
230
|
# TODO: Support msgpack throughout the pipeline
|
|
234
231
|
json_paths = [Path(p).with_suffix(".json") for p in audio_paths]
|
|
235
232
|
|
|
233
|
+
# fp16 is a GPU optimization; most fp16 ops are unimplemented or slow on CPU.
|
|
234
|
+
torch_dtype = torch.float16 if "cuda" in str(device) else torch.float32
|
|
235
|
+
|
|
236
236
|
if streaming:
|
|
237
237
|
DatasetClass = StreamingAudioFileDataset
|
|
238
238
|
else:
|
|
@@ -257,7 +257,6 @@ def pipeline(
|
|
|
257
257
|
speeches=speeches,
|
|
258
258
|
chunk_size=chunk_size,
|
|
259
259
|
sample_rate=sample_rate,
|
|
260
|
-
batch_size=batch_size_files,
|
|
261
260
|
num_workers=num_workers_files,
|
|
262
261
|
prefetch_factor=prefetch_factor_files,
|
|
263
262
|
save_json=save_json,
|
|
@@ -294,7 +293,7 @@ def pipeline(
|
|
|
294
293
|
logger.info(f"Loading Cohere ASR model from {transcription_model}...")
|
|
295
294
|
model = (
|
|
296
295
|
CohereAsrForConditionalGeneration.from_pretrained(
|
|
297
|
-
transcription_model, torch_dtype=
|
|
296
|
+
transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
|
|
298
297
|
)
|
|
299
298
|
.to(device)
|
|
300
299
|
.eval()
|
|
@@ -324,7 +323,7 @@ def pipeline(
|
|
|
324
323
|
else:
|
|
325
324
|
logger.info(f"Loading Hugging Face model from {transcription_model}...")
|
|
326
325
|
model = WhisperForConditionalGeneration.from_pretrained(
|
|
327
|
-
transcription_model, torch_dtype=
|
|
326
|
+
transcription_model, torch_dtype=torch_dtype, cache_dir=cache_dir
|
|
328
327
|
).to(device)
|
|
329
328
|
|
|
330
329
|
processor = WhisperProcessor.from_pretrained(transcription_model, cache_dir=cache_dir)
|
|
@@ -345,7 +344,7 @@ def pipeline(
|
|
|
345
344
|
|
|
346
345
|
file_dataloader = torch.utils.data.DataLoader(
|
|
347
346
|
file_dataset,
|
|
348
|
-
batch_size=
|
|
347
|
+
batch_size=1,
|
|
349
348
|
shuffle=False,
|
|
350
349
|
collate_fn=audiofile_collate_fn,
|
|
351
350
|
num_workers=num_workers_files,
|
|
@@ -372,7 +371,11 @@ def pipeline(
|
|
|
372
371
|
json_paths=[str(Path(output_transcriptions_dir) / p) for p in json_paths]
|
|
373
372
|
)
|
|
374
373
|
|
|
375
|
-
model =
|
|
374
|
+
model = (
|
|
375
|
+
AutoModelForCTC.from_pretrained(emissions_model, cache_dir=cache_dir)
|
|
376
|
+
.to(device)
|
|
377
|
+
.to(torch_dtype)
|
|
378
|
+
)
|
|
376
379
|
processor = Wav2Vec2Processor.from_pretrained(emissions_model, cache_dir=cache_dir)
|
|
377
380
|
|
|
378
381
|
if blank_id is None:
|
|
@@ -388,7 +391,6 @@ def pipeline(
|
|
|
388
391
|
sample_rate=sample_rate,
|
|
389
392
|
chunk_size=chunk_size,
|
|
390
393
|
alignment_strategy=alignment_strategy,
|
|
391
|
-
batch_size_files=batch_size_files,
|
|
392
394
|
num_workers_files=num_workers_files,
|
|
393
395
|
prefetch_factor_files=prefetch_factor_files,
|
|
394
396
|
batch_size_features=batch_size_features,
|
|
@@ -399,6 +401,7 @@ def pipeline(
|
|
|
399
401
|
save_emissions=save_emissions,
|
|
400
402
|
return_emissions=False,
|
|
401
403
|
output_dir=output_emissions_dir,
|
|
404
|
+
device=device,
|
|
402
405
|
)
|
|
403
406
|
|
|
404
407
|
# Step 4: Perform Alignment
|
|
@@ -407,7 +410,7 @@ def pipeline(
|
|
|
407
410
|
)
|
|
408
411
|
json_dataloader = torch.utils.data.DataLoader(
|
|
409
412
|
json_dataset,
|
|
410
|
-
batch_size=
|
|
413
|
+
batch_size=1,
|
|
411
414
|
shuffle=False,
|
|
412
415
|
collate_fn=metadata_collate_fn,
|
|
413
416
|
num_workers=num_workers_files,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{easytranscriber-0.2.2 → easytranscriber-0.2.4}/src/easytranscriber.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|