spatelier 0.3.0__py3-none-any.whl
- analytics/__init__.py +1 -0
- analytics/reporter.py +497 -0
- cli/__init__.py +1 -0
- cli/app.py +147 -0
- cli/audio.py +129 -0
- cli/cli_analytics.py +320 -0
- cli/cli_utils.py +282 -0
- cli/error_handlers.py +122 -0
- cli/files.py +299 -0
- cli/update.py +325 -0
- cli/video.py +823 -0
- cli/worker.py +615 -0
- core/__init__.py +1 -0
- core/analytics_dashboard.py +368 -0
- core/base.py +303 -0
- core/base_service.py +69 -0
- core/config.py +345 -0
- core/database_service.py +116 -0
- core/decorators.py +263 -0
- core/error_handler.py +210 -0
- core/file_tracker.py +254 -0
- core/interactive_cli.py +366 -0
- core/interfaces.py +166 -0
- core/job_queue.py +437 -0
- core/logger.py +79 -0
- core/package_updater.py +469 -0
- core/progress.py +228 -0
- core/service_factory.py +295 -0
- core/streaming.py +299 -0
- core/worker.py +765 -0
- database/__init__.py +1 -0
- database/connection.py +265 -0
- database/metadata.py +516 -0
- database/models.py +288 -0
- database/repository.py +592 -0
- database/transcription_storage.py +219 -0
- modules/__init__.py +1 -0
- modules/audio/__init__.py +5 -0
- modules/audio/converter.py +197 -0
- modules/video/__init__.py +16 -0
- modules/video/converter.py +191 -0
- modules/video/fallback_extractor.py +334 -0
- modules/video/services/__init__.py +18 -0
- modules/video/services/audio_extraction_service.py +274 -0
- modules/video/services/download_service.py +852 -0
- modules/video/services/metadata_service.py +190 -0
- modules/video/services/playlist_service.py +445 -0
- modules/video/services/transcription_service.py +491 -0
- modules/video/transcription_service.py +385 -0
- modules/video/youtube_api.py +397 -0
- spatelier/__init__.py +33 -0
- spatelier-0.3.0.dist-info/METADATA +260 -0
- spatelier-0.3.0.dist-info/RECORD +59 -0
- spatelier-0.3.0.dist-info/WHEEL +5 -0
- spatelier-0.3.0.dist-info/entry_points.txt +2 -0
- spatelier-0.3.0.dist-info/licenses/LICENSE +21 -0
- spatelier-0.3.0.dist-info/top_level.txt +7 -0
- utils/__init__.py +1 -0
- utils/helpers.py +250 -0
modules/video/transcription_service.py

@@ -0,0 +1,385 @@
+"""
+Transcription service for video files.
+
+This module provides automatic transcription capabilities using OpenAI Whisper.
+Supports multiple models for speed vs accuracy tradeoffs.
+"""
+
+import json
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+from loguru import logger
+
+from core.config import Config, TranscriptionConfig
+
+try:
+    import whisper
+    from faster_whisper import WhisperModel
+
+    WHISPER_AVAILABLE = True
+except ImportError:
+    WHISPER_AVAILABLE = False
+
+# Global model cache to avoid reloading models
+_MODEL_CACHE = {}
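The import guard degrades gracefully when the optional Whisper dependencies are missing. A minimal caller-side sketch of consuming that flag; the import path follows the wheel's `top_level.txt` layout, and the helper itself is hypothetical:

```python
# Hypothetical pre-flight check; assumes the wheel's "modules" top-level package.
from modules.video.transcription_service import WHISPER_AVAILABLE

def can_transcribe() -> bool:
    # The flag is resolved once at import time, so this check is cheap
    # and avoids the ImportError raised later in _load_model().
    return WHISPER_AVAILABLE

if not can_transcribe():
    print("Transcription extras missing: pip install spatelier[transcription]")
```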
+
+
+class TranscriptionService:
+    """
+    Video transcription service using OpenAI Whisper.
+
+    Supports both openai-whisper and faster-whisper for different speed/accuracy needs.
+    """
+
+    def __init__(
+        self, config: Config, transcription_config: Optional[TranscriptionConfig] = None
+    ):
+        """
+        Initialize the transcription service.
+
+        Args:
+            config: Main configuration instance
+            transcription_config: Transcription-specific configuration (optional)
+        """
+        self.config = config
+        self.transcription_config = transcription_config or config.transcription
+
+        self.model_size = self.transcription_config.default_model
+        self.use_faster_whisper = self.transcription_config.use_faster_whisper
+        self.device = self.transcription_config.device
+        self.compute_type = self.transcription_config.compute_type
+        self.model = None
+        self._load_model()
+
+    def _load_model(self):
+        """Load the Whisper model with caching."""
+        if not WHISPER_AVAILABLE:
+            raise ImportError(
+                "Whisper dependencies not available. Install with: pip install spatelier[transcription]"
+            )
+
+        try:
+            # Create cache key based on model configuration
+            cache_key = f"{self.model_size}_{self.device}_{self.compute_type}_{self.use_faster_whisper}"
+
+            # Check if model is already cached
+            if cache_key in _MODEL_CACHE:
+                logger.info(f"Using cached Whisper model: {self.model_size}")
+                self.model = _MODEL_CACHE[cache_key]
+                return
+
+            # Load new model
+            if self.use_faster_whisper:
+                logger.info(f"Loading faster-whisper model: {self.model_size}")
+                self.model = WhisperModel(
+                    self.model_size, device=self.device, compute_type=self.compute_type
+                )
+            else:
+                logger.info(f"Loading openai-whisper model: {self.model_size}")
+                self.model = whisper.load_model(self.model_size)
+
+            # Cache the model
+            _MODEL_CACHE[cache_key] = self.model
+            logger.info("Whisper model loaded and cached successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to load Whisper model: {e}")
+            raise
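Because `_MODEL_CACHE` lives at module level, every `TranscriptionService` instance with the same model size, device, compute type, and backend shares one loaded model. A self-contained sketch of the same caching pattern; the names here are illustrative, not part of the package:

```python
_CACHE: dict = {}
loads = []

def load_cached(model_size: str, device: str) -> object:
    """Return a cached model handle, loading at most once per configuration."""
    key = f"{model_size}_{device}"
    if key not in _CACHE:
        loads.append(key)          # stands in for the expensive model load
        _CACHE[key] = object()
    return _CACHE[key]

a = load_cached("base", "cpu")
b = load_cached("base", "cpu")   # hits the cache
c = load_cached("small", "cpu")  # new configuration, new load
assert a is b and a is not c and loads == ["base_cpu", "small_cpu"]
```

The trade-off is that cached models are never evicted, so a long-running worker that cycles through model sizes will hold all of them in memory.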
+
+    def transcribe_video(self, video_path: Path, language: str = "en") -> Dict:
+        """
+        Transcribe a video file.
+
+        Args:
+            video_path: Path to the video file
+            language: Language code (e.g., 'en', 'es', 'fr')
+
+        Returns:
+            Dictionary with transcription results
+        """
+        if not video_path.exists():
+            raise FileNotFoundError(f"Video file not found: {video_path}")
+
+        logger.info(f"Starting transcription of: {video_path}")
+        start_time = time.time()
+
+        try:
+            if self.use_faster_whisper:
+                result = self._transcribe_with_faster_whisper(video_path, language)
+            else:
+                result = self._transcribe_with_openai_whisper(video_path, language)
+
+            processing_time = time.time() - start_time
+            result["processing_time"] = processing_time
+            result["model_used"] = f"whisper-{self.model_size}"
+            result["language"] = language
+
+            logger.info(f"Transcription completed in {processing_time:.1f}s")
+            return result
+
+        except Exception as e:
+            logger.error(f"Transcription failed: {e}")
+            raise
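A sketch of driving the service end to end. It assumes `Config()` is constructible with defaults and that a local `talk.mp4` exists; neither is shown in this diff:

```python
from pathlib import Path

from core.config import Config
from modules.video.transcription_service import TranscriptionService

config = Config()  # assumption: defaults include a .transcription section
service = TranscriptionService(config)  # loads (or reuses) the Whisper model

result = service.transcribe_video(Path("talk.mp4"), language="en")
print(f"{result['model_used']}: {result['processing_time']:.1f}s")
for seg in result["segments"][:3]:
    print(f"[{seg['start']:6.1f} -> {seg['end']:6.1f}] {seg['text']}")
```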
+
+    def _transcribe_with_faster_whisper(self, video_path: Path, language: str) -> Dict:
+        """Transcribe using faster-whisper (CTranslate2 backend: faster, lower memory)."""
+        result = self.model.transcribe(
+            str(video_path), language=language, word_timestamps=True
+        )
+
+        # faster-whisper returns (segments, info) tuple
+        segments, info = result
+
+        # Convert segments to our format
+        transcription_segments = []
+        for segment in segments:
+            transcription_segments.append(
+                {
+                    "start": segment.start,
+                    "end": segment.end,
+                    "text": segment.text.strip(),
+                    "confidence": getattr(
+                        segment, "avg_logprob", 0.0
+                    ),  # avg_logprob is a log-probability, not a 0-1 confidence
+                }
+            )
+
+        return {
+            "segments": transcription_segments,
+            "language": info.language,
+            "language_probability": info.language_probability,
+            "duration": info.duration,
+        }
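Two backend details are easy to miss here: faster-whisper's `segments` is a lazy generator, so the actual decoding happens inside the loop above, and `avg_logprob` is an average log-probability, meaning the stored "confidence" is a value at or below zero rather than a 0-1 score. One illustrative way to map it, not something the package does:

```python
import math

def logprob_to_score(avg_logprob: float) -> float:
    """Heuristic: exp() of an average per-token log-probability approximates
    the geometric-mean token probability, giving a rough 0-1 score."""
    return math.exp(min(avg_logprob, 0.0))

assert logprob_to_score(0.0) == 1.0
assert 0.36 < logprob_to_score(-1.0) < 0.37  # e**-1 is about 0.368
assert logprob_to_score(-5.0) < 0.01         # very low-confidence segment
```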
+
+    def _transcribe_with_openai_whisper(self, video_path: Path, language: str) -> Dict:
+        """Transcribe using openai-whisper (reference implementation, slower)."""
+        result = self.model.transcribe(
+            str(video_path), language=language, word_timestamps=True
+        )
+
+        # Convert to our format
+        transcription_segments = []
+        for segment in result["segments"]:
+            transcription_segments.append(
+                {
+                    "start": segment["start"],
+                    "end": segment["end"],
+                    "text": segment["text"].strip(),
+                    "confidence": segment.get("avg_logprob", 0.0),
+                }
+            )
+
+        return {
+            "segments": transcription_segments,
+            "language": result.get("language", language),
+            "language_probability": 1.0,  # openai-whisper doesn't provide this
+            "duration": transcription_segments[-1]["end"] if transcription_segments else 0.0,  # not reported directly by openai-whisper
+        }
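Both private methods normalize to the same segment shape, so downstream code never needs to know which backend ran. A small sketch that renders such a result as a plain transcript; the sample data is fabricated for illustration:

```python
def render_transcript(result: dict) -> str:
    """Render a normalized transcription dict as '[MM:SS] text' lines."""
    lines = []
    for seg in result["segments"]:
        minutes, seconds = divmod(int(seg["start"]), 60)
        lines.append(f"[{minutes:02d}:{seconds:02d}] {seg['text']}")
    return "\n".join(lines)

sample = {
    "segments": [
        {"start": 0.0, "end": 2.5, "text": "Hello there.", "confidence": -0.2},
        {"start": 62.0, "end": 65.1, "text": "One minute in.", "confidence": -0.3},
    ],
    "language": "en",
}
print(render_transcript(sample))
# [00:00] Hello there.
# [01:02] One minute in.
```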
+
+    def get_available_models(self) -> List[str]:
+        """Get list of available Whisper models."""
+        return ["tiny", "base", "small", "medium", "large"]
+
+    def get_model_info(self) -> Dict:
+        """Get information about the current model."""
+        return {
+            "model_size": self.model_size,
+            "use_faster_whisper": self.use_faster_whisper,
+            "available_models": self.get_available_models(),
+        }
+
+
+class TranscriptionStorage:
+    """
+    Handles storage and retrieval of transcriptions in MongoDB.
+    """
+
+    def __init__(self, mongo_db):
+        """
+        Initialize transcription storage.
+
+        Args:
+            mongo_db: MongoDB database instance
+        """
+        self.db = mongo_db
+        self.collection = self.db.transcriptions
+
+    def store_transcription(
+        self, video_id: Union[str, int], transcription_data: Dict
+    ) -> str:
+        """
+        Store transcription data in MongoDB.
+
+        Args:
+            video_id: ID of the video file (will be converted to int for consistency)
+            transcription_data: Transcription results from Whisper
+
+        Returns:
+            MongoDB document ID
+        """
+        # Ensure video_id is always stored as an integer for consistency
+        video_id_int = int(video_id) if isinstance(video_id, (str, int)) else video_id
+
+        document = {
+            "video_id": video_id_int,
+            "created_at": time.time(),
+            "segments": transcription_data["segments"],
+            "language": transcription_data["language"],
+            "language_probability": transcription_data.get("language_probability", 1.0),
+            "duration": transcription_data.get("duration", 0.0),
+            "model_used": transcription_data.get("model_used", "unknown"),
+            "processing_time": transcription_data.get("processing_time", 0.0),
+            "total_segments": len(transcription_data["segments"]),
+            "full_text": " ".join(
+                [seg["text"] for seg in transcription_data["segments"]]
+            ),
+        }
+
+        result = self.collection.insert_one(document)
+        logger.info(f"Stored transcription for video {video_id}: {result.inserted_id}")
+        return str(result.inserted_id)
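A sketch of storing a result with a plain PyMongo connection. The package's own connection handling lives in `database/connection.py` and may differ; the URI and database name here are assumptions:

```python
from pymongo import MongoClient

from modules.video.transcription_service import TranscriptionStorage

client = MongoClient("mongodb://localhost:27017")  # assumed local instance
storage = TranscriptionStorage(client["spatelier"])

doc_id = storage.store_transcription(
    video_id="42",  # str or int; stored as int 42
    transcription_data={
        "segments": [{"start": 0.0, "end": 1.2, "text": "hi", "confidence": -0.1}],
        "language": "en",
    },
)
print(doc_id)  # hex string form of the inserted ObjectId
```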
+
+    def get_transcription(self, video_id: Union[str, int]) -> Optional[Dict]:
+        """Get transcription for a video."""
+        # Ensure consistent integer lookup
+        video_id_int = int(video_id) if isinstance(video_id, (str, int)) else video_id
+        return self.collection.find_one({"video_id": video_id_int})
+
+    def search_transcriptions(self, query: str, limit: int = 10) -> List[Dict]:
+        """
+        Search transcriptions by text content.
+
+        Args:
+            query: Search query
+            limit: Maximum number of results
+
+        Returns:
+            List of matching transcriptions
+        """
+        # Create text index if it doesn't exist
+        try:
+            self.collection.create_index([("full_text", "text")])
+        except Exception:
+            pass  # Index might already exist
+
+        # Search using MongoDB text search
+        results = (
+            self.collection.find(
+                {"$text": {"$search": query}}, {"score": {"$meta": "textScore"}}
+            )
+            .sort([("score", {"$meta": "textScore"})])  # $meta sort is required to order by relevance
+            .limit(limit)
+        )
+
+        return list(results)
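PyMongo's `create_index` is idempotent, so repeating it per search is safe but costs a round-trip; creating the index once at startup is the usual approach. The raw query this method wraps looks roughly like this, assuming the same local connection as above:

```python
from pymongo import MongoClient, TEXT

transcriptions = MongoClient("mongodb://localhost:27017")["spatelier"]["transcriptions"]

# One-time setup: the $text operator requires a text index on the field.
transcriptions.create_index([("full_text", TEXT)])

# Equivalent to search_transcriptions("neural networks", limit=5)
hits = (
    transcriptions.find(
        {"$text": {"$search": "neural networks"}},
        {"score": {"$meta": "textScore"}},
    )
    .sort([("score", {"$meta": "textScore"})])
    .limit(5)
)
for hit in hits:
    print(hit["video_id"], round(hit["score"], 2))
```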
+
+    def generate_srt_subtitle(
+        self, transcription_data: Dict, output_path: Path
+    ) -> bool:
+        """
+        Generate SRT subtitle file from transcription data.
+
+        Args:
+            transcription_data: Transcription data with segments
+            output_path: Path to save SRT file
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            segments = transcription_data.get("segments", [])
+            if not segments:
+                logger.warning("No segments found in transcription data")
+                return False
+
+            with open(output_path, "w", encoding="utf-8") as f:
+                for i, segment in enumerate(segments, 1):
+                    start_time = self._format_srt_time(segment["start"])
+                    end_time = self._format_srt_time(segment["end"])
+                    text = segment["text"].strip()
+
+                    f.write(f"{i}\n")
+                    f.write(f"{start_time} --> {end_time}\n")
+                    f.write(f"{text}\n\n")
+
+            logger.info(f"Generated SRT subtitle file: {output_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to generate SRT subtitle: {e}")
+            return False
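For two fabricated segments, the cues written by the loop above look like this:

```python
segments = [
    {"start": 0.0, "end": 2.5, "text": "Hello there."},
    {"start": 2.5, "end": 5.0, "text": "Welcome back."},
]
# Resulting .srt content (cue number, comma-separated milliseconds, blank line):
#
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello there.
#
# 2
# 00:00:02,500 --> 00:00:05,000
# Welcome back.
```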
+
+    def generate_vtt_subtitle(
+        self, transcription_data: Dict, output_path: Path
+    ) -> bool:
+        """
+        Generate VTT subtitle file from transcription data.
+
+        Args:
+            transcription_data: Transcription data with segments
+            output_path: Path to save VTT file
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            segments = transcription_data.get("segments", [])
+            if not segments:
+                logger.warning("No segments found in transcription data")
+                return False
+
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write("WEBVTT\n\n")
+
+                for segment in segments:
+                    start_time = self._format_vtt_time(segment["start"])
+                    end_time = self._format_vtt_time(segment["end"])
+                    text = segment["text"].strip()
+
+                    f.write(f"{start_time} --> {end_time}\n")
+                    f.write(f"{text}\n\n")
+
+            logger.info(f"Generated VTT subtitle file: {output_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to generate VTT subtitle: {e}")
+            return False
+
+    def _format_srt_time(self, seconds: float) -> str:
+        """Format time for SRT format (HH:MM:SS,mmm)."""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        millisecs = int((seconds % 1) * 1000)
+        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"
+
+    def _format_vtt_time(self, seconds: float) -> str:
+        """Format time for VTT format (HH:MM:SS.mmm)."""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        millisecs = int((seconds % 1) * 1000)
+        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millisecs:03d}"
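The two formatters differ only in the millisecond separator: SRT uses a comma, VTT a dot. Worked example for 3661.5 seconds:

```python
seconds = 3661.5
assert int(seconds // 3600) == 1          # hours
assert int((seconds % 3600) // 60) == 1   # minutes
assert int(seconds % 60) == 1             # whole seconds
assert int((seconds % 1) * 1000) == 500   # milliseconds
# SRT: "01:01:01,500"    VTT: "01:01:01.500"
```

One caveat: `int()` truncates, so fractions that are not exactly representable in binary can lose a millisecond (for example, `int(0.29 * 1000) == 289`); `round()` would avoid that.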
+
+    def get_analytics(self) -> Dict:
+        """Get analytics about stored transcriptions."""
+        pipeline = [
+            {
+                "$group": {
+                    "_id": None,
+                    "total_transcriptions": {"$sum": 1},
+                    "total_duration": {"$sum": "$duration"},
+                    "avg_processing_time": {"$avg": "$processing_time"},
+                    "languages": {"$addToSet": "$language"},
+                    "models_used": {"$addToSet": "$model_used"},
+                }
+            }
+        ]
+
+        result = list(self.collection.aggregate(pipeline))
+        if result:
+            return result[0]
+        return {}
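The single `$group` with `_id: None` folds the whole collection into one summary document, so callers get either one dict or `{}`. Shape of a typical result, with fabricated values:

```python
example = {
    "_id": None,                     # single aggregation bucket
    "total_transcriptions": 12,
    "total_duration": 5431.7,        # seconds of transcribed media
    "avg_processing_time": 48.2,     # seconds per job
    "languages": ["en", "es"],       # distinct values via $addToSet
    "models_used": ["whisper-base", "whisper-small"],
}
```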