oneword-ai 0.1.0__py3-none-any.whl

This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package exactly as it appears in its public registry.
@@ -0,0 +1,368 @@
+ """
+ Core subtitle generation engine using Whisper.
+ Supports multiple subtitle modes and language detection.
+ """
+ import whisper
+ import datetime
+ import torch
+ from pathlib import Path
+ from whisper.audio import load_audio
+ from typing import Optional, Literal
+
+ SubtitleMode = Literal["oneword", "twoword", "phrase"]
+
+
+ class SubtitleGenerator:
+     """Generate SRT subtitles from video/audio using Whisper."""
+
+     def __init__(self, model_name: str = "medium"):
+         """
+         Initialize the subtitle generator.
+
+         Args:
+             model_name: Whisper model to use (tiny, base, small, medium, large)
+                 or a Hugging Face model id such as "org/model"
+         """
+         self.model_name = model_name
+         self.model = None
+         self.model_type = None  # set to "openai" or "huggingface" by load_model()
+         # Prevent CPU overload
+         torch.set_num_threads(4)
+
+     def load_model(self, status_callback=None):
+         """Load the Whisper model (OpenAI or Hugging Face)."""
+         if self.model is None:
+             if "Oriserve" in self.model_name or "/" in self.model_name:
+                 print(f"📦 Loading Hugging Face model: {self.model_name}...")
+                 from transformers import pipeline
+                 from tqdm.auto import tqdm as tqdm_auto
+
+                 # Set up custom tqdm callback to capture download progress
+                 original_tqdm = tqdm_auto
+
+                 class ProgressCapture(original_tqdm):
+                     def __init__(self, *args, **kwargs):
+                         super().__init__(*args, **kwargs)
+
+                     def display(self, msg=None, pos=None):
+                         super().display(msg, pos)
+                         # Capture progress for status callback
+                         if status_callback and self.total:
+                             downloaded = self.n
+                             total = self.total
+                             rate = self.format_dict.get('rate', 0) or 0
+                             elapsed = self.format_dict.get('elapsed', 0) or 0
+
+                             # Format like: "68.5M/6.17G [01:29<2:06:23, 805kB/s]"
+                             downloaded_str = self.format_sizeof(downloaded, 'B', 1024)
+                             total_str = self.format_sizeof(total, 'B', 1024)
+                             rate_str = self.format_sizeof(rate, 'B/s', 1024) if rate else "0B/s"
+
+                             elapsed_str = self.format_interval(elapsed)
+
+                             # Calculate remaining time
+                             if rate > 0:
+                                 remaining = (total - downloaded) / rate
+                                 remaining_str = self.format_interval(remaining)
+                             else:
+                                 remaining_str = "??:??"
+
+                             progress_msg = f"📦 Downloading: {downloaded_str}/{total_str} [{elapsed_str}<{remaining_str}, {rate_str}]"
+                             status_callback(progress_msg)
+
+                 # Monkey-patch tqdm for this model load
+                 import tqdm.auto
+                 original_tqdm_ref = tqdm.auto.tqdm
+                 tqdm.auto.tqdm = ProgressCapture
+
+                 try:
+                     if status_callback:
+                         status_callback("📦 Checking model files... (Download starting if needed)")
+
+                     device = "cuda" if torch.cuda.is_available() else "cpu"
+                     self.model = pipeline(
+                         "automatic-speech-recognition",
+                         model=self.model_name,
+                         device=device
+                     )
+                     self.model_type = "huggingface"
+                 finally:
+                     # Restore original tqdm
+                     tqdm.auto.tqdm = original_tqdm_ref
+
+             else:
+                 if status_callback:
+                     # Rough download size of the official OpenAI checkpoints
+                     if "large" in self.model_name:
+                         size_estimate = "~3GB"
+                     elif "medium" in self.model_name:
+                         size_estimate = "~1.5GB"
+                     else:
+                         size_estimate = "under 500MB"
+                     status_callback(f"📦 Downloading Model ({size_estimate}, 5-15 min) - One-time only!")
+
+                 print(f"📦 Loading OpenAI Whisper model: {self.model_name}...")
+                 self.model = whisper.load_model(self.model_name)
+                 self.model_type = "openai"
+
+             if status_callback:
+                 status_callback("✅ Model Ready! Transcribing...")
+
+     def transcribe(
+         self,
+         file_path: str,
+         language: Optional[str] = None,
+         progress_callback=None,
+         status_callback=None
+     ) -> dict:
+         """
+         Transcribe an audio/video file.
+
+         Args:
+             file_path: Path to audio/video file
+             language: Language code (hi, en, ur, es) or None for auto-detect
+             progress_callback: Optional callback function for progress updates
+             status_callback: Optional callback for status text updates
+
+         Returns:
+             Whisper-style result dict with "text" and "segments" keys.
+         """
+         self.load_model(status_callback)
+
+         print("🎵 Reading audio...")
+         audio = load_audio(str(file_path))
+         audio_duration = len(audio) / 16000  # Whisper loads audio at 16 kHz
+         print(f"⏱ Audio Duration: {audio_duration:.2f} seconds")
+
+         print(f"\n🧠 Transcribing started ({self.model_type})...\n")
+
+         # Threaded transcription to support progress callback
+         if progress_callback:
+             import threading
+             import time
+
+             result_container = {}
+             exception_container = {}
+
+             # Transcription function wrapper
+             def transcribe_task():
+                 try:
+                     if self.model_type == "huggingface":
+                         # Hugging Face Pipeline
+                         generate_kwargs = {"language": language} if language else {}
+                         # For Hindi2Hinglish, language might need to be 'hi' or auto
+
+                         out = self.model(
+                             str(file_path),
+                             return_timestamps="word",
+                             generate_kwargs=generate_kwargs
+                         )
+
+                         # Normalize output to match OpenAI structure
+                         # Transformers returns: {'text': '...', 'chunks': [{'text': ' word', 'timestamp': (start, end)}, ...]}
+                         chunks = out.get("chunks", [])
+                         words = []
+                         for chunk in chunks:
+                             # Timestamp is a tuple (start, end)
+                             ts = chunk.get("timestamp")
+                             if ts:
+                                 start, end = ts
+                                 words.append({
+                                     "word": chunk["text"].strip(),
+                                     "start": start,
+                                     "end": end
+                                 })
+
+                         # Create a single segment with all words
+                         segment = {
+                             "text": out["text"],
+                             "start": words[0]["start"] if words else 0,
+                             "end": words[-1]["end"] if words else 0,
+                             "words": words
+                         }
+
+                         result_container['result'] = {
+                             "text": out["text"],
+                             "segments": [segment]
+                         }
+
+                     else:
+                         # OpenAI Whisper
+                         transcribe_options = {"word_timestamps": True, "verbose": False}
+                         if language:
+                             transcribe_options["language"] = language
+
+                         result_container['result'] = self.model.transcribe(str(file_path), **transcribe_options)
+
+                 except Exception as e:
+                     exception_container['error'] = e
+
+             # Start background thread
+             thread = threading.Thread(target=transcribe_task)
+             thread.start()
+
+             # Simulate progress
+             # Estimate speed: HF models can be slower or faster depending on implementation
+             # Generic estimate logic
+             speed_mult = 4.0 if self.model_type == "huggingface" else 1.5
+             if self.model_name in ['tiny', 'base']:
+                 speed_mult = 1.0
+
+             # Guard against zero-length audio to avoid division by zero
+             estimated_time = max(audio_duration * speed_mult, 1.0)
+
+             start_time = time.time()
+             while thread.is_alive():
+                 elapsed = time.time() - start_time
+                 progress = min(95, (elapsed / estimated_time) * 100)
+                 progress_callback(progress)
+                 time.sleep(0.5)
+
+             thread.join()
+
+             if 'error' in exception_container:
+                 raise exception_container['error']
+
+             progress_callback(100)
+             return result_container['result']
+
+         else:
+             # Sync execution (CLI usage mostly)
+             if self.model_type == "huggingface":
+                 # Pass the language through generate_kwargs, as in the threaded path
+                 generate_kwargs = {"language": language} if language else {}
+                 out = self.model(
+                     str(file_path),
+                     return_timestamps="word",
+                     generate_kwargs=generate_kwargs
+                 )
+                 chunks = out.get("chunks", [])
+                 words = []
+                 for chunk in chunks:
+                     ts = chunk.get("timestamp")
+                     if ts:
+                         words.append({
+                             "word": chunk["text"].strip(),
+                             "start": ts[0],
+                             "end": ts[1]
+                         })
+                 return {
+                     "text": out["text"],
+                     "segments": [{
+                         "text": out["text"],
+                         "start": words[0]["start"] if words else 0,
+                         "end": words[-1]["end"] if words else 0,
+                         "words": words
+                     }]
+                 }
+             else:
+                 transcribe_options = {"word_timestamps": True, "verbose": False}
+                 if language:
+                     transcribe_options["language"] = language
+                 return self.model.transcribe(str(file_path), **transcribe_options)
+
+     # (format_timestamp and generate_srt are defined below)
+
+     def process(
+         self,
+         input_path: str,
+         output_path: Optional[str] = None,
+         language: Optional[str] = None,
+         mode: SubtitleMode = "oneword",
+         progress_callback=None,
+         status_callback=None
+     ) -> str:
+         """
+         Full processing pipeline: transcribe and generate SRT.
+
+         Args:
+             input_path: Path to input video/audio
+             output_path: Path for output SRT (auto-generated if None)
+             language: Language code or None for auto-detect
+             mode: Subtitle mode
+             progress_callback: Optional progress callback
+             status_callback: Optional callback for status text updates
+
+         Returns:
+             Path to generated SRT file
+         """
+         video_path = Path(input_path)
+         if not video_path.exists():
+             raise FileNotFoundError(f"File {input_path} not found")
+
+         # Auto-generate output path
+         if output_path is None:
+             output_path = video_path.parent / f"{video_path.stem}_{mode}_subs.srt"
+
+         # Transcribe
+         result = self.transcribe(input_path, language, progress_callback, status_callback)
+
+         # Generate the SRT file and return its path
+         self.generate_srt(result, str(output_path), mode)
+
+         return str(output_path)
+
+     @staticmethod
+     def format_timestamp(seconds: float) -> str:
+         """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)."""
+         td = datetime.timedelta(seconds=seconds)
+         total_seconds = int(td.total_seconds())
+         hours = total_seconds // 3600
+         minutes = (total_seconds % 3600) // 60
+         secs = total_seconds % 60
+         millis = int(td.microseconds / 1000)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
+     def generate_srt(
+         self,
+         result: dict,
+         output_path: str,
+         mode: SubtitleMode = "oneword"
+     ):
+         """
+         Generate SRT file from transcription result.
+
+         Args:
+             result: Whisper transcription result
+             output_path: Path to save SRT file
+             mode: Subtitle mode - oneword, twoword, or phrase
+         """
+         print("✍ Writing subtitles...")
+
+         with open(output_path, "w", encoding="utf-8") as f:
+             counter = 1
+
+             if mode == "oneword":
+                 # One word per subtitle
+                 for segment in result["segments"]:
+                     for word_data in segment["words"]:
+                         word = word_data["word"].strip().replace(",", "")
+                         if not word:
+                             continue
+
+                         start = word_data["start"]
+                         end = word_data["end"]
+
+                         f.write(f"{counter}\n")
+                         f.write(f"{self.format_timestamp(start)} --> {self.format_timestamp(end)}\n")
+                         f.write(f"{word}\n\n")
+                         counter += 1
+
+             elif mode == "twoword":
+                 # Two words per subtitle (punch effect)
+                 for segment in result["segments"]:
+                     words = segment["words"]
+                     i = 0
+                     while i < len(words):
+                         # Get up to 2 words
+                         word_group = words[i:i+2]
+                         text = " ".join([w["word"].strip().replace(",", "") for w in word_group if w["word"].strip()])
+
+                         if not text:
+                             i += 2
+                             continue
+
+                         start = word_group[0]["start"]
+                         end = word_group[-1]["end"]
+
+                         f.write(f"{counter}\n")
+                         f.write(f"{self.format_timestamp(start)} --> {self.format_timestamp(end)}\n")
+                         f.write(f"{text}\n\n")
+                         counter += 1
+                         i += 2
+
+             elif mode == "phrase":
+                 # Full segment text (phrase mode)
+                 for segment in result["segments"]:
+                     text = segment["text"].strip()
+                     if not text:
+                         continue
+
+                     start = segment["start"]
+                     end = segment["end"]
+
+                     f.write(f"{counter}\n")
+                     f.write(f"{self.format_timestamp(start)} --> {self.format_timestamp(end)}\n")
+                     f.write(f"{text}\n\n")
+                     counter += 1
+
+         print(f"✅ Success! Subtitles saved to: {output_path}")
+
+
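For orientation, a minimal usage sketch of the class in this file follows. The input filename, the chosen model, and the assumption that SubtitleGenerator is importable from this module are illustrative only; they are not taken from the package itself.

    # Hypothetical example -- assumes SubtitleGenerator is imported from the module shown above.
    generator = SubtitleGenerator(model_name="medium")  # or a Hugging Face model id like "org/model"

    srt_path = generator.process(
        "input.mp4",             # placeholder path; any audio/video file ffmpeg can decode
        language=None,           # None lets Whisper auto-detect the language
        mode="oneword",          # "oneword", "twoword", or "phrase"
        status_callback=print,   # a plain print works as a status callback
    )
    print(f"Subtitles written to: {srt_path}")

Each cue written by generate_srt follows the standard SRT pattern: a counter line, a timing line in HH:MM:SS,mmm --> HH:MM:SS,mmm form produced by format_timestamp, the subtitle text, then a blank line.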