oneword-ai 0.1.0 (oneword_ai-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oneword_ai-0.1.0.dist-info/METADATA +237 -0
- oneword_ai-0.1.0.dist-info/RECORD +15 -0
- oneword_ai-0.1.0.dist-info/WHEEL +5 -0
- oneword_ai-0.1.0.dist-info/entry_points.txt +3 -0
- oneword_ai-0.1.0.dist-info/licenses/license.txt +7 -0
- oneword_ai-0.1.0.dist-info/top_level.txt +1 -0
- onewordai/__init__.py +2 -0
- onewordai/api/__init__.py +1 -0
- onewordai/api/main.py +262 -0
- onewordai/cli.py +67 -0
- onewordai/core/__init__.py +4 -0
- onewordai/core/engine.py +368 -0
- onewordai/web/app.js +361 -0
- onewordai/web/index.html +154 -0
- onewordai/web/style.css +485 -0
onewordai/core/engine.py
ADDED
@@ -0,0 +1,368 @@
"""
Core subtitle generation engine using Whisper.
Supports multiple subtitle modes and language detection.
"""
import whisper
import datetime
import torch
from pathlib import Path
from whisper.audio import load_audio
from typing import Optional, Literal

SubtitleMode = Literal["oneword", "twoword", "phrase"]


class SubtitleGenerator:
    """Generate SRT subtitles from video/audio using Whisper."""

    def __init__(self, model_name: str = "medium"):
        """
        Initialize the subtitle generator.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large)
        """
        self.model_name = model_name
        self.model = None
        # Prevent CPU overload
        torch.set_num_threads(4)

    def load_model(self, status_callback=None):
        """Load the Whisper model (OpenAI or Hugging Face)."""
        if self.model is None:
            if "Oriserve" in self.model_name or "/" in self.model_name:
                print(f"📦 Loading Hugging Face model: {self.model_name}...")
                from transformers import pipeline
                import torch
                import os
                from tqdm.auto import tqdm as tqdm_auto

                # Set up custom tqdm callback to capture download progress
                original_tqdm = tqdm_auto

                class ProgressCapture(original_tqdm):
                    def __init__(self, *args, **kwargs):
                        super().__init__(*args, **kwargs)

                    def display(self, msg=None, pos=None):
                        super().display(msg, pos)
                        # Capture progress for status callback
                        if status_callback and self.total:
                            downloaded = self.n
                            total = self.total
                            rate = self.format_dict.get('rate', 0) or 0
                            elapsed = self.format_dict.get('elapsed', 0) or 0

                            # Format like: "68.5M/6.17G [01:29<2:06:23, 805kB/s]"
                            downloaded_str = self.format_sizeof(downloaded, 'B', 1024)
                            total_str = self.format_sizeof(total, 'B', 1024)
                            rate_str = self.format_sizeof(rate, 'B/s', 1024) if rate else "0B/s"

                            elapsed_str = self.format_interval(elapsed)

                            # Calculate remaining time
                            if rate > 0:
                                remaining = (total - downloaded) / rate
                                remaining_str = self.format_interval(remaining)
                            else:
                                remaining_str = "??:??"

                            progress_msg = f"📦 Downloading: {downloaded_str}/{total_str} [{elapsed_str}<{remaining_str}, {rate_str}]"
                            status_callback(progress_msg)

                # Monkey-patch tqdm for this model load
                import tqdm.auto
                original_tqdm_ref = tqdm.auto.tqdm
                tqdm.auto.tqdm = ProgressCapture

                try:
                    if status_callback:
                        status_callback("📦 Checking model files... (Download starting if needed)")

                    device = "cuda" if torch.cuda.is_available() else "cpu"
                    self.model = pipeline(
                        "automatic-speech-recognition",
                        model=self.model_name,
                        device=device
                    )
                    self.model_type = "huggingface"
                finally:
                    # Restore original tqdm
                    tqdm.auto.tqdm = original_tqdm_ref

            else:
                if status_callback:
                    size_estimate = "~1.5GB" if "medium" in self.model_name else "~3GB"
                    status_callback(f"📦 Downloading Model ({size_estimate}, 5-15 min) - One-time only!")

                print(f"📦 Loading OpenAI Whisper model: {self.model_name}...")
                self.model = whisper.load_model(self.model_name)
                self.model_type = "openai"

        if status_callback:
            status_callback("✅ Model Ready! Transcribing...")

    def transcribe(
        self,
        file_path: str,
        language: Optional[str] = None,
        progress_callback=None,
        status_callback=None
    ) -> dict:
        """
        Transcribe audio/video file.

        Args:
            file_path: Path to audio/video file
            language: Language code (hi, en, ur, es) or None for auto-detect
            progress_callback: Optional callback function for progress updates
            status_callback: Optional callback for status text updates
        """
        self.load_model(status_callback)

        print("🎵 Reading audio...")
        audio = load_audio(str(file_path))
        audio_duration = len(audio) / 16000
        print(f"⏱ Audio Duration: {audio_duration:.2f} seconds")

        print(f"\n🧠 Transcribing started ({self.model_type})...\n")

        # Threaded transcription to support progress callback
        if progress_callback:
            import threading
            import time

            result_container = {}
            exception_container = {}

            # Transcription function wrapper
            def transcribe_task():
                try:
                    if self.model_type == "huggingface":
                        # Hugging Face pipeline
                        generate_kwargs = {"language": language} if language else {}
                        # For Hindi2Hinglish, language might need to be 'hi' or auto

                        out = self.model(
                            str(file_path),
                            return_timestamps="word",
                            generate_kwargs=generate_kwargs
                        )

                        # Normalize output to match OpenAI structure
                        # Transformers returns: {'text': '...', 'chunks': [{'text': ' word', 'timestamp': (start, end)}, ...]}
                        chunks = out.get("chunks", [])
                        words = []
                        for chunk in chunks:
                            # Timestamp is a tuple (start, end)
                            ts = chunk.get("timestamp")
                            if ts:
                                start, end = ts
                                words.append({
                                    "word": chunk["text"].strip(),
                                    "start": start,
                                    "end": end
                                })

                        # Create a single segment with all words
                        segment = {
                            "text": out["text"],
                            "start": words[0]["start"] if words else 0,
                            "end": words[-1]["end"] if words else 0,
                            "words": words
                        }

                        result_container['result'] = {
                            "text": out["text"],
                            "segments": [segment]
                        }

                    else:
                        # OpenAI Whisper
                        transcribe_options = {"word_timestamps": True, "verbose": False}
                        if language:
                            transcribe_options["language"] = language

                        result_container['result'] = self.model.transcribe(str(file_path), **transcribe_options)

                except Exception as e:
                    exception_container['error'] = e

            # Start background thread
            thread = threading.Thread(target=transcribe_task)
            thread.start()

            # Simulate progress
            # Estimate speed: HF models can be slower or faster depending on implementation
            # Generic estimate logic
            speed_mult = 4.0 if self.model_type == "huggingface" else 1.5
            if self.model_name in ['tiny', 'base']:
                speed_mult = 1.0

            estimated_time = audio_duration * speed_mult

            start_time = time.time()
            while thread.is_alive():
                elapsed = time.time() - start_time
                progress = min(95, (elapsed / estimated_time) * 100)
                progress_callback(progress)
                time.sleep(0.5)

            thread.join()

            if 'error' in exception_container:
                raise exception_container['error']

            progress_callback(100)
            return result_container['result']

        else:
            # Sync execution (CLI usage mostly)
            if self.model_type == "huggingface":
                out = self.model(str(file_path), return_timestamps="word")
                chunks = out.get("chunks", [])
                words = []
                for chunk in chunks:
                    ts = chunk.get("timestamp")
                    if ts:
                        words.append({
                            "word": chunk["text"].strip(),
                            "start": ts[0],
                            "end": ts[1]
                        })
                return {
                    "text": out["text"],
                    "segments": [{
                        "text": out["text"],
                        "start": words[0]["start"] if words else 0,
                        "end": words[-1]["end"] if words else 0,
                        "words": words
                    }]
                }
            else:
                transcribe_options = {"word_timestamps": True, "verbose": False}
                if language:
                    transcribe_options["language"] = language
                return self.model.transcribe(str(file_path), **transcribe_options)

    # ... (format_timestamp and generate_srt remain unchanged) ...

    def process(
        self,
        input_path: str,
        output_path: Optional[str] = None,
        language: Optional[str] = None,
        mode: SubtitleMode = "oneword",
        progress_callback=None,
        status_callback=None
    ) -> str:
        """
        Full processing pipeline: transcribe and generate SRT.

        Args:
            input_path: Path to input video/audio
            output_path: Path for output SRT (auto-generated if None)
            language: Language code or None for auto-detect
            mode: Subtitle mode
            progress_callback: Optional progress callback
            status_callback: Optional callback for status text updates

        Returns:
            Path to generated SRT file
        """
        video_path = Path(input_path)
        if not video_path.exists():
            raise FileNotFoundError(f"File {input_path} not found")

        # Auto-generate output path
        if output_path is None:
            output_path = video_path.parent / f"{video_path.stem}_{mode}_subs.srt"

        # Transcribe
        result = self.transcribe(input_path, language, progress_callback, status_callback)

        # Write the SRT and return its path, as documented above
        self.generate_srt(result, str(output_path), mode)
        return str(output_path)

    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Convert seconds to SRT timestamp format."""
        td = datetime.timedelta(seconds=seconds)
        total_seconds = int(td.total_seconds())
        hours = total_seconds // 3600
        minutes = (total_seconds % 3600) // 60
        secs = total_seconds % 60
        millis = int(td.microseconds / 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def generate_srt(
        self,
        result: dict,
        output_path: str,
        mode: SubtitleMode = "oneword"
    ):
        """
        Generate SRT file from transcription result.

        Args:
            result: Whisper transcription result
            output_path: Path to save SRT file
            mode: Subtitle mode - oneword, twoword, or phrase
        """
        print("✍ Writing subtitles...")

        with open(output_path, "w", encoding="utf-8") as f:
            counter = 1

            if mode == "oneword":
                # One word per subtitle
                for segment in result["segments"]:
                    for word_data in segment["words"]:
                        word = word_data["word"].strip().replace(",", "")
                        if not word:
                            continue

                        start = word_data["start"]
                        end = word_data["end"]

                        f.write(f"{counter}\n")
                        f.write(f"{self.format_timestamp(start)} --> {self.format_timestamp(end)}\n")
                        f.write(f"{word}\n\n")
                        counter += 1

            elif mode == "twoword":
                # Two words per subtitle (punch effect)
                for segment in result["segments"]:
                    words = segment["words"]
                    i = 0
                    while i < len(words):
                        # Get up to 2 words
                        word_group = words[i:i+2]
                        text = " ".join([w["word"].strip().replace(",", "") for w in word_group if w["word"].strip()])

                        if not text:
                            i += 2
                            continue

                        start = word_group[0]["start"]
                        end = word_group[-1]["end"]

                        f.write(f"{counter}\n")
                        f.write(f"{self.format_timestamp(start)} --> {self.format_timestamp(end)}\n")
                        f.write(f"{text}\n\n")
                        counter += 1
                        i += 2

            elif mode == "phrase":
                # Full segment text (phrase mode)
                for segment in result["segments"]:
                    text = segment["text"].strip()
                    if not text:
                        continue

                    start = segment["start"]
                    end = segment["end"]

                    f.write(f"{counter}\n")
                    f.write(f"{self.format_timestamp(start)} --> {self.format_timestamp(end)}\n")
                    f.write(f"{text}\n\n")
                    counter += 1

        print(f"✅ Success! Subtitles saved to: {output_path}")
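
Usage note: the following is a minimal sketch of how this engine might be driven from Python. The SubtitleGenerator class and the process() signature come from the file above; the script name, the sample input path, and the SRT excerpt in the comments are illustrative assumptions (the package also ships onewordai/cli.py and an API, not shown in this section).

# example_usage.py - illustrative only; file names and paths are assumptions
from onewordai.core.engine import SubtitleGenerator

# Default model is "medium"; any model ID containing "/" is routed through the
# Hugging Face transformers branch of load_model().
generator = SubtitleGenerator(model_name="medium")

srt_path = generator.process(
    input_path="lecture.mp4",   # assumed sample input file
    language=None,              # None lets Whisper auto-detect the language
    mode="oneword",             # or "twoword" / "phrase"
    status_callback=print,      # status messages go to stdout
)
print(f"Subtitles written to {srt_path}")

# In "oneword" mode each SRT cue holds a single word, e.g. (timestamps illustrative):
#
# 1
# 00:00:00,320 --> 00:00:00,780
# Hello
#
# 2
# 00:00:00,780 --> 00:00:01,150
# everyone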