karaoke-gen 0.57.0__py3-none-any.whl → 0.71.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. karaoke_gen/audio_fetcher.py +461 -0
  2. karaoke_gen/audio_processor.py +407 -30
  3. karaoke_gen/config.py +62 -113
  4. karaoke_gen/file_handler.py +32 -59
  5. karaoke_gen/karaoke_finalise/karaoke_finalise.py +148 -67
  6. karaoke_gen/karaoke_gen.py +270 -61
  7. karaoke_gen/lyrics_processor.py +13 -1
  8. karaoke_gen/metadata.py +78 -73
  9. karaoke_gen/pipeline/__init__.py +87 -0
  10. karaoke_gen/pipeline/base.py +215 -0
  11. karaoke_gen/pipeline/context.py +230 -0
  12. karaoke_gen/pipeline/executors/__init__.py +21 -0
  13. karaoke_gen/pipeline/executors/local.py +159 -0
  14. karaoke_gen/pipeline/executors/remote.py +257 -0
  15. karaoke_gen/pipeline/stages/__init__.py +27 -0
  16. karaoke_gen/pipeline/stages/finalize.py +202 -0
  17. karaoke_gen/pipeline/stages/render.py +165 -0
  18. karaoke_gen/pipeline/stages/screens.py +139 -0
  19. karaoke_gen/pipeline/stages/separation.py +191 -0
  20. karaoke_gen/pipeline/stages/transcription.py +191 -0
  21. karaoke_gen/style_loader.py +531 -0
  22. karaoke_gen/utils/bulk_cli.py +6 -0
  23. karaoke_gen/utils/cli_args.py +424 -0
  24. karaoke_gen/utils/gen_cli.py +26 -261
  25. karaoke_gen/utils/remote_cli.py +1965 -0
  26. karaoke_gen/video_background_processor.py +351 -0
  27. karaoke_gen-0.71.27.dist-info/METADATA +610 -0
  28. karaoke_gen-0.71.27.dist-info/RECORD +275 -0
  29. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.27.dist-info}/WHEEL +1 -1
  30. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.27.dist-info}/entry_points.txt +1 -0
  31. lyrics_transcriber/__init__.py +10 -0
  32. lyrics_transcriber/cli/__init__.py +0 -0
  33. lyrics_transcriber/cli/cli_main.py +285 -0
  34. lyrics_transcriber/core/__init__.py +0 -0
  35. lyrics_transcriber/core/config.py +50 -0
  36. lyrics_transcriber/core/controller.py +520 -0
  37. lyrics_transcriber/correction/__init__.py +0 -0
  38. lyrics_transcriber/correction/agentic/__init__.py +9 -0
  39. lyrics_transcriber/correction/agentic/adapter.py +71 -0
  40. lyrics_transcriber/correction/agentic/agent.py +313 -0
  41. lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
  42. lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
  43. lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
  44. lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
  45. lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
  46. lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
  47. lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
  48. lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
  49. lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
  50. lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
  51. lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
  52. lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
  53. lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
  54. lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
  55. lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
  56. lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
  57. lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
  58. lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
  59. lyrics_transcriber/correction/agentic/models/enums.py +38 -0
  60. lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
  61. lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
  62. lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
  63. lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
  64. lyrics_transcriber/correction/agentic/models/utils.py +19 -0
  65. lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
  66. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
  67. lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
  68. lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
  69. lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
  70. lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
  71. lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
  72. lyrics_transcriber/correction/agentic/providers/base.py +36 -0
  73. lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
  74. lyrics_transcriber/correction/agentic/providers/config.py +73 -0
  75. lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
  76. lyrics_transcriber/correction/agentic/providers/health.py +28 -0
  77. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
  78. lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
  79. lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
  80. lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
  81. lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
  82. lyrics_transcriber/correction/agentic/router.py +35 -0
  83. lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
  84. lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
  85. lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
  86. lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
  87. lyrics_transcriber/correction/anchor_sequence.py +1043 -0
  88. lyrics_transcriber/correction/corrector.py +760 -0
  89. lyrics_transcriber/correction/feedback/__init__.py +2 -0
  90. lyrics_transcriber/correction/feedback/schemas.py +107 -0
  91. lyrics_transcriber/correction/feedback/store.py +236 -0
  92. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  93. lyrics_transcriber/correction/handlers/base.py +52 -0
  94. lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
  95. lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
  96. lyrics_transcriber/correction/handlers/llm.py +293 -0
  97. lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
  98. lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
  99. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
  100. lyrics_transcriber/correction/handlers/repeat.py +88 -0
  101. lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
  102. lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
  103. lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
  104. lyrics_transcriber/correction/handlers/word_operations.py +187 -0
  105. lyrics_transcriber/correction/operations.py +352 -0
  106. lyrics_transcriber/correction/phrase_analyzer.py +435 -0
  107. lyrics_transcriber/correction/text_utils.py +30 -0
  108. lyrics_transcriber/frontend/.gitignore +23 -0
  109. lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
  110. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  111. lyrics_transcriber/frontend/README.md +50 -0
  112. lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
  113. lyrics_transcriber/frontend/__init__.py +25 -0
  114. lyrics_transcriber/frontend/eslint.config.js +28 -0
  115. lyrics_transcriber/frontend/index.html +18 -0
  116. lyrics_transcriber/frontend/package.json +42 -0
  117. lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
  118. lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
  119. lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
  120. lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
  121. lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
  122. lyrics_transcriber/frontend/public/favicon.ico +0 -0
  123. lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
  124. lyrics_transcriber/frontend/src/App.tsx +212 -0
  125. lyrics_transcriber/frontend/src/api.ts +239 -0
  126. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
  127. lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
  128. lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
  129. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
  130. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
  131. lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
  132. lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
  133. lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
  134. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
  135. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
  136. lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
  137. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
  138. lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
  139. lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
  140. lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
  141. lyrics_transcriber/frontend/src/components/Header.tsx +387 -0
  142. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1373 -0
  143. lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
  144. lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
  145. lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
  146. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
  147. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
  148. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +688 -0
  149. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
  150. lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
  151. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
  152. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
  153. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
  154. lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
  155. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
  156. lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
  157. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
  158. lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
  159. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
  160. lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
  161. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  162. lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
  163. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
  164. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  165. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
  166. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
  167. lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
  168. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  169. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
  170. lyrics_transcriber/frontend/src/main.tsx +17 -0
  171. lyrics_transcriber/frontend/src/theme.ts +177 -0
  172. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  173. lyrics_transcriber/frontend/src/types.js +2 -0
  174. lyrics_transcriber/frontend/src/types.ts +199 -0
  175. lyrics_transcriber/frontend/src/validation.ts +132 -0
  176. lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
  177. lyrics_transcriber/frontend/tsconfig.app.json +26 -0
  178. lyrics_transcriber/frontend/tsconfig.json +25 -0
  179. lyrics_transcriber/frontend/tsconfig.node.json +23 -0
  180. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
  181. lyrics_transcriber/frontend/update_version.js +11 -0
  182. lyrics_transcriber/frontend/vite.config.d.ts +2 -0
  183. lyrics_transcriber/frontend/vite.config.js +10 -0
  184. lyrics_transcriber/frontend/vite.config.ts +11 -0
  185. lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
  186. lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
  187. lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
  188. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js +42039 -0
  189. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +1 -0
  190. lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
  191. lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
  192. lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
  193. lyrics_transcriber/frontend/web_assets/index.html +18 -0
  194. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
  195. lyrics_transcriber/frontend/yarn.lock +3752 -0
  196. lyrics_transcriber/lyrics/__init__.py +0 -0
  197. lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
  198. lyrics_transcriber/lyrics/file_provider.py +95 -0
  199. lyrics_transcriber/lyrics/genius.py +384 -0
  200. lyrics_transcriber/lyrics/lrclib.py +231 -0
  201. lyrics_transcriber/lyrics/musixmatch.py +156 -0
  202. lyrics_transcriber/lyrics/spotify.py +290 -0
  203. lyrics_transcriber/lyrics/user_input_provider.py +44 -0
  204. lyrics_transcriber/output/__init__.py +0 -0
  205. lyrics_transcriber/output/ass/__init__.py +21 -0
  206. lyrics_transcriber/output/ass/ass.py +2088 -0
  207. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  208. lyrics_transcriber/output/ass/config.py +180 -0
  209. lyrics_transcriber/output/ass/constants.py +23 -0
  210. lyrics_transcriber/output/ass/event.py +94 -0
  211. lyrics_transcriber/output/ass/formatters.py +132 -0
  212. lyrics_transcriber/output/ass/lyrics_line.py +265 -0
  213. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  214. lyrics_transcriber/output/ass/section_detector.py +89 -0
  215. lyrics_transcriber/output/ass/section_screen.py +106 -0
  216. lyrics_transcriber/output/ass/style.py +187 -0
  217. lyrics_transcriber/output/cdg.py +619 -0
  218. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  219. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  220. lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
  221. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  222. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  223. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  224. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  225. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  226. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  227. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  228. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  229. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  230. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  231. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  232. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  233. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  234. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  235. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  236. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  237. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  238. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  239. lyrics_transcriber/output/countdown_processor.py +267 -0
  240. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  241. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  242. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  243. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  244. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  245. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  246. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  247. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  248. lyrics_transcriber/output/generator.py +257 -0
  249. lyrics_transcriber/output/lrc_to_cdg.py +61 -0
  250. lyrics_transcriber/output/lyrics_file.py +102 -0
  251. lyrics_transcriber/output/plain_text.py +96 -0
  252. lyrics_transcriber/output/segment_resizer.py +431 -0
  253. lyrics_transcriber/output/subtitles.py +397 -0
  254. lyrics_transcriber/output/video.py +544 -0
  255. lyrics_transcriber/review/__init__.py +0 -0
  256. lyrics_transcriber/review/server.py +676 -0
  257. lyrics_transcriber/storage/__init__.py +0 -0
  258. lyrics_transcriber/storage/dropbox.py +225 -0
  259. lyrics_transcriber/transcribers/__init__.py +0 -0
  260. lyrics_transcriber/transcribers/audioshake.py +290 -0
  261. lyrics_transcriber/transcribers/base_transcriber.py +157 -0
  262. lyrics_transcriber/transcribers/whisper.py +330 -0
  263. lyrics_transcriber/types.py +648 -0
  264. lyrics_transcriber/utils/__init__.py +0 -0
  265. lyrics_transcriber/utils/word_utils.py +27 -0
  266. karaoke_gen-0.57.0.dist-info/METADATA +0 -167
  267. karaoke_gen-0.57.0.dist-info/RECORD +0 -23
  268. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.27.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,330 @@
1
+ #! /usr/bin/env python3
2
+ from dataclasses import dataclass
3
+ import os
4
+ import json
5
+ import requests
6
+ import hashlib
7
+ import tempfile
8
+ import time
9
+ from typing import Optional, Dict, Any, Protocol, Union
10
+ from pathlib import Path
11
+ from pydub import AudioSegment
12
+ from lyrics_transcriber.types import TranscriptionData, LyricsSegment, Word
13
+ from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber, TranscriptionError
14
+ from lyrics_transcriber.utils.word_utils import WordUtils
15
+
16
+
17
@dataclass
class WhisperConfig:
    """Settings for the RunPod-hosted Whisper transcription service.

    All credentials default to ``None`` so the object can be built before
    the values are known; ``RunPodWhisperAPI`` refuses a config whose
    ``runpod_api_key`` or ``endpoint_id`` is missing.
    """

    # RunPod credentials: the key is sent as a Bearer token and the endpoint
    # ID is interpolated into the RunPod API URLs.
    runpod_api_key: Optional[str] = None
    endpoint_id: Optional[str] = None

    # Dropbox OAuth credentials (app key, app secret, refresh token).
    dropbox_app_key: Optional[str] = None
    dropbox_app_secret: Optional[str] = None
    dropbox_refresh_token: Optional[str] = None

    # How long to keep polling a job before cancelling it and giving up.
    timeout_minutes: int = 10
27
+
28
+
29
class FileStorageProtocol(Protocol):
    """Structural interface for the storage backend used to host audio files.

    Any object exposing these three methods can be injected as the
    transcriber's storage client.
    """

    def file_exists(self, path: str) -> bool:  # pragma: no cover
        """Report whether something is already stored at ``path``."""
        ...

    def upload_with_retry(self, file: Any, path: str) -> None:  # pragma: no cover
        """Store the contents of ``file`` at ``path``."""
        ...

    def create_or_get_shared_link(self, path: str) -> str:  # pragma: no cover
        """Return a shareable URL for the object stored at ``path``."""
        ...
35
+
36
+
37
class RunPodWhisperAPI:
    """Thin client for the RunPod serverless endpoint that runs Whisper.

    Wraps the ``run``, ``status`` and ``cancel`` REST calls and provides a
    polling helper that blocks until a job reaches a terminal state or the
    configured timeout elapses.
    """

    # Upper bound for a single HTTP call. Without it a stalled connection
    # would block forever and the polling loop's own timeout check would
    # never get a chance to run.
    _REQUEST_TIMEOUT_SECONDS = 60

    # Pause between two consecutive status polls.
    _POLL_INTERVAL_SECONDS = 5

    # Job states after which the job will never reach COMPLETED; polling any
    # further would only burn time until the local timeout fires.
    _FAILED_STATUSES = frozenset({"FAILED", "CANCELLED", "TIMED_OUT"})

    def __init__(self, config: "WhisperConfig", logger):
        """Store the config and logger, then validate the credentials.

        Raises:
            ValueError: If the API key or the endpoint ID is missing.
        """
        self.config = config
        self.logger = logger
        self._validate_config()

    def _validate_config(self) -> None:
        """Ensure both the RunPod API key and the endpoint ID are set.

        Raises:
            ValueError: If either value is missing or empty.
        """
        if not self.config.runpod_api_key or not self.config.endpoint_id:
            raise ValueError("RunPod API key and endpoint ID must be provided")

    def submit_job(self, audio_url: str) -> str:
        """Submit a transcription job for ``audio_url`` and return its job ID.

        Raises:
            TranscriptionError: If the response body is not valid JSON.
            requests.RequestException: On transport errors, timeouts, or a
                non-2xx status.
        """
        run_url = f"https://api.runpod.ai/v2/{self.config.endpoint_id}/run"
        headers = {"Authorization": f"Bearer {self.config.runpod_api_key}"}

        payload = {
            "input": {
                "audio": audio_url,
                "word_timestamps": True,
                "model": "medium",
                "temperature": 0.2,
                "best_of": 5,
                "compression_ratio_threshold": 2.8,
                "no_speech_threshold": 1,
                "condition_on_previous_text": True,
                "enable_vad": True,
            }
        }

        self.logger.info("Submitting transcription job...")
        response = requests.post(
            run_url,
            json=payload,
            headers=headers,
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )

        self.logger.debug(f"Response status code: {response.status_code}")

        # Parse (and log) the body before checking the status code so that an
        # error response still leaves its payload in the debug log.
        try:
            response_json = response.json()
            self.logger.debug(f"Response content: {json.dumps(response_json, indent=2)}")
        except ValueError:
            self.logger.debug(f"Raw response content: {response.text}")
            # Re-raise if we can't parse the response at all
            raise TranscriptionError(f"Invalid JSON response: {response.text}")

        response.raise_for_status()
        return response_json["id"]

    def get_job_status(self, job_id: str) -> Dict[str, Any]:
        """Fetch and return the decoded status document of ``job_id``.

        Raises:
            requests.RequestException: On transport errors, timeouts, or a
                non-2xx status.
        """
        status_url = f"https://api.runpod.ai/v2/{self.config.endpoint_id}/status/{job_id}"
        headers = {"Authorization": f"Bearer {self.config.runpod_api_key}"}

        response = requests.get(
            status_url,
            headers=headers,
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.json()

    def cancel_job(self, job_id: str) -> None:
        """Ask RunPod to cancel ``job_id``.

        Cancellation is best-effort: any failure is logged as a warning and
        swallowed so it cannot mask the error that triggered the cancel.
        """
        cancel_url = f"https://api.runpod.ai/v2/{self.config.endpoint_id}/cancel/{job_id}"
        headers = {"Authorization": f"Bearer {self.config.runpod_api_key}"}

        try:
            response = requests.post(
                cancel_url,
                headers=headers,
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
        except Exception as e:
            self.logger.warning(f"Failed to cancel job {job_id}: {e}")

    def wait_for_job_result(self, job_id: str) -> Dict[str, Any]:
        """Poll ``job_id`` until it finishes and return its ``output`` payload.

        Raises:
            TranscriptionError: If the job ends in a failure state (FAILED,
                CANCELLED or TIMED_OUT), or if it has not completed within
                ``config.timeout_minutes`` (the job is then cancelled first).
        """
        self.logger.info(f"Getting job result for job {job_id}")

        start_time = time.time()
        last_status_log = start_time
        timeout_seconds = self.config.timeout_minutes * 60

        while True:
            current_time = time.time()
            elapsed_time = current_time - start_time

            if elapsed_time > timeout_seconds:
                self.cancel_job(job_id)
                raise TranscriptionError(f"Transcription timed out after {self.config.timeout_minutes} minutes")

            # Log status periodically (at most once a minute)
            if current_time - last_status_log >= 60:
                self.logger.info(f"Still waiting for transcription... Elapsed time: {int(elapsed_time/60)} minutes")
                last_status_log = current_time

            status_data = self.get_job_status(job_id)
            status = status_data["status"]

            if status == "COMPLETED":
                return status_data["output"]

            if status in self._FAILED_STATUSES:
                # Keep the historical message for FAILED; name the state for
                # the other terminal statuses, which usually carry no "error".
                fallback = "Unknown error" if status == "FAILED" else f"Job ended with status {status}"
                error_msg = status_data.get("error", fallback)
                self.logger.error(f"Job failed with error: {error_msg}")
                raise TranscriptionError(f"Transcription failed: {error_msg}")

            time.sleep(self._POLL_INTERVAL_SECONDS)
137
+
138
+
139
class AudioProcessor:
    """Local audio helpers: content hashing and WAV-to-FLAC conversion."""

    # Read size used while hashing; large enough to keep syscall overhead low.
    _HASH_CHUNK_SIZE = 65536

    def __init__(self, logger):
        """Keep a reference to the logger used for progress messages."""
        self.logger = logger

    def get_file_md5(self, filepath: str) -> str:
        """Return the hex MD5 digest of the file at ``filepath``.

        The file is read in chunks so large audio files are never loaded
        into memory at once.
        """
        # The digest is only a content fingerprint, not a security measure;
        # saying so keeps it usable on FIPS-restricted Python builds.
        md5_hash = hashlib.md5(usedforsecurity=False)
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
                md5_hash.update(chunk)
        return md5_hash.hexdigest()

    def convert_to_flac(self, filepath: str) -> str:
        """Convert a WAV file to a temporary FLAC file for faster upload.

        Returns:
            ``filepath`` itself, unchanged, when it does not end in ``.wav``
            (case-insensitive); otherwise the path of a newly created
            temporary ``.flac`` file that the caller is responsible for
            deleting.
        """
        if not filepath.lower().endswith(".wav"):
            return filepath

        self.logger.info("Converting WAV to FLAC for faster upload...")
        audio = AudioSegment.from_wav(filepath)

        # Only the reserved unique name is needed; close our handle before
        # the exporter opens the same path for writing.
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_flac:
            flac_path = temp_flac.name

        try:
            exported = audio.export(flac_path, format="flac")
        except Exception:
            # delete=False means nothing else will remove the file for us.
            try:
                os.remove(flac_path)
            except OSError:
                pass
            raise

        # The exporter may hand back an open file object; release it if so.
        close = getattr(exported, "close", None)
        if callable(close):
            close()

        return flac_path
166
+
167
+
168
class WhisperTranscriber(BaseTranscriber):
    """Transcription service using Whisper API via RunPod.

    Local audio is hashed, converted to FLAC when it is a WAV file, uploaded
    to the storage backend, and the resulting shared link is handed to the
    RunPod endpoint. Remote ``http(s)`` URLs are passed through untouched.
    """

    def __init__(
        self,
        cache_dir: Union[str, Path],
        config: Optional[WhisperConfig] = None,
        logger: Optional[Any] = None,
        runpod_client: Optional[RunPodWhisperAPI] = None,
        storage_client: Optional[FileStorageProtocol] = None,
        audio_processor: Optional[AudioProcessor] = None,
    ):
        """Initialize Whisper transcriber.

        Args:
            cache_dir: Forwarded to ``BaseTranscriber``.
            config: Service settings; when omitted they are read from the
                ``RUNPOD_API_KEY``, ``WHISPER_RUNPOD_ID`` and
                ``WHISPER_DROPBOX_*`` environment variables.
            logger: Forwarded to ``BaseTranscriber``.
            runpod_client: Injected RunPod client; built from ``config``
                when omitted.
            storage_client: Injected storage backend; a Dropbox handler is
                created when omitted.
            audio_processor: Injected audio helper; a default
                ``AudioProcessor`` is created when omitted.
        """
        super().__init__(cache_dir=cache_dir, logger=logger)

        # Initialize configuration
        self.config = config or WhisperConfig(
            runpod_api_key=os.getenv("RUNPOD_API_KEY"),
            endpoint_id=os.getenv("WHISPER_RUNPOD_ID"),
            dropbox_app_key=os.getenv("WHISPER_DROPBOX_APP_KEY"),
            dropbox_app_secret=os.getenv("WHISPER_DROPBOX_APP_SECRET"),
            dropbox_refresh_token=os.getenv("WHISPER_DROPBOX_REFRESH_TOKEN"),
        )

        # Initialize components (with dependency injection)
        self.runpod = runpod_client or RunPodWhisperAPI(self.config, self.logger)
        self.storage = storage_client or self._initialize_storage()
        self.audio_processor = audio_processor or AudioProcessor(self.logger)

    def _initialize_storage(self) -> FileStorageProtocol:
        """Build the default Dropbox-backed storage client.

        The Dropbox credentials are read straight from the environment here,
        not from ``self.config``.
        """
        # Imported lazily so the Dropbox module is only loaded when no
        # storage client was injected.
        from lyrics_transcriber.storage.dropbox import DropboxHandler, DropboxConfig

        # Create config using os.getenv directly
        config = DropboxConfig(
            app_key=os.getenv("WHISPER_DROPBOX_APP_KEY"),
            app_secret=os.getenv("WHISPER_DROPBOX_APP_SECRET"),
            refresh_token=os.getenv("WHISPER_DROPBOX_REFRESH_TOKEN"),
        )

        # Deliberately log no credential values, only that setup is happening.
        self.logger.debug("Initializing DropboxHandler with config")
        return DropboxHandler(config=config)

    def get_name(self) -> str:
        """Return the name recorded as the ``source`` of the transcription."""
        return "Whisper"

    def _perform_transcription(self, audio_filepath: str) -> TranscriptionData:
        """Submit the audio for transcription and wait for the result.

        NOTE(review): the value returned is whatever
        ``get_transcription_result`` yields, i.e. the raw response dict, not
        a ``TranscriptionData`` - presumably the base class runs it through
        ``_convert_result_format`` afterwards; confirm against
        ``BaseTranscriber``.
        """
        self.logger.info(f"Starting transcription for {audio_filepath}")

        # Start transcription and get results
        job_id = self.start_transcription(audio_filepath)
        result = self.get_transcription_result(job_id)
        return result

    def start_transcription(self, audio_filepath: str) -> str:
        """Prepare audio and start whisper transcription job.

        Returns:
            The RunPod job ID.

        Raises:
            TranscriptionError: If submitting the job fails for any reason.
        """
        audio_url, temp_filepath = self._prepare_audio_url(audio_filepath)
        try:
            return self.runpod.submit_job(audio_url)
        except Exception as e:
            raise TranscriptionError(f"Failed to submit job: {str(e)}") from e
        finally:
            # The job reads the uploaded copy, so the local conversion output
            # is disposable whether or not the submit succeeded. Never touch
            # the caller's own input file.
            if temp_filepath and temp_filepath != audio_filepath:
                self._cleanup_temporary_files(temp_filepath)

    def _prepare_audio_url(self, audio_filepath: str) -> tuple[str, Optional[str]]:
        """Process audio file and return URL for API and path to any temporary files.

        Returns:
            ``(url, temp_path)`` where ``temp_path`` is the converted FLAC
            file, or ``None`` when no temporary file was created (remote URL
            input, or a local file that needed no conversion).
        """
        if audio_filepath.startswith(("http://", "https://")):
            return audio_filepath, None

        file_hash = self.audio_processor.get_file_md5(audio_filepath)
        upload_filepath = self.audio_processor.convert_to_flac(audio_filepath)

        # convert_to_flac hands back the input path unchanged when nothing was
        # converted; only a different path is a temporary file we own.
        temp_filepath = upload_filepath if upload_filepath != audio_filepath else None

        # The content hash in the remote name lets identical audio reuse a
        # previous upload.
        dropbox_path = f"/transcription_temp/{file_hash}{os.path.splitext(upload_filepath)[1]}"
        try:
            url = self._upload_and_get_link(upload_filepath, dropbox_path)
        except Exception:
            # Don't leave the converted file behind when the upload fails.
            self._cleanup_temporary_files(temp_filepath)
            raise
        return url, temp_filepath

    def get_transcription_result(self, job_id: str) -> Dict[str, Any]:
        """Poll for whisper job completion and return raw results.

        The job ID is stored in the returned dict under ``"job_id"``.
        """
        raw_data = self.runpod.wait_for_job_result(job_id)

        # Add job_id to raw data for later use
        raw_data["job_id"] = job_id

        return raw_data

    def _convert_result_format(self, raw_data: Dict[str, Any]) -> TranscriptionData:
        """Convert Whisper API response to standard format.

        Raises:
            TranscriptionError: If ``segments`` or ``transcription`` is
                missing from ``raw_data``.
        """
        self._validate_response(raw_data)

        job_id = raw_data.get("job_id")

        # First collect all words from word_timestamps
        words = [
            Word(
                id=WordUtils.generate_id(),  # Generate unique ID for each word
                text=word["word"].strip(),
                start_time=word["start"],
                end_time=word["end"],
                confidence=word.get("probability"),  # Only set if provided
            )
            for word in raw_data.get("word_timestamps", [])
        ]

        # Then create segments, using the words that fall within each segment's
        # time range (start inclusive, end exclusive)
        segments = []
        for seg in raw_data["segments"]:
            segment_words = [word for word in words if seg["start"] <= word.start_time < seg["end"]]
            segments.append(
                LyricsSegment(
                    id=WordUtils.generate_id(),  # Generate unique ID for each segment
                    text=seg["text"].strip(),
                    words=segment_words,
                    start_time=seg["start"],
                    end_time=seg["end"],
                )
            )

        return TranscriptionData(
            segments=segments,
            words=words,
            text=raw_data["transcription"],
            source=self.get_name(),
            metadata={
                "language": raw_data.get("detected_language", "en"),
                "model": raw_data.get("model"),
                "job_id": job_id,
            },
        )

    def _upload_and_get_link(self, filepath: str, dropbox_path: str) -> str:
        """Upload file to storage and return shared link.

        The upload is skipped when something already exists at
        ``dropbox_path``.
        """
        if not self.storage.file_exists(dropbox_path):
            self.logger.info("Uploading file to storage...")
            with open(filepath, "rb") as f:
                self.storage.upload_with_retry(f, dropbox_path)
        else:
            self.logger.info("File already exists in storage, skipping upload...")

        audio_url = self.storage.create_or_get_shared_link(dropbox_path)
        self.logger.debug(f"Using shared link: {audio_url}")
        return audio_url

    def _cleanup_temporary_files(self, *filepaths: Optional[str]) -> None:
        """Clean up any temporary files that were created during transcription.

        ``None`` entries and paths that no longer exist are skipped; removal
        failures are logged as warnings and never raised.
        """
        for filepath in filepaths:
            if filepath and os.path.exists(filepath):
                try:
                    os.remove(filepath)
                    self.logger.debug(f"Cleaned up temporary file: {filepath}")
                except Exception as e:
                    self.logger.warning(f"Failed to clean up temporary file {filepath}: {e}")

    def _validate_response(self, raw_data: Dict[str, Any]) -> None:
        """Validate the response contains required fields.

        Raises:
            TranscriptionError: If ``segments`` or ``transcription`` is absent.
        """
        if "segments" not in raw_data:
            raise TranscriptionError("Response missing required 'segments' field")
        if "transcription" not in raw_data:
            raise TranscriptionError("Response missing required 'transcription' field")