voice-mode 3.34.3-py3-none-any.whl → 4.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. voice_mode/__version__.py +1 -1
  2. voice_mode/cli.py +8 -0
  3. voice_mode/cli_commands/pronounce_commands.py +223 -0
  4. voice_mode/cli_commands/transcribe.py +141 -0
  5. voice_mode/config.py +139 -37
  6. voice_mode/data/default_pronunciation.yaml +268 -0
  7. voice_mode/frontend/.next/BUILD_ID +1 -0
  8. voice_mode/frontend/.next/app-build-manifest.json +28 -0
  9. voice_mode/frontend/.next/app-path-routes-manifest.json +1 -0
  10. voice_mode/frontend/.next/build-manifest.json +32 -0
  11. voice_mode/frontend/.next/export-marker.json +1 -0
  12. voice_mode/frontend/.next/images-manifest.json +1 -0
  13. voice_mode/frontend/.next/next-minimal-server.js.nft.json +1 -0
  14. voice_mode/frontend/.next/next-server.js.nft.json +1 -0
  15. voice_mode/frontend/.next/package.json +1 -0
  16. voice_mode/frontend/.next/prerender-manifest.json +1 -0
  17. voice_mode/frontend/.next/react-loadable-manifest.json +1 -0
  18. voice_mode/frontend/.next/required-server-files.json +1 -0
  19. voice_mode/frontend/.next/routes-manifest.json +1 -0
  20. voice_mode/frontend/.next/server/app/_not-found/page.js +1 -0
  21. voice_mode/frontend/.next/server/app/_not-found/page.js.nft.json +1 -0
  22. voice_mode/frontend/.next/server/app/_not-found/page_client-reference-manifest.js +1 -0
  23. voice_mode/frontend/.next/server/app/_not-found.html +1 -0
  24. voice_mode/frontend/.next/server/app/_not-found.meta +6 -0
  25. voice_mode/frontend/.next/server/app/_not-found.rsc +9 -0
  26. voice_mode/frontend/.next/server/app/api/connection-details/route.js +12 -0
  27. voice_mode/frontend/.next/server/app/api/connection-details/route.js.nft.json +1 -0
  28. voice_mode/frontend/.next/server/app/favicon.ico/route.js +12 -0
  29. voice_mode/frontend/.next/server/app/favicon.ico/route.js.nft.json +1 -0
  30. voice_mode/frontend/.next/server/app/favicon.ico.body +0 -0
  31. voice_mode/frontend/.next/server/app/favicon.ico.meta +1 -0
  32. voice_mode/frontend/.next/server/app/index.html +1 -0
  33. voice_mode/frontend/.next/server/app/index.meta +5 -0
  34. voice_mode/frontend/.next/server/app/index.rsc +7 -0
  35. voice_mode/frontend/.next/server/app/page.js +11 -0
  36. voice_mode/frontend/.next/server/app/page.js.nft.json +1 -0
  37. voice_mode/frontend/.next/server/app/page_client-reference-manifest.js +1 -0
  38. voice_mode/frontend/.next/server/app-paths-manifest.json +6 -0
  39. voice_mode/frontend/.next/server/chunks/463.js +1 -0
  40. voice_mode/frontend/.next/server/chunks/682.js +6 -0
  41. voice_mode/frontend/.next/server/chunks/948.js +2 -0
  42. voice_mode/frontend/.next/server/chunks/994.js +2 -0
  43. voice_mode/frontend/.next/server/chunks/font-manifest.json +1 -0
  44. voice_mode/frontend/.next/server/font-manifest.json +1 -0
  45. voice_mode/frontend/.next/server/functions-config-manifest.json +1 -0
  46. voice_mode/frontend/.next/server/interception-route-rewrite-manifest.js +1 -0
  47. voice_mode/frontend/.next/server/middleware-build-manifest.js +1 -0
  48. voice_mode/frontend/.next/server/middleware-manifest.json +6 -0
  49. voice_mode/frontend/.next/server/middleware-react-loadable-manifest.js +1 -0
  50. voice_mode/frontend/.next/server/next-font-manifest.js +1 -0
  51. voice_mode/frontend/.next/server/next-font-manifest.json +1 -0
  52. voice_mode/frontend/.next/server/pages/404.html +1 -0
  53. voice_mode/frontend/.next/server/pages/500.html +1 -0
  54. voice_mode/frontend/.next/server/pages/_app.js +1 -0
  55. voice_mode/frontend/.next/server/pages/_app.js.nft.json +1 -0
  56. voice_mode/frontend/.next/server/pages/_document.js +1 -0
  57. voice_mode/frontend/.next/server/pages/_document.js.nft.json +1 -0
  58. voice_mode/frontend/.next/server/pages/_error.js +1 -0
  59. voice_mode/frontend/.next/server/pages/_error.js.nft.json +1 -0
  60. voice_mode/frontend/.next/server/pages-manifest.json +1 -0
  61. voice_mode/frontend/.next/server/server-reference-manifest.js +1 -0
  62. voice_mode/frontend/.next/server/server-reference-manifest.json +1 -0
  63. voice_mode/frontend/.next/server/webpack-runtime.js +1 -0
  64. voice_mode/frontend/.next/standalone/.next/BUILD_ID +1 -0
  65. voice_mode/frontend/.next/standalone/.next/app-build-manifest.json +28 -0
  66. voice_mode/frontend/.next/standalone/.next/app-path-routes-manifest.json +1 -0
  67. voice_mode/frontend/.next/standalone/.next/build-manifest.json +32 -0
  68. voice_mode/frontend/.next/standalone/.next/package.json +1 -0
  69. voice_mode/frontend/.next/standalone/.next/prerender-manifest.json +1 -0
  70. voice_mode/frontend/.next/standalone/.next/react-loadable-manifest.json +1 -0
  71. voice_mode/frontend/.next/standalone/.next/required-server-files.json +1 -0
  72. voice_mode/frontend/.next/standalone/.next/routes-manifest.json +1 -0
  73. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js +1 -0
  74. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page.js.nft.json +1 -0
  75. voice_mode/frontend/.next/standalone/.next/server/app/_not-found/page_client-reference-manifest.js +1 -0
  76. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.html +1 -0
  77. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.meta +6 -0
  78. voice_mode/frontend/.next/standalone/.next/server/app/_not-found.rsc +9 -0
  79. voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js +12 -0
  80. voice_mode/frontend/.next/standalone/.next/server/app/api/connection-details/route.js.nft.json +1 -0
  81. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js +12 -0
  82. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico/route.js.nft.json +1 -0
  83. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico.body +0 -0
  84. voice_mode/frontend/.next/standalone/.next/server/app/favicon.ico.meta +1 -0
  85. voice_mode/frontend/.next/standalone/.next/server/app/index.html +1 -0
  86. voice_mode/frontend/.next/standalone/.next/server/app/index.meta +5 -0
  87. voice_mode/frontend/.next/standalone/.next/server/app/index.rsc +7 -0
  88. voice_mode/frontend/.next/standalone/.next/server/app/page.js +11 -0
  89. voice_mode/frontend/.next/standalone/.next/server/app/page.js.nft.json +1 -0
  90. voice_mode/frontend/.next/standalone/.next/server/app/page_client-reference-manifest.js +1 -0
  91. voice_mode/frontend/.next/standalone/.next/server/app-paths-manifest.json +6 -0
  92. voice_mode/frontend/.next/standalone/.next/server/chunks/463.js +1 -0
  93. voice_mode/frontend/.next/standalone/.next/server/chunks/682.js +6 -0
  94. voice_mode/frontend/.next/standalone/.next/server/chunks/948.js +2 -0
  95. voice_mode/frontend/.next/standalone/.next/server/chunks/994.js +2 -0
  96. voice_mode/frontend/.next/standalone/.next/server/font-manifest.json +1 -0
  97. voice_mode/frontend/.next/standalone/.next/server/middleware-build-manifest.js +1 -0
  98. voice_mode/frontend/.next/standalone/.next/server/middleware-manifest.json +6 -0
  99. voice_mode/frontend/.next/standalone/.next/server/middleware-react-loadable-manifest.js +1 -0
  100. voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.js +1 -0
  101. voice_mode/frontend/.next/standalone/.next/server/next-font-manifest.json +1 -0
  102. voice_mode/frontend/.next/standalone/.next/server/pages/404.html +1 -0
  103. voice_mode/frontend/.next/standalone/.next/server/pages/500.html +1 -0
  104. voice_mode/frontend/.next/standalone/.next/server/pages/_app.js +1 -0
  105. voice_mode/frontend/.next/standalone/.next/server/pages/_app.js.nft.json +1 -0
  106. voice_mode/frontend/.next/standalone/.next/server/pages/_document.js +1 -0
  107. voice_mode/frontend/.next/standalone/.next/server/pages/_document.js.nft.json +1 -0
  108. voice_mode/frontend/.next/standalone/.next/server/pages/_error.js +1 -0
  109. voice_mode/frontend/.next/standalone/.next/server/pages/_error.js.nft.json +1 -0
  110. voice_mode/frontend/.next/standalone/.next/server/pages-manifest.json +1 -0
  111. voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.js +1 -0
  112. voice_mode/frontend/.next/standalone/.next/server/server-reference-manifest.json +1 -0
  113. voice_mode/frontend/.next/standalone/.next/server/webpack-runtime.js +1 -0
  114. voice_mode/frontend/.next/standalone/package.json +40 -0
  115. voice_mode/frontend/.next/standalone/server.js +38 -0
  116. voice_mode/frontend/.next/static/chunks/117-40bc79a2b97edb21.js +2 -0
  117. voice_mode/frontend/.next/static/chunks/144d3bae-2d5f122b82426d88.js +1 -0
  118. voice_mode/frontend/.next/static/chunks/471-bd4b96a33883dfa2.js +3 -0
  119. voice_mode/frontend/.next/static/chunks/app/_not-found/page-5011050e402ab9c8.js +1 -0
  120. voice_mode/frontend/.next/static/chunks/app/layout-fcb9b9ba5b72c7fc.js +1 -0
  121. voice_mode/frontend/.next/static/chunks/app/page-7c7ec2ad413ace39.js +1 -0
  122. voice_mode/frontend/.next/static/chunks/fd9d1056-af324d327b243cf1.js +1 -0
  123. voice_mode/frontend/.next/static/chunks/framework-f66176bb897dc684.js +1 -0
  124. voice_mode/frontend/.next/static/chunks/main-3163eca598b76a9f.js +1 -0
  125. voice_mode/frontend/.next/static/chunks/main-app-d02bd38ac01adb8a.js +1 -0
  126. voice_mode/frontend/.next/static/chunks/pages/_app-72b849fbd24ac258.js +1 -0
  127. voice_mode/frontend/.next/static/chunks/pages/_error-7ba65e1336b92748.js +1 -0
  128. voice_mode/frontend/.next/static/chunks/polyfills-42372ed130431b0a.js +1 -0
  129. voice_mode/frontend/.next/static/chunks/webpack-0ea9b80f19935b70.js +1 -0
  130. voice_mode/frontend/.next/static/css/a2f49a47752b5010.css +3 -0
  131. voice_mode/frontend/.next/static/media/01099be941da1820-s.woff2 +0 -0
  132. voice_mode/frontend/.next/static/media/39883d31a7792467-s.p.woff2 +0 -0
  133. voice_mode/frontend/.next/static/media/6368404d2e8d66fe-s.woff2 +0 -0
  134. voice_mode/frontend/.next/static/pbDjheefW1LwCua_8mPoZ/_buildManifest.js +1 -0
  135. voice_mode/frontend/.next/static/pbDjheefW1LwCua_8mPoZ/_ssgManifest.js +1 -0
  136. voice_mode/frontend/.next/trace +43 -0
  137. voice_mode/frontend/.next/types/app/api/connection-details/route.ts +343 -0
  138. voice_mode/frontend/.next/types/app/layout.ts +79 -0
  139. voice_mode/frontend/.next/types/app/page.ts +79 -0
  140. voice_mode/frontend/.next/types/package.json +1 -0
  141. voice_mode/frontend/package-lock.json +154 -1
  142. voice_mode/pronounce.py +397 -0
  143. voice_mode/providers.py +7 -8
  144. voice_mode/resources/configuration.py +2 -2
  145. voice_mode/tools/configuration_management.py +106 -5
  146. voice_mode/tools/converse.py +109 -0
  147. voice_mode/tools/pronounce.py +245 -0
  148. voice_mode/tools/transcription/__init__.py +14 -0
  149. voice_mode/tools/transcription/backends.py +287 -0
  150. voice_mode/tools/transcription/core.py +136 -0
  151. voice_mode/tools/transcription/formats.py +144 -0
  152. voice_mode/tools/transcription/types.py +52 -0
  153. {voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/METADATA +5 -2
  154. voice_mode-4.1.0.dist-info/RECORD +259 -0
  155. voice_mode/voice_preferences.py +0 -125
  156. voice_mode-3.34.3.dist-info/RECORD +0 -116
  157. {voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/WHEEL +0 -0
  158. {voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/entry_points.txt +0 -0
voice_mode/tools/transcription/backends.py (new file)
@@ -0,0 +1,287 @@
+ """Backend implementations for transcription."""
+
+ import os
+ import json
+ import subprocess
+ import tempfile
+ from pathlib import Path
+ from typing import Dict, Any, Optional, List
+ import httpx
+
+ from voice_mode.config import OPENAI_API_KEY
+ from .types import TranscriptionResult
+
+
+ async def transcribe_with_openai(
+     audio_path: Path,
+     word_timestamps: bool = False,
+     language: Optional[str] = None,
+     model: str = "whisper-1"
+ ) -> TranscriptionResult:
+     """
+     Transcribe using OpenAI API with optional word-level timestamps.
+     """
+
+     # Import OpenAI client
+     from openai import AsyncOpenAI
+
+     # Get API key from VoiceMode config
+     api_key = OPENAI_API_KEY or os.environ.get("OPENAI_API_KEY")
+
+     if not api_key:
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend="openai",
+             success=False,
+             error="OpenAI API key not configured. Set OPENAI_API_KEY environment variable."
+         )
+
+     # Initialize async client (automatically respects OPENAI_BASE_URL env var)
+     client = AsyncOpenAI(api_key=api_key)
+
+     # Prepare timestamp granularities
+     timestamp_granularities = ["segment"]
+     if word_timestamps:
+         timestamp_granularities.append("word")
+
+     try:
+         # Open and transcribe the audio file
+         with open(audio_path, "rb") as audio_file:
+             transcription = await client.audio.transcriptions.create(
+                 model=model,
+                 file=audio_file,
+                 response_format="verbose_json",
+                 timestamp_granularities=timestamp_granularities,
+                 language=language
+             )
+
+         # Convert response to dictionary
+         result = transcription.model_dump() if hasattr(transcription, 'model_dump') else transcription.dict()
+
+         # Format response
+         formatted = TranscriptionResult(
+             text=result.get("text", ""),
+             language=result.get("language", ""),
+             duration=result.get("duration", 0),
+             segments=[],
+             backend="openai",
+             model=model,
+             success=True
+         )
+
+         # Process segments
+         for segment in result.get("segments", []):
+             seg_data = {
+                 "id": segment.get("id"),
+                 "text": segment.get("text", "").strip(),
+                 "start": segment.get("start", 0),
+                 "end": segment.get("end", 0)
+             }
+             formatted["segments"].append(seg_data)
+
+         # Handle word timestamps - OpenAI returns them at the top level
+         if word_timestamps and "words" in result:
+             formatted["words"] = [
+                 {
+                     "word": w.get("word", ""),
+                     "start": w.get("start", 0),
+                     "end": w.get("end", 0)
+                 }
+                 for w in result.get("words", [])
+             ]
+         else:
+             formatted["words"] = []
+
+         return formatted
+
+     except Exception as e:
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend="openai",
+             success=False,
+             error=str(e)
+         )
+
+
+ async def transcribe_with_whisperx(
+     audio_path: Path,
+     word_timestamps: bool = True,
+     language: Optional[str] = None
+ ) -> TranscriptionResult:
+     """
+     Transcribe using WhisperX for enhanced word-level alignment.
+     """
+
+     try:
+         # Try importing WhisperX
+         import whisperx
+         import torch
+     except ImportError:
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend="whisperx",
+             success=False,
+             error="WhisperX not installed. Install with: pip install git+https://github.com/m-bain/whisperX.git"
+         )
+
+     try:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         compute_type = "float16" if device == "cuda" else "int8"
+
+         # Load model
+         model = whisperx.load_model("large-v3", device, compute_type=compute_type)
+
+         # Load audio
+         audio = whisperx.load_audio(str(audio_path))
+
+         # Transcribe
+         result = model.transcribe(audio, batch_size=16, language=language)
+
+         # Align for word timestamps if requested
+         if word_timestamps:
+             # Load alignment model
+             model_a, metadata = whisperx.load_align_model(
+                 language_code=result.get("language", language or "en"),
+                 device=device
+             )
+
+             # Align
+             result = whisperx.align(
+                 result["segments"],
+                 model_a,
+                 metadata,
+                 audio,
+                 device,
+                 return_char_alignments=False
+             )
+
+         # Format response
+         formatted = TranscriptionResult(
+             text=" ".join(s.get("text", "") for s in result.get("segments", [])),
+             language=result.get("language", ""),
+             segments=result.get("segments", []),
+             backend="whisperx",
+             success=True
+         )
+
+         # Add enhanced_alignment flag
+         if word_timestamps:
+             formatted["enhanced_alignment"] = True
+
+         # Flatten words if available
+         if word_timestamps:
+             formatted["words"] = []
+             for segment in formatted["segments"]:
+                 if "words" in segment:
+                     formatted["words"].extend(segment["words"])
+
+         return formatted
+
+     except Exception as e:
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend="whisperx",
+             success=False,
+             error=str(e)
+         )
+
+
+ async def transcribe_with_whisper_cpp(
+     audio_path: Path,
+     word_timestamps: bool = False,
+     language: Optional[str] = None
+ ) -> TranscriptionResult:
+     """
+     Transcribe using local whisper.cpp server.
+     """
+
+     # Check if whisper-server is running (using localhost:2022 as configured)
+     server_url = "http://localhost:2022/v1/audio/transcriptions"
+
+     # Convert audio to WAV if needed
+     if audio_path.suffix.lower() != ".wav":
+         # Use ffmpeg to convert
+         wav_path = Path(tempfile.mktemp(suffix=".wav"))
+         try:
+             subprocess.run([
+                 "ffmpeg", "-i", str(audio_path),
+                 "-ar", "16000", "-ac", "1", "-f", "wav",
+                 str(wav_path)
+             ], check=True, capture_output=True)
+         except subprocess.CalledProcessError as e:
+             return TranscriptionResult(
+                 text="",
+                 language="",
+                 segments=[],
+                 backend="whisper-cpp",
+                 success=False,
+                 error=f"Failed to convert audio to WAV: {e.stderr.decode() if e.stderr else str(e)}"
+             )
+     else:
+         wav_path = audio_path
+
+     try:
+         # Read audio file
+         with open(wav_path, "rb") as f:
+             audio_data = f.read()
+
+         # Prepare request
+         files = {"file": ("audio.wav", audio_data, "audio/wav")}
+         data = {
+             "response_format": "verbose_json" if word_timestamps else "json",
+             "word_timestamps": "true" if word_timestamps else "false"
+         }
+         if language:
+             data["language"] = language
+
+         # Send request
+         async with httpx.AsyncClient() as client:
+             response = await client.post(
+                 server_url,
+                 files=files,
+                 data=data,
+                 timeout=120.0
+             )
+
+         if response.status_code != 200:
+             raise Exception(f"Whisper server error: {response.text}")
+
+         result = response.json()
+
+         # Format response
+         formatted = TranscriptionResult(
+             text=result.get("text", ""),
+             language=result.get("language", ""),
+             segments=result.get("segments", []),
+             backend="whisper-cpp",
+             success=True
+         )
+
+         # Add word timestamps if available
+         if word_timestamps and "words" in result:
+             formatted["words"] = result["words"]
+
+         return formatted
+
+     except Exception as e:
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend="whisper-cpp",
+             success=False,
+             error=str(e)
+         )
+
+     finally:
+         # Clean up temp file if created
+         if wav_path != audio_path and wav_path.exists():
+             wav_path.unlink()
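
Each of these backends returns the same TranscriptionResult mapping, so callers can swap them without changing how they read results. As a minimal illustrative sketch (not part of the package), assuming OPENAI_API_KEY is configured and `clip.wav` is a placeholder path to a short audio file:

    import asyncio
    from pathlib import Path

    from voice_mode.tools.transcription.backends import transcribe_with_openai

    async def main() -> None:
        # Request word-level timestamps alongside the segment-level ones
        result = await transcribe_with_openai(
            Path("clip.wav"),  # hypothetical input file
            word_timestamps=True,
            language="en",
        )
        if result["success"]:
            print(result["text"])
            for w in result.get("words", []):
                print(f'{w["word"]}: {w["start"]:.2f}s -> {w["end"]:.2f}s')
        else:
            print("Transcription failed:", result["error"])

    asyncio.run(main())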
voice_mode/tools/transcription/core.py (new file)
@@ -0,0 +1,136 @@
+ """Core transcription functionality."""
+
+ import asyncio
+ from pathlib import Path
+ from typing import Optional, Union, BinaryIO, Dict, Any
+
+ from .types import TranscriptionResult, TranscriptionBackend, OutputFormat
+ from .backends import (
+     transcribe_with_openai,
+     transcribe_with_whisperx,
+     transcribe_with_whisper_cpp
+ )
+ from .formats import convert_to_format
+
+
+ async def transcribe_audio(
+     audio_file: Union[str, Path, BinaryIO],
+     word_timestamps: bool = False,
+     backend: TranscriptionBackend = TranscriptionBackend.OPENAI,
+     output_format: OutputFormat = OutputFormat.JSON,
+     language: Optional[str] = None,
+     model: str = "whisper-1"
+ ) -> TranscriptionResult:
+     """
+     Transcribe audio with optional word-level timestamps.
+
+     This is the main API entry point for VoiceMode transcription.
+
+     Args:
+         audio_file: Path to audio file or file-like object
+         word_timestamps: Include word-level timestamps
+         backend: Which transcription backend to use
+         output_format: Output format for transcription
+         language: Language code (e.g., 'en', 'es', 'fr')
+         model: Model to use (for OpenAI backend)
+
+     Returns:
+         TranscriptionResult with transcription data
+     """
+     # Convert path to Path object
+     if isinstance(audio_file, str):
+         audio_path = Path(audio_file)
+     elif isinstance(audio_file, Path):
+         audio_path = audio_file
+     else:
+         # Handle BinaryIO case
+         import tempfile
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+             tmp.write(audio_file.read())
+             audio_path = Path(tmp.name)
+
+     # Validate file exists
+     if not audio_path.exists():
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend=backend.value,
+             success=False,
+             error=f"Audio file not found: {audio_path}"
+         )
+
+     # Call appropriate backend
+     try:
+         if backend == TranscriptionBackend.OPENAI:
+             result = await transcribe_with_openai(
+                 audio_path,
+                 word_timestamps=word_timestamps,
+                 language=language,
+                 model=model
+             )
+         elif backend == TranscriptionBackend.WHISPERX:
+             result = await transcribe_with_whisperx(
+                 audio_path,
+                 word_timestamps=word_timestamps,
+                 language=language
+             )
+         elif backend == TranscriptionBackend.WHISPER_CPP:
+             result = await transcribe_with_whisper_cpp(
+                 audio_path,
+                 word_timestamps=word_timestamps,
+                 language=language
+             )
+         else:
+             return TranscriptionResult(
+                 text="",
+                 language="",
+                 segments=[],
+                 backend=backend.value,
+                 success=False,
+                 error=f"Unknown backend: {backend}"
+             )
+
+         # Convert format if needed
+         if output_format != OutputFormat.JSON and result.get("success", False):
+             formatted_content = convert_to_format(result, output_format)
+             result["formatted_content"] = formatted_content
+
+         return result
+
+     except Exception as e:
+         return TranscriptionResult(
+             text="",
+             language="",
+             segments=[],
+             backend=backend.value,
+             success=False,
+             error=str(e)
+         )
+     finally:
+         # Clean up temp file if created from BinaryIO
+         if not isinstance(audio_file, (str, Path)) and audio_path.exists():
+             audio_path.unlink()
+
+
+ def transcribe_audio_sync(
+     audio_file: Union[str, Path, BinaryIO],
+     word_timestamps: bool = False,
+     backend: TranscriptionBackend = TranscriptionBackend.OPENAI,
+     output_format: OutputFormat = OutputFormat.JSON,
+     language: Optional[str] = None,
+     model: str = "whisper-1"
+ ) -> TranscriptionResult:
+     """
+     Synchronous wrapper for transcribe_audio.
+
+     Useful for CLI and non-async contexts.
+     """
+     return asyncio.run(transcribe_audio(
+         audio_file=audio_file,
+         word_timestamps=word_timestamps,
+         backend=backend,
+         output_format=output_format,
+         language=language,
+         model=model
+     ))
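
transcribe_audio is the async entry point and transcribe_audio_sync wraps it for CLI or other non-async callers. A usage sketch (illustrative only; `meeting.m4a` is a hypothetical file, and the whisper-cpp backend assumes a local whisper.cpp server on port 2022 as configured above):

    from voice_mode.tools.transcription.core import transcribe_audio_sync
    from voice_mode.tools.transcription.types import TranscriptionBackend, OutputFormat

    result = transcribe_audio_sync(
        "meeting.m4a",                             # hypothetical input path
        word_timestamps=True,
        backend=TranscriptionBackend.WHISPER_CPP,  # route to the local server
        output_format=OutputFormat.SRT,            # attach an SRT rendering
        language="en",
    )

    if result["success"]:
        # Non-JSON formats are attached under "formatted_content"
        print(result["formatted_content"])
    else:
        print("Error:", result["error"])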
voice_mode/tools/transcription/formats.py (new file)
@@ -0,0 +1,144 @@
+ """Format converters for transcription output."""
+
+ import csv
+ import io
+ from typing import Dict, Any, List
+
+ from .types import TranscriptionResult, OutputFormat
+
+
+ def format_timestamp_srt(seconds: float) -> str:
+     """Format timestamp for SRT (HH:MM:SS,mmm)"""
+     hours = int(seconds // 3600)
+     minutes = int((seconds % 3600) // 60)
+     secs = seconds % 60
+     return f"{hours:02d}:{minutes:02d}:{secs:06.3f}".replace(".", ",")
+
+
+ def format_timestamp_vtt(seconds: float) -> str:
+     """Format timestamp for WebVTT (HH:MM:SS.mmm)"""
+     hours = int(seconds // 3600)
+     minutes = int((seconds % 3600) // 60)
+     secs = seconds % 60
+     return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
+
+
+ def convert_to_srt(transcription: Dict[str, Any]) -> str:
+     """
+     Convert transcription to SRT subtitle format.
+     """
+     srt_lines = []
+
+     for i, segment in enumerate(transcription.get("segments", []), 1):
+         start = format_timestamp_srt(segment.get("start", 0))
+         end = format_timestamp_srt(segment.get("end", 0))
+         text = segment.get("text", "").strip()
+
+         # Add speaker if available
+         if "speaker" in segment:
+             text = f"[{segment['speaker']}] {text}"
+
+         srt_lines.append(str(i))
+         srt_lines.append(f"{start} --> {end}")
+         srt_lines.append(text)
+         srt_lines.append("")
+
+     return "\n".join(srt_lines)
+
+
+ def convert_to_vtt(transcription: Dict[str, Any]) -> str:
+     """
+     Convert transcription to WebVTT format.
+     """
+     vtt_lines = ["WEBVTT", ""]
+
+     for segment in transcription.get("segments", []):
+         start = format_timestamp_vtt(segment.get("start", 0))
+         end = format_timestamp_vtt(segment.get("end", 0))
+         text = segment.get("text", "").strip()
+
+         # Add speaker if available
+         if "speaker" in segment:
+             text = f"<v {segment['speaker']}>{text}"
+
+         vtt_lines.append(f"{start} --> {end}")
+         vtt_lines.append(text)
+         vtt_lines.append("")
+
+     return "\n".join(vtt_lines)
+
+
+ def convert_to_csv(transcription: Dict[str, Any]) -> str:
+     """
+     Convert transcription to CSV format with word-level data.
+     """
+     output = io.StringIO()
+
+     # Determine columns based on available data
+     has_words = "words" in transcription and transcription["words"]
+     has_speakers = any("speaker" in w for w in transcription.get("words", []))
+     has_probability = any("probability" in w for w in transcription.get("words", []))
+
+     # Write header
+     if has_words:
+         headers = ["word", "start", "end"]
+         if has_speakers:
+             headers.append("speaker")
+         if has_probability:
+             headers.append("probability")
+     else:
+         headers = ["text", "start", "end"]
+         if has_speakers:
+             headers.append("speaker")
+
+     writer = csv.DictWriter(output, fieldnames=headers)
+     writer.writeheader()
+
+     # Write data
+     if has_words:
+         for word in transcription.get("words", []):
+             row = {
+                 "word": word.get("word", ""),
+                 "start": word.get("start", 0),
+                 "end": word.get("end", 0)
+             }
+             if has_speakers:
+                 row["speaker"] = word.get("speaker", "")
+             if has_probability:
+                 row["probability"] = word.get("probability", "")
+             writer.writerow(row)
+     else:
+         for segment in transcription.get("segments", []):
+             row = {
+                 "text": segment.get("text", "").strip(),
+                 "start": segment.get("start", 0),
+                 "end": segment.get("end", 0)
+             }
+             if has_speakers:
+                 row["speaker"] = segment.get("speaker", "")
+             writer.writerow(row)
+
+     return output.getvalue()
+
+
+ def convert_to_format(transcription: TranscriptionResult, format: OutputFormat) -> str:
+     """
+     Convert transcription to specified format.
+
+     Args:
+         transcription: The transcription result
+         format: Target output format
+
+     Returns:
+         Formatted string representation
+     """
+     if format == OutputFormat.SRT:
+         return convert_to_srt(transcription)
+     elif format == OutputFormat.VTT:
+         return convert_to_vtt(transcription)
+     elif format == OutputFormat.CSV:
+         return convert_to_csv(transcription)
+     else:
+         # Default to JSON (handled elsewhere)
+         import json
+         return json.dumps(transcription, indent=2)
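
For reference, a small worked example of the SRT converter; the segment values are invented for illustration:

    from voice_mode.tools.transcription.formats import convert_to_srt

    transcription = {
        "segments": [
            {"text": "Hello there.", "start": 0.0, "end": 1.5},
            {"text": "Nice to meet you.", "start": 1.5, "end": 3.25, "speaker": "SPEAKER_01"},
        ]
    }

    print(convert_to_srt(transcription))
    # 1
    # 00:00:00,000 --> 00:00:01,500
    # Hello there.
    #
    # 2
    # 00:00:01,500 --> 00:00:03,250
    # [SPEAKER_01] Nice to meet you.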
voice_mode/tools/transcription/types.py (new file)
@@ -0,0 +1,52 @@
+ """Type definitions for transcription module."""
+
+ from typing import TypedDict, List, Optional, Literal
+ from enum import Enum
+
+
+ class TranscriptionBackend(str, Enum):
+     """Available transcription backends."""
+     OPENAI = "openai"
+     WHISPERX = "whisperx"
+     WHISPER_CPP = "whisper-cpp"
+
+
+ class OutputFormat(str, Enum):
+     """Available output formats."""
+     JSON = "json"
+     SRT = "srt"
+     VTT = "vtt"
+     CSV = "csv"
+
+
+ class WordData(TypedDict, total=False):
+     """Word-level timestamp data."""
+     word: str
+     start: float
+     end: float
+     probability: Optional[float]
+     speaker: Optional[str]
+
+
+ class SegmentData(TypedDict, total=False):
+     """Segment-level timestamp data."""
+     id: Optional[int]
+     text: str
+     start: float
+     end: float
+     words: Optional[List[WordData]]
+     speaker: Optional[str]
+
+
+ class TranscriptionResult(TypedDict, total=False):
+     """Complete transcription result."""
+     text: str
+     language: str
+     duration: Optional[float]
+     segments: List[SegmentData]
+     words: Optional[List[WordData]]
+     backend: str
+     model: Optional[str]
+     success: bool
+     error: Optional[str]
+     formatted_content: Optional[str]  # For non-JSON output formats
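
Because both enums subclass str, raw strings from a CLI flag or config file map onto them directly, and the TypedDicts are plain dicts at runtime. A short illustrative snippet:

    from voice_mode.tools.transcription.types import (
        OutputFormat,
        TranscriptionBackend,
        TranscriptionResult,
    )

    # Enum lookup by value accepts the same strings a CLI would pass
    backend = TranscriptionBackend("whisper-cpp")
    assert backend is TranscriptionBackend.WHISPER_CPP
    assert backend == "whisper-cpp"  # str subclass, so plain comparison works

    fmt = OutputFormat("srt")

    # TranscriptionResult is a total=False TypedDict, i.e. an ordinary dict
    # at runtime with every key optional
    failed: TranscriptionResult = {
        "text": "",
        "segments": [],
        "backend": backend.value,
        "success": False,
        "error": "not run yet",
    }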
{voice_mode-3.34.3.dist-info → voice_mode-4.1.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: voice-mode
- Version: 3.34.3
+ Version: 4.1.0
  Summary: VoiceMode - Voice interaction capabilities for AI assistants (formerly voice-mcp)
  Project-URL: Homepage, https://github.com/mbailey/voicemode
  Project-URL: Repository, https://github.com/mbailey/voicemode
@@ -66,9 +66,12 @@ Requires-Dist: pandas>=2.0.0; extra == 'notebooks'
  Provides-Extra: scripts
  Requires-Dist: flask>=3.0.0; extra == 'scripts'
  Provides-Extra: test
+ Requires-Dist: coverage[toml]>=7.4.0; extra == 'test'
  Requires-Dist: pytest-asyncio>=0.21.0; extra == 'test'
- Requires-Dist: pytest-cov>=4.0.0; extra == 'test'
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'test'
  Requires-Dist: pytest-mock>=3.10.0; extra == 'test'
+ Requires-Dist: pytest-timeout>=2.2.0; extra == 'test'
+ Requires-Dist: pytest-xdist>=3.5.0; extra == 'test'
  Requires-Dist: pytest>=7.0.0; extra == 'test'
  Description-Content-Type: text/markdown