openadapt-capture 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_capture/__init__.py +140 -0
- openadapt_capture/audio.py +489 -0
- openadapt_capture/capture.py +300 -0
- openadapt_capture/cli.py +289 -0
- openadapt_capture/comparison.py +276 -0
- openadapt_capture/config.py +29 -0
- openadapt_capture/events.py +280 -0
- openadapt_capture/input.py +494 -0
- openadapt_capture/processing.py +548 -0
- openadapt_capture/recorder.py +304 -0
- openadapt_capture/stats.py +212 -0
- openadapt_capture/storage.py +617 -0
- openadapt_capture/video.py +440 -0
- openadapt_capture/visualize/__init__.py +11 -0
- openadapt_capture/visualize/demo.py +343 -0
- openadapt_capture/visualize/html.py +1538 -0
- openadapt_capture/visualize/overlays.py +469 -0
- openadapt_capture-0.1.0.dist-info/METADATA +227 -0
- openadapt_capture-0.1.0.dist-info/RECORD +21 -0
- openadapt_capture-0.1.0.dist-info/WHEEL +4 -0
- openadapt_capture-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""OpenAdapt Capture - GUI interaction capture.
|
|
2
|
+
|
|
3
|
+
Platform-agnostic event streams with time-aligned media.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
|
|
8
|
+
# High-level APIs (primary interface)
|
|
9
|
+
from openadapt_capture.capture import Action, Capture, CaptureSession
|
|
10
|
+
|
|
11
|
+
# Frame comparison utilities
|
|
12
|
+
from openadapt_capture.comparison import (
|
|
13
|
+
ComparisonReport,
|
|
14
|
+
FrameComparison,
|
|
15
|
+
compare_frames,
|
|
16
|
+
compare_video_to_images,
|
|
17
|
+
plot_comparison,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Event types
|
|
21
|
+
from openadapt_capture.events import (
|
|
22
|
+
ActionEvent,
|
|
23
|
+
AudioChunkEvent,
|
|
24
|
+
AudioEvent,
|
|
25
|
+
BaseEvent,
|
|
26
|
+
Event,
|
|
27
|
+
EventType,
|
|
28
|
+
KeyDownEvent,
|
|
29
|
+
KeyTypeEvent,
|
|
30
|
+
KeyUpEvent,
|
|
31
|
+
MouseButton,
|
|
32
|
+
MouseClickEvent,
|
|
33
|
+
MouseDoubleClickEvent,
|
|
34
|
+
MouseDownEvent,
|
|
35
|
+
MouseDragEvent,
|
|
36
|
+
MouseMoveEvent,
|
|
37
|
+
MouseScrollEvent,
|
|
38
|
+
MouseUpEvent,
|
|
39
|
+
ScreenEvent,
|
|
40
|
+
ScreenFrameEvent,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# Event processing
|
|
44
|
+
from openadapt_capture.processing import (
|
|
45
|
+
detect_drag_events,
|
|
46
|
+
get_action_events,
|
|
47
|
+
get_audio_events,
|
|
48
|
+
get_screen_events,
|
|
49
|
+
merge_consecutive_keyboard_events,
|
|
50
|
+
merge_consecutive_mouse_click_events,
|
|
51
|
+
merge_consecutive_mouse_move_events,
|
|
52
|
+
merge_consecutive_mouse_scroll_events,
|
|
53
|
+
process_events,
|
|
54
|
+
remove_invalid_keyboard_events,
|
|
55
|
+
remove_redundant_mouse_move_events,
|
|
56
|
+
)
|
|
57
|
+
from openadapt_capture.recorder import Recorder
|
|
58
|
+
|
|
59
|
+
# Performance statistics
|
|
60
|
+
from openadapt_capture.stats import (
|
|
61
|
+
CaptureStats,
|
|
62
|
+
PerfStat,
|
|
63
|
+
plot_capture_performance,
|
|
64
|
+
)
|
|
65
|
+
from openadapt_capture.storage import Capture as CaptureMetadata
|
|
66
|
+
|
|
67
|
+
# Storage (low-level)
|
|
68
|
+
from openadapt_capture.storage import (
|
|
69
|
+
CaptureStorage,
|
|
70
|
+
Stream,
|
|
71
|
+
create_capture,
|
|
72
|
+
load_capture,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# Visualization
|
|
76
|
+
from openadapt_capture.visualize import create_demo, create_html
|
|
77
|
+
|
|
78
|
+
__all__ = [
|
|
79
|
+
# Version
|
|
80
|
+
"__version__",
|
|
81
|
+
# High-level APIs
|
|
82
|
+
"Recorder",
|
|
83
|
+
"Capture",
|
|
84
|
+
"CaptureSession",
|
|
85
|
+
"Action",
|
|
86
|
+
# Event types
|
|
87
|
+
"EventType",
|
|
88
|
+
"MouseButton",
|
|
89
|
+
"BaseEvent",
|
|
90
|
+
"Event",
|
|
91
|
+
"ActionEvent",
|
|
92
|
+
"ScreenEvent",
|
|
93
|
+
"AudioEvent",
|
|
94
|
+
# Mouse events
|
|
95
|
+
"MouseMoveEvent",
|
|
96
|
+
"MouseDownEvent",
|
|
97
|
+
"MouseUpEvent",
|
|
98
|
+
"MouseScrollEvent",
|
|
99
|
+
"MouseClickEvent",
|
|
100
|
+
"MouseDoubleClickEvent",
|
|
101
|
+
"MouseDragEvent",
|
|
102
|
+
# Keyboard events
|
|
103
|
+
"KeyDownEvent",
|
|
104
|
+
"KeyUpEvent",
|
|
105
|
+
"KeyTypeEvent",
|
|
106
|
+
# Screen/audio events
|
|
107
|
+
"ScreenFrameEvent",
|
|
108
|
+
"AudioChunkEvent",
|
|
109
|
+
# Storage (low-level)
|
|
110
|
+
"CaptureMetadata",
|
|
111
|
+
"Stream",
|
|
112
|
+
"CaptureStorage",
|
|
113
|
+
"create_capture",
|
|
114
|
+
"load_capture",
|
|
115
|
+
# Processing
|
|
116
|
+
"process_events",
|
|
117
|
+
"remove_invalid_keyboard_events",
|
|
118
|
+
"remove_redundant_mouse_move_events",
|
|
119
|
+
"merge_consecutive_keyboard_events",
|
|
120
|
+
"merge_consecutive_mouse_move_events",
|
|
121
|
+
"merge_consecutive_mouse_scroll_events",
|
|
122
|
+
"merge_consecutive_mouse_click_events",
|
|
123
|
+
"detect_drag_events",
|
|
124
|
+
"get_action_events",
|
|
125
|
+
"get_screen_events",
|
|
126
|
+
"get_audio_events",
|
|
127
|
+
# Performance statistics
|
|
128
|
+
"CaptureStats",
|
|
129
|
+
"PerfStat",
|
|
130
|
+
"plot_capture_performance",
|
|
131
|
+
# Frame comparison
|
|
132
|
+
"ComparisonReport",
|
|
133
|
+
"FrameComparison",
|
|
134
|
+
"compare_frames",
|
|
135
|
+
"compare_video_to_images",
|
|
136
|
+
"plot_comparison",
|
|
137
|
+
# Visualization
|
|
138
|
+
"create_demo",
|
|
139
|
+
"create_html",
|
|
140
|
+
]
|
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
"""Audio capture and transcription.
|
|
2
|
+
|
|
3
|
+
This module provides audio recording with optional Whisper transcription,
|
|
4
|
+
following OpenAdapt's proven implementation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import io
|
|
10
|
+
import threading
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
14
|
+
|
|
15
|
+
from openadapt_capture.events import AudioChunkEvent
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
# Optional dependencies - imported at runtime
|
|
21
|
+
_sounddevice = None
|
|
22
|
+
_soundfile = None
|
|
23
|
+
_whisper = None
|
|
24
|
+
_np = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _import_audio_deps() -> None:
    """Load the optional audio stack (numpy, sounddevice, soundfile) on first use.

    The imported modules are cached in the module-level ``_np``,
    ``_sounddevice`` and ``_soundfile`` globals; later calls return
    immediately once ``_sounddevice`` has been populated.

    Raises:
        ImportError: If any of the three packages cannot be imported.
    """
    global _sounddevice, _soundfile, _np

    # Already loaded by an earlier call - nothing to do.
    if _sounddevice is not None:
        return

    try:
        import numpy
        import sounddevice as sd
        import soundfile as sf
    except ImportError as exc:
        raise ImportError(
            "Audio dependencies required. Install with: "
            "pip install sounddevice soundfile numpy"
        ) from exc

    _sounddevice, _soundfile, _np = sd, sf, numpy
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _import_whisper() -> None:
    """Load the optional ``whisper`` package on first use.

    The module is cached in the module-level ``_whisper`` global; later calls
    return immediately once it has been populated.

    Raises:
        ImportError: If ``whisper`` cannot be imported.
    """
    global _whisper

    # Already loaded by an earlier call - nothing to do.
    if _whisper is not None:
        return

    try:
        import whisper as whisper_module
    except ImportError as exc:
        raise ImportError(
            "Whisper is required for transcription. Install with: "
            "pip install openai-whisper"
        ) from exc

    _whisper = whisper_module
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _get_timestamp() -> float:
    """Return the current wall-clock time in seconds since the Unix epoch."""
    now = time.time()
    return now
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# =============================================================================
|
|
67
|
+
# Audio Recorder
|
|
68
|
+
# =============================================================================
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class AudioRecorder:
    """Records audio from the default microphone.

    Frames delivered by the sounddevice input stream are buffered in memory
    until they are read back with :meth:`get_audio`, written out as FLAC, or
    transcribed with Whisper.

    Usage:
        recorder = AudioRecorder()
        recorder.start()
        # ... record for some time ...
        recorder.stop()

        # Get the recorded audio
        audio_data = recorder.get_audio()

        # Optionally transcribe
        text = recorder.transcribe()
    """

    # Whisper treats in-memory waveforms as 16 kHz mono; it only resamples
    # when it is given a file path to decode itself.
    _WHISPER_SAMPLE_RATE = 16000

    def __init__(
        self,
        sample_rate: int = 44100,
        channels: int = 1,
    ) -> None:
        """Initialize audio recorder.

        Args:
            sample_rate: Audio sample rate in Hz. Recordings made at a rate
                other than 16 kHz are resampled by :meth:`transcribe` before
                being handed to Whisper.
            channels: Number of audio channels (1 for mono).

        Raises:
            ImportError: If the optional audio dependencies are missing.
        """
        _import_audio_deps()

        self.sample_rate = sample_rate
        self.channels = channels
        # Each entry is one callback buffer as delivered by sounddevice.
        self._frames: list["np.ndarray"] = []
        self._stream = None
        self._running = False
        self._start_time: float | None = None
        # Guards _frames, which is appended to from the audio callback.
        self._lock = threading.Lock()

    def _audio_callback(
        self,
        indata: "np.ndarray",
        frames: int,
        time_info: Any,
        status: Any,
    ) -> None:
        """Callback for audio stream: buffer a copy of the incoming frames."""
        if status:
            # Log any audio issues (overflow, underflow)
            import sys

            print(f"Audio status: {status}", file=sys.stderr)
        with self._lock:
            # Copy because the array is handed to us by the stream callback
            # and we keep it after the callback returns.
            self._frames.append(indata.copy())

    @property
    def start_time(self) -> float | None:
        """Get the recording start time (None before the first start)."""
        return self._start_time

    @property
    def is_recording(self) -> bool:
        """Check if currently recording."""
        return self._running

    def start(self) -> None:
        """Start recording audio, discarding any previously buffered frames.

        Does nothing if a recording is already in progress.
        """
        if self._running:
            return

        with self._lock:
            self._frames = []

        stream = _sounddevice.InputStream(
            callback=self._audio_callback,
            samplerate=self.sample_rate,
            channels=self.channels,
        )
        self._start_time = _get_timestamp()
        try:
            stream.start()
        except Exception:
            # Do not leak the opened device if the stream fails to start.
            stream.close()
            raise
        self._stream = stream
        self._running = True

    def stop(self) -> None:
        """Stop recording audio. Buffered frames stay available afterwards."""
        if not self._running:
            return

        stream = self._stream
        try:
            if stream is not None:
                try:
                    stream.stop()
                finally:
                    stream.close()
        finally:
            # Always leave the recorder in a restartable state, even if the
            # audio backend raised while shutting the stream down.
            self._stream = None
            self._running = False

    def get_audio(self) -> "np.ndarray":
        """Get the recorded audio as a numpy array.

        Returns:
            One-dimensional float32 array of all buffered samples. For
            multi-channel recordings the channels are interleaved
            (frame-major), i.e. the result has ``frames * channels`` entries.
        """
        with self._lock:
            if not self._frames:
                return _np.array([], dtype=_np.float32)

            concatenated = _np.concatenate(self._frames, axis=0)
            return concatenated.flatten().astype(_np.float32)

    def get_duration(self) -> float:
        """Get the duration of recorded audio in seconds.

        The duration is derived from the number of buffered frames (rows), so
        it does not grow with the channel count.
        """
        with self._lock:
            frame_count = sum(len(block) for block in self._frames)
        return frame_count / self.sample_rate

    def _audio_for_writing(self) -> "np.ndarray":
        """Return the buffered audio shaped the way soundfile expects it.

        Mono audio stays one-dimensional; interleaved multi-channel audio is
        reshaped to ``(frames, channels)`` so that it is not written out as a
        mono stream that is ``channels`` times too long.
        """
        audio = self.get_audio()
        if self.channels > 1:
            return audio.reshape(-1, self.channels)
        return audio

    def _to_whisper_input(self, audio: "np.ndarray") -> "np.ndarray":
        """Convert interleaved samples to the 16 kHz mono float32 Whisper needs.

        Args:
            audio: One-dimensional array as returned by :meth:`get_audio`.

        Returns:
            Mono float32 waveform at 16 kHz. Resampling uses linear
            interpolation without an anti-aliasing filter.
        """
        if self.channels > 1:
            # Undo the interleaving, then average the channels down to mono.
            audio = audio.reshape(-1, self.channels).mean(axis=1)
        audio = audio.astype(_np.float32)

        target_rate = self._WHISPER_SAMPLE_RATE
        if self.sample_rate == target_rate or len(audio) == 0:
            return audio

        out_len = int(round(len(audio) * target_rate / self.sample_rate))
        if out_len < 1:
            return audio[:0]
        src_times = _np.arange(len(audio)) / self.sample_rate
        dst_times = _np.arange(out_len) / target_rate
        return _np.interp(dst_times, src_times, audio).astype(_np.float32)

    def save_flac(self, output_path: str | Path) -> None:
        """Save recorded audio to FLAC file.

        Args:
            output_path: Path to output FLAC file.
        """
        audio = self._audio_for_writing()
        _soundfile.write(str(output_path), audio, self.sample_rate, format="FLAC")

    def get_flac_bytes(self) -> bytes:
        """Get recorded audio as FLAC-compressed bytes.

        Returns:
            FLAC-compressed audio data.
        """
        audio = self._audio_for_writing()
        buffer = io.BytesIO()
        _soundfile.write(buffer, audio, self.sample_rate, format="FLAC")
        return buffer.getvalue()

    def transcribe(
        self,
        model_name: str = "base",
        word_timestamps: bool = True,
    ) -> dict[str, Any]:
        """Transcribe recorded audio using Whisper.

        The recording is mixed down to mono and resampled to 16 kHz first,
        because Whisper does not resample waveforms passed as arrays.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large).
            word_timestamps: Whether to include word-level timestamps.

        Returns:
            Transcription result dict with 'text' and 'segments'. An empty
            recording yields ``{"text": "", "segments": []}`` without loading
            a model.

        Raises:
            ImportError: If Whisper is not installed.
        """
        _import_whisper()

        audio = self.get_audio()
        if len(audio) == 0:
            return {"text": "", "segments": []}

        model = _whisper.load_model(model_name)
        result = model.transcribe(
            self._to_whisper_input(audio),
            word_timestamps=word_timestamps,
            fp16=False,  # Use float32 for CPU compatibility
        )
        return result

    def __enter__(self) -> "AudioRecorder":
        """Context manager entry: start recording."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: stop recording."""
        self.stop()
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# =============================================================================
|
|
238
|
+
# Audio Chunk Generator
|
|
239
|
+
# =============================================================================
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class AudioChunkGenerator:
    """Generates AudioChunkEvents from recorded audio.

    Splits audio into fixed-duration chunks for storage and processing and,
    when a transcription with word timestamps is supplied, attaches to each
    chunk the words that start inside it.
    """

    def __init__(
        self,
        chunk_duration: float = 30.0,
        sample_rate: int = 16000,
    ) -> None:
        """Initialize chunk generator.

        Args:
            chunk_duration: Duration of each chunk in seconds.
            sample_rate: Sample rate in Hz of the audio that will be passed to
                :meth:`generate_chunks`; it must match the recording's rate
                for the chunk timestamps to be correct.
        """
        self.chunk_duration = chunk_duration
        self.sample_rate = sample_rate

    @staticmethod
    def _collect_words(
        transcription: dict[str, Any] | None,
    ) -> list[tuple[float, str]]:
        """Flatten a transcription into ``(start_offset_seconds, word)`` pairs.

        Segments without a "words" entry are skipped. Words are stripped of
        surrounding whitespace and empty words are dropped.
        """
        if not transcription:
            return []
        words: list[tuple[float, str]] = []
        for segment in transcription.get("segments") or []:
            for word in segment.get("words", ()):
                text = str(word["word"]).strip()
                if text:
                    words.append((word["start"], text))
        return words

    @staticmethod
    def _text_between(
        words: list[tuple[float, str]],
        begin: float,
        end: float,
    ) -> str | None:
        """Join the words whose start offset lies in ``[begin, end)``.

        Returns:
            The space-joined words, or None when no word starts in the window.
        """
        selected = [text for offset, text in words if begin <= offset < end]
        return " ".join(selected) if selected else None

    def generate_chunks(
        self,
        audio: "np.ndarray",
        start_time: float,
        transcription: dict[str, Any] | None = None,
    ) -> list[AudioChunkEvent]:
        """Generate AudioChunkEvents from audio data.

        Each word of the transcription is assigned to exactly one chunk, the
        one in which the word starts, so words that straddle a chunk boundary
        are not lost. Words starting at or after the end of the audio are
        attached to the final chunk.

        Args:
            audio: Audio data as a one-dimensional sequence of samples.
            start_time: Unix timestamp of audio start.
            transcription: Optional transcription result with 'segments' that
                carry word-level timestamps relative to the audio start.

        Returns:
            List of AudioChunkEvent objects (empty for empty audio).

        Raises:
            ValueError: If ``chunk_duration * sample_rate`` is less than one
                sample, which would otherwise never make progress.
        """
        total_samples = len(audio)
        if total_samples == 0:
            return []

        samples_per_chunk = int(self.chunk_duration * self.sample_rate)
        if samples_per_chunk < 1:
            raise ValueError(
                "chunk_duration * sample_rate must be at least one sample, got "
                f"chunk_duration={self.chunk_duration!r}, "
                f"sample_rate={self.sample_rate!r}"
            )

        # Flatten the word list once instead of rescanning every segment for
        # every chunk.
        words = self._collect_words(transcription)

        events = []
        for start_sample in range(0, total_samples, samples_per_chunk):
            end_sample = min(start_sample + samples_per_chunk, total_samples)

            # Offsets are kept relative to the audio start so word matching
            # does not depend on float rounding of large epoch timestamps.
            begin_offset = start_sample / self.sample_rate
            end_offset = end_sample / self.sample_rate
            is_last_chunk = end_sample >= total_samples

            chunk_text = self._text_between(
                words,
                begin_offset,
                float("inf") if is_last_chunk else end_offset,
            )

            chunk_start_time = start_time + begin_offset
            events.append(
                AudioChunkEvent(
                    timestamp=chunk_start_time,
                    start_time=chunk_start_time,
                    end_time=start_time + end_offset,
                    transcription=chunk_text,
                )
            )

        return events
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# =============================================================================
|
|
319
|
+
# Continuous Audio Capture
|
|
320
|
+
# =============================================================================
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
class ContinuousAudioCapture:
    """Continuous audio capture with streaming transcription support.

    For long captures, provides periodic callbacks with audio chunks. Full
    chunks are processed on short-lived daemon threads, so callbacks for
    consecutive chunks may overlap; the final partial chunk is processed
    synchronously inside :meth:`stop`.

    Usage:
        def on_chunk(event, audio_bytes):
            print(f"Chunk: {event.start_time} - {event.end_time}")

        capture = ContinuousAudioCapture(on_chunk, chunk_duration=30.0)
        capture.start()
        # ... capture audio ...
        capture.stop()
    """

    # Whisper treats in-memory waveforms as 16 kHz mono; it only resamples
    # when it is given a file path to decode itself.
    _WHISPER_SAMPLE_RATE = 16000

    def __init__(
        self,
        callback: Callable[[AudioChunkEvent, bytes], None],
        chunk_duration: float = 30.0,
        sample_rate: int = 16000,
        channels: int = 1,
        transcribe: bool = False,
        whisper_model: str = "base",
    ) -> None:
        """Initialize continuous audio capture.

        Args:
            callback: Function called with (event, audio_bytes) for each chunk,
                where audio_bytes is the FLAC-encoded chunk.
            chunk_duration: Duration of each chunk in seconds.
            sample_rate: Audio sample rate.
            channels: Number of channels.
            transcribe: Whether to transcribe each chunk.
            whisper_model: Whisper model to use for transcription.

        Raises:
            ValueError: If chunk_duration or sample_rate is not positive.
            ImportError: If the audio dependencies (or Whisper, when
                transcribe is true) are not installed.
        """
        if chunk_duration <= 0 or sample_rate <= 0:
            # A non-positive chunk size would make the capture loop spin
            # without ever waiting.
            raise ValueError(
                "chunk_duration and sample_rate must be positive, got "
                f"chunk_duration={chunk_duration!r}, sample_rate={sample_rate!r}"
            )

        _import_audio_deps()

        self.callback = callback
        self.chunk_duration = chunk_duration
        self.sample_rate = sample_rate
        self.channels = channels
        self.transcribe = transcribe
        self.whisper_model = whisper_model

        self._recorder: AudioRecorder | None = None
        self._running = False
        self._thread: threading.Thread | None = None
        self._stop_event = threading.Event()
        # Unix timestamp of the first sample that has not been emitted yet.
        self._chunk_start_time: float | None = None

        if transcribe:
            _import_whisper()
            self._whisper_model = _whisper.load_model(whisper_model)
        else:
            self._whisper_model = None

    def _to_whisper_input(self, audio: "np.ndarray") -> "np.ndarray":
        """Convert a chunk to the 16 kHz mono float32 waveform Whisper needs.

        Args:
            audio: Chunk as produced by :meth:`_drain` - one-dimensional for
                mono, ``(frames, channels)`` otherwise.

        Returns:
            Mono float32 waveform at 16 kHz. Resampling uses linear
            interpolation without an anti-aliasing filter.
        """
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        audio = audio.astype(_np.float32)

        target_rate = self._WHISPER_SAMPLE_RATE
        if self.sample_rate == target_rate or len(audio) == 0:
            return audio

        out_len = int(round(len(audio) * target_rate / self.sample_rate))
        if out_len < 1:
            return audio[:0]
        src_times = _np.arange(len(audio)) / self.sample_rate
        dst_times = _np.arange(out_len) / target_rate
        return _np.interp(dst_times, src_times, audio).astype(_np.float32)

    def _drain(self, max_frames: int | None) -> "np.ndarray | None":
        """Atomically remove buffered frames from the recorder.

        Reading and trimming the recorder's buffer happen under a single
        acquisition of the recorder's lock, so frames appended by the audio
        callback in the meantime are never discarded.

        Args:
            max_frames: Number of frames to take. If fewer are buffered,
                nothing is removed and None is returned. None takes everything
                that is buffered.

        Returns:
            float32 array of the removed frames (one-dimensional for mono,
            ``(frames, channels)`` otherwise), or None if there is no recorder
            or not enough audio is buffered.
        """
        recorder = self._recorder
        if recorder is None:
            return None

        with recorder._lock:
            buffered_blocks = recorder._frames
            available = sum(len(block) for block in buffered_blocks)
            if available == 0:
                return None
            if max_frames is not None and available < max_frames:
                return None

            buffered = _np.concatenate(buffered_blocks, axis=0)
            if max_frames is None:
                taken, rest = buffered, buffered[:0]
            else:
                taken, rest = buffered[:max_frames], buffered[max_frames:]
            # Keep the remainder in the recorder's own (frames, channels)
            # layout so later callback buffers concatenate with it.
            recorder._frames = [rest] if len(rest) else []

        if taken.ndim == 2 and taken.shape[1] == 1:
            taken = taken[:, 0]
        return taken.astype(_np.float32)

    def _process_chunk(self, audio: "np.ndarray", start_time: float) -> None:
        """Process a single audio chunk and deliver it to the callback.

        Args:
            audio: Chunk samples (one-dimensional mono or
                ``(frames, channels)``).
            start_time: Unix timestamp of the chunk's first sample.
        """
        # Get transcription if enabled. Transcription is best-effort: a
        # failure is reported on stderr and the chunk is still delivered.
        transcription_text = None
        if self._whisper_model is not None:
            try:
                transcription = self._whisper_model.transcribe(
                    self._to_whisper_input(audio),
                    word_timestamps=True,
                    fp16=False,
                )
                transcription_text = transcription.get("text", "").strip()
            except Exception as exc:
                import sys

                print(f"Audio transcription failed: {exc}", file=sys.stderr)
                transcription_text = None

        # len() counts frames for both the 1-D and the 2-D layout.
        end_time = start_time + (len(audio) / self.sample_rate)

        event = AudioChunkEvent(
            timestamp=start_time,
            start_time=start_time,
            end_time=end_time,
            transcription=transcription_text,
        )

        # Convert to FLAC bytes
        buffer = io.BytesIO()
        _soundfile.write(buffer, audio, self.sample_rate, format="FLAC")
        audio_bytes = buffer.getvalue()

        self.callback(event, audio_bytes)

    def _capture_loop(self) -> None:
        """Main capture loop: emit every full chunk that has been buffered."""
        frames_per_chunk = max(1, int(self.chunk_duration * self.sample_rate))

        while not self._stop_event.is_set():
            # Wait for chunk duration (returns early when stop() is called)
            self._stop_event.wait(self.chunk_duration)

            while True:
                chunk_audio = self._drain(frames_per_chunk)
                if chunk_audio is None:
                    break

                # Advance the timestamp by the audio actually emitted instead
                # of re-reading the wall clock, so chunk times stay aligned
                # with the samples.
                chunk_start_time = self._chunk_start_time
                self._chunk_start_time = chunk_start_time + (
                    len(chunk_audio) / self.sample_rate
                )

                # Process in separate thread to not block capture
                threading.Thread(
                    target=self._process_chunk,
                    args=(chunk_audio, chunk_start_time),
                    daemon=True,
                ).start()

    def start(self) -> None:
        """Start continuous audio capture.

        Does nothing if capture is already running.
        """
        if self._running:
            return

        recorder = AudioRecorder(
            sample_rate=self.sample_rate,
            channels=self.channels,
        )
        recorder.start()
        self._recorder = recorder

        # Anchor chunk timestamps to the moment the recorder started.
        anchor = recorder.start_time
        self._chunk_start_time = anchor if anchor is not None else _get_timestamp()

        self._stop_event.clear()
        self._thread = threading.Thread(target=self._capture_loop, daemon=True)
        self._thread.start()
        self._running = True

    def stop(self) -> None:
        """Stop continuous audio capture and process remaining audio.

        The recorder is stopped before the remaining audio is drained so that
        no frames arrive after the final chunk has been cut. The final chunk
        is delivered to the callback synchronously.
        """
        if not self._running:
            return

        self._stop_event.set()
        if self._thread is not None:
            self._thread.join(timeout=2.0)
            self._thread = None

        recorder = self._recorder
        try:
            if recorder is not None:
                recorder.stop()

                # Process any remaining audio
                remaining = self._drain(None)
                if remaining is not None and len(remaining) > 0:
                    self._process_chunk(remaining, self._chunk_start_time)
        finally:
            # Leave the capture restartable even if the callback raised.
            self._recorder = None
            self._running = False

    def __enter__(self) -> "ContinuousAudioCapture":
        """Context manager entry: start capturing."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: stop capturing."""
        self.stop()
|