openadapt-capture 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ """OpenAdapt Capture - GUI interaction capture.
2
+
3
+ Platform-agnostic event streams with time-aligned media.
4
+ """
5
+
6
+ __version__ = "0.1.0"
7
+
8
+ # High-level APIs (primary interface)
9
+ from openadapt_capture.capture import Action, Capture, CaptureSession
10
+
11
+ # Frame comparison utilities
12
+ from openadapt_capture.comparison import (
13
+ ComparisonReport,
14
+ FrameComparison,
15
+ compare_frames,
16
+ compare_video_to_images,
17
+ plot_comparison,
18
+ )
19
+
20
+ # Event types
21
+ from openadapt_capture.events import (
22
+ ActionEvent,
23
+ AudioChunkEvent,
24
+ AudioEvent,
25
+ BaseEvent,
26
+ Event,
27
+ EventType,
28
+ KeyDownEvent,
29
+ KeyTypeEvent,
30
+ KeyUpEvent,
31
+ MouseButton,
32
+ MouseClickEvent,
33
+ MouseDoubleClickEvent,
34
+ MouseDownEvent,
35
+ MouseDragEvent,
36
+ MouseMoveEvent,
37
+ MouseScrollEvent,
38
+ MouseUpEvent,
39
+ ScreenEvent,
40
+ ScreenFrameEvent,
41
+ )
42
+
43
+ # Event processing
44
+ from openadapt_capture.processing import (
45
+ detect_drag_events,
46
+ get_action_events,
47
+ get_audio_events,
48
+ get_screen_events,
49
+ merge_consecutive_keyboard_events,
50
+ merge_consecutive_mouse_click_events,
51
+ merge_consecutive_mouse_move_events,
52
+ merge_consecutive_mouse_scroll_events,
53
+ process_events,
54
+ remove_invalid_keyboard_events,
55
+ remove_redundant_mouse_move_events,
56
+ )
57
+ from openadapt_capture.recorder import Recorder
58
+
59
+ # Performance statistics
60
+ from openadapt_capture.stats import (
61
+ CaptureStats,
62
+ PerfStat,
63
+ plot_capture_performance,
64
+ )
65
+ from openadapt_capture.storage import Capture as CaptureMetadata
66
+
67
+ # Storage (low-level)
68
+ from openadapt_capture.storage import (
69
+ CaptureStorage,
70
+ Stream,
71
+ create_capture,
72
+ load_capture,
73
+ )
74
+
75
+ # Visualization
76
+ from openadapt_capture.visualize import create_demo, create_html
77
+
78
+ __all__ = [
79
+ # Version
80
+ "__version__",
81
+ # High-level APIs
82
+ "Recorder",
83
+ "Capture",
84
+ "CaptureSession",
85
+ "Action",
86
+ # Event types
87
+ "EventType",
88
+ "MouseButton",
89
+ "BaseEvent",
90
+ "Event",
91
+ "ActionEvent",
92
+ "ScreenEvent",
93
+ "AudioEvent",
94
+ # Mouse events
95
+ "MouseMoveEvent",
96
+ "MouseDownEvent",
97
+ "MouseUpEvent",
98
+ "MouseScrollEvent",
99
+ "MouseClickEvent",
100
+ "MouseDoubleClickEvent",
101
+ "MouseDragEvent",
102
+ # Keyboard events
103
+ "KeyDownEvent",
104
+ "KeyUpEvent",
105
+ "KeyTypeEvent",
106
+ # Screen/audio events
107
+ "ScreenFrameEvent",
108
+ "AudioChunkEvent",
109
+ # Storage (low-level)
110
+ "CaptureMetadata",
111
+ "Stream",
112
+ "CaptureStorage",
113
+ "create_capture",
114
+ "load_capture",
115
+ # Processing
116
+ "process_events",
117
+ "remove_invalid_keyboard_events",
118
+ "remove_redundant_mouse_move_events",
119
+ "merge_consecutive_keyboard_events",
120
+ "merge_consecutive_mouse_move_events",
121
+ "merge_consecutive_mouse_scroll_events",
122
+ "merge_consecutive_mouse_click_events",
123
+ "detect_drag_events",
124
+ "get_action_events",
125
+ "get_screen_events",
126
+ "get_audio_events",
127
+ # Performance statistics
128
+ "CaptureStats",
129
+ "PerfStat",
130
+ "plot_capture_performance",
131
+ # Frame comparison
132
+ "ComparisonReport",
133
+ "FrameComparison",
134
+ "compare_frames",
135
+ "compare_video_to_images",
136
+ "plot_comparison",
137
+ # Visualization
138
+ "create_demo",
139
+ "create_html",
140
+ ]
@@ -0,0 +1,489 @@
1
+ """Audio capture and transcription.
2
+
3
+ This module provides audio recording with optional Whisper transcription,
4
+ following OpenAdapt's proven implementation.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import io
10
+ import threading
11
+ import time
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any, Callable
14
+
15
+ from openadapt_capture.events import AudioChunkEvent
16
+
17
+ if TYPE_CHECKING:
18
+ import numpy as np
19
+
20
+ # Optional dependencies - imported at runtime
21
+ _sounddevice = None
22
+ _soundfile = None
23
+ _whisper = None
24
+ _np = None
25
+
26
+
27
def _import_audio_deps() -> None:
    """Load numpy, sounddevice and soundfile on first use.

    The imported modules are cached in the module-level ``_np``,
    ``_sounddevice`` and ``_soundfile`` names; later calls return
    immediately once ``_sounddevice`` has been populated.

    Raises:
        ImportError: If any of the three packages cannot be imported.
    """
    global _sounddevice, _soundfile, _np

    # Already loaded by an earlier call - nothing to do.
    if _sounddevice is not None:
        return

    try:
        import numpy as np
        import sounddevice
        import soundfile
    except ImportError as e:
        raise ImportError(
            "Audio dependencies required. Install with: "
            "pip install sounddevice soundfile numpy"
        ) from e

    _sounddevice, _soundfile, _np = sounddevice, soundfile, np
44
+
45
+
46
def _import_whisper() -> None:
    """Load the ``whisper`` package on first use.

    The module is cached in the module-level ``_whisper`` name; later calls
    return immediately once it has been populated.

    Raises:
        ImportError: If ``whisper`` cannot be imported.
    """
    global _whisper

    # Already loaded by an earlier call - nothing to do.
    if _whisper is not None:
        return

    try:
        import whisper
    except ImportError as e:
        raise ImportError(
            "Whisper is required for transcription. Install with: "
            "pip install openai-whisper"
        ) from e

    _whisper = whisper
59
+
60
+
61
+ def _get_timestamp() -> float:
62
+ """Get current timestamp."""
63
+ return time.time()
64
+
65
+
66
+ # =============================================================================
67
+ # Audio Recorder
68
+ # =============================================================================
69
+
70
+
71
class AudioRecorder:
    """Records audio from the default microphone.

    Blocks of frames delivered by the sounddevice input stream are buffered
    in memory until they are read back with ``get_audio()``, written out as
    FLAC, or transcribed.

    Usage:
        recorder = AudioRecorder()
        recorder.start()
        # ... record for some time ...
        recorder.stop()

        # Get the recorded audio
        audio_data = recorder.get_audio()

        # Optionally transcribe
        result = recorder.transcribe()
    """

    # Whisper treats raw sample arrays as 16 kHz mono; it only resamples
    # audio that it loads from a file itself.
    _WHISPER_SAMPLE_RATE = 16000

    def __init__(
        self,
        sample_rate: int = 44100,
        channels: int = 1,
    ) -> None:
        """Initialize audio recorder.

        Args:
            sample_rate: Recording sample rate in Hz. ``transcribe()``
                converts the audio to 16 kHz before handing it to Whisper.
            channels: Number of audio channels (1 for mono).

        Raises:
            ImportError: If the audio dependencies are not installed.
        """
        _import_audio_deps()

        self.sample_rate = sample_rate
        self.channels = channels
        self._frames: list["np.ndarray"] = []
        self._stream = None
        self._running = False
        self._start_time: float | None = None
        # Guards _frames, which is appended to from the stream callback.
        self._lock = threading.Lock()

    def _audio_callback(
        self,
        indata: "np.ndarray",
        frames: int,
        time_info: Any,
        status: Any,
    ) -> None:
        """Stream callback: buffer a copy of the delivered block."""
        if status:
            # Report any audio issues (overflow, underflow)
            import sys

            print(f"Audio status: {status}", file=sys.stderr)
        with self._lock:
            # Copy because the caller's buffer is not ours to keep.
            self._frames.append(indata.copy())

    @property
    def start_time(self) -> float | None:
        """Get the recording start time (None before the first start)."""
        return self._start_time

    @property
    def is_recording(self) -> bool:
        """Check if currently recording."""
        return self._running

    def start(self) -> None:
        """Start recording audio, discarding anything buffered earlier.

        Does nothing if a recording is already in progress.
        """
        if self._running:
            return

        with self._lock:
            self._frames = []

        stream = _sounddevice.InputStream(
            callback=self._audio_callback,
            samplerate=self.sample_rate,
            channels=self.channels,
        )
        self._start_time = _get_timestamp()
        try:
            stream.start()
        except Exception:
            # Do not leak the device handle when the stream fails to start.
            stream.close()
            raise
        self._stream = stream
        self._running = True

    def stop(self) -> None:
        """Stop recording audio. Buffered audio stays available."""
        if not self._running:
            return

        stream, self._stream = self._stream, None
        # Mark as stopped up front so a failing stream.stop() cannot leave
        # the recorder stuck in the "running" state.
        self._running = False
        if stream is not None:
            try:
                stream.stop()
            finally:
                stream.close()

    def _stacked_frames(self) -> "np.ndarray":
        """Return the buffered audio as one float32 array, one row per frame."""
        with self._lock:
            if not self._frames:
                return _np.empty((0, self.channels), dtype=_np.float32)
            stacked = _np.concatenate(self._frames, axis=0)
        if stacked.ndim == 1:
            stacked = stacked.reshape(-1, 1)
        return stacked.astype(_np.float32)

    @staticmethod
    def _resample(
        samples: "np.ndarray",
        source_rate: int,
        target_rate: int,
    ) -> "np.ndarray":
        """Resample 1-D ``samples`` to ``target_rate`` by linear interpolation."""
        if source_rate == target_rate or len(samples) == 0:
            return samples.astype(_np.float32, copy=False)
        target_len = max(1, int(round(len(samples) * target_rate / source_rate)))
        source_times = _np.arange(len(samples)) / source_rate
        target_times = _np.arange(target_len) / target_rate
        resampled = _np.interp(target_times, source_times, samples)
        return resampled.astype(_np.float32)

    def get_audio(self) -> "np.ndarray":
        """Get the recorded audio as a flat numpy array.

        Returns:
            1-D float32 array of samples. With more than one channel the
            per-frame rows are flattened, so channels are interleaved.
        """
        return self._stacked_frames().flatten()

    def get_duration(self) -> float:
        """Get the duration of recorded audio in seconds.

        Counts frames rather than flattened samples, so the result does not
        scale with the number of channels.
        """
        with self._lock:
            frame_count = sum(len(block) for block in self._frames)
        return frame_count / self.sample_rate

    def save_flac(self, output_path: str | Path) -> None:
        """Save recorded audio to FLAC file.

        Args:
            output_path: Path to output FLAC file.
        """
        # Write the 2-D (frames, channels) array so the channel layout is kept.
        _soundfile.write(
            str(output_path),
            self._stacked_frames(),
            self.sample_rate,
            format="FLAC",
        )

    def get_flac_bytes(self) -> bytes:
        """Get recorded audio as FLAC-compressed bytes.

        Returns:
            FLAC-compressed audio data.
        """
        buffer = io.BytesIO()
        _soundfile.write(
            buffer,
            self._stacked_frames(),
            self.sample_rate,
            format="FLAC",
        )
        return buffer.getvalue()

    def transcribe(
        self,
        model_name: str = "base",
        word_timestamps: bool = True,
    ) -> dict[str, Any]:
        """Transcribe recorded audio using Whisper.

        The audio is averaged down to mono and resampled to 16 kHz first,
        because Whisper does not resample arrays it is given directly.

        Args:
            model_name: Whisper model to use (tiny, base, small, medium, large).
            word_timestamps: Whether to include word-level timestamps.

        Returns:
            Transcription result dict with 'text' and 'segments'. When
            nothing has been recorded, both are empty.

        Raises:
            ImportError: If Whisper is not installed.
        """
        _import_whisper()

        frames = self._stacked_frames()
        if len(frames) == 0:
            return {"text": "", "segments": []}

        mono = frames.mean(axis=1)
        audio = self._resample(mono, self.sample_rate, self._WHISPER_SAMPLE_RATE)

        model = _whisper.load_model(model_name)
        return model.transcribe(
            audio,
            word_timestamps=word_timestamps,
            fp16=False,  # Use float32 for CPU compatibility
        )

    def __enter__(self) -> "AudioRecorder":
        """Context manager entry: start recording."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: stop recording."""
        self.stop()
235
+
236
+
237
+ # =============================================================================
238
+ # Audio Chunk Generator
239
+ # =============================================================================
240
+
241
+
242
class AudioChunkGenerator:
    """Generates AudioChunkEvents from recorded audio.

    Splits audio into fixed-duration chunks for storage and processing and,
    when a transcription is supplied, attaches each chunk's words to it.
    """

    def __init__(
        self,
        chunk_duration: float = 30.0,
        sample_rate: int = 16000,
    ) -> None:
        """Initialize chunk generator.

        Args:
            chunk_duration: Duration of each chunk in seconds.
            sample_rate: Sample rate of the audio passed to
                ``generate_chunks()``, in Hz.
        """
        self.chunk_duration = chunk_duration
        self.sample_rate = sample_rate

    def _group_words(
        self,
        transcription: dict[str, Any] | None,
        samples_per_chunk: int,
        chunk_count: int,
    ) -> dict[int, list[str]] | None:
        """Bucket transcribed words by the index of the chunk they start in.

        Returns None when there is no transcription with segments.
        """
        if not transcription or not transcription.get("segments"):
            return None

        grouped: dict[int, list[str]] = {}
        if chunk_count == 0:
            return grouped

        # Use the real (truncated) chunk length so the buckets line up with
        # the chunk boundaries computed from whole samples.
        chunk_seconds = samples_per_chunk / self.sample_rate
        for segment in transcription["segments"]:
            for word in segment.get("words") or ():
                text = word["word"].strip()
                if not text:
                    continue
                index = int(word["start"] // chunk_seconds)
                # Clamp so timestamps slightly outside the audio still land
                # in the first or last chunk instead of being dropped.
                index = min(max(index, 0), chunk_count - 1)
                grouped.setdefault(index, []).append(text)
        return grouped

    def generate_chunks(
        self,
        audio: "np.ndarray",
        start_time: float,
        transcription: dict[str, Any] | None = None,
    ) -> list[AudioChunkEvent]:
        """Generate AudioChunkEvents from audio data.

        Each word of the transcription is assigned to exactly one chunk: the
        one containing the word's start time. Words are stripped and joined
        with single spaces.

        Args:
            audio: Audio data; only its length is used.
            start_time: Timestamp of the audio start, in seconds.
            transcription: Optional transcription result with 'segments'
                whose entries may carry a 'words' list of dicts with
                'word' and 'start' keys ('start' in seconds from the
                beginning of the audio).

        Returns:
            List of AudioChunkEvent objects, one per chunk, in order. A
            chunk's transcription is None when no words fall into it or no
            transcription was given.

        Raises:
            ValueError: If chunk_duration * sample_rate is less than one
                sample, which would otherwise never advance through the
                audio.
        """
        samples_per_chunk = int(self.chunk_duration * self.sample_rate)
        if samples_per_chunk <= 0:
            raise ValueError(
                "chunk_duration * sample_rate must be at least one sample"
            )

        total_samples = len(audio)
        # Ceiling division: a trailing partial chunk still gets an event.
        chunk_count = -(-total_samples // samples_per_chunk)
        words_by_chunk = self._group_words(
            transcription, samples_per_chunk, chunk_count
        )

        events = []
        for chunk_idx in range(chunk_count):
            start_sample = chunk_idx * samples_per_chunk
            end_sample = min(start_sample + samples_per_chunk, total_samples)

            chunk_start_time = start_time + (start_sample / self.sample_rate)
            chunk_end_time = start_time + (end_sample / self.sample_rate)

            chunk_text = None
            if words_by_chunk is not None:
                chunk_words = words_by_chunk.get(chunk_idx)
                chunk_text = " ".join(chunk_words) if chunk_words else None

            events.append(
                AudioChunkEvent(
                    timestamp=chunk_start_time,
                    start_time=chunk_start_time,
                    end_time=chunk_end_time,
                    transcription=chunk_text,
                )
            )

        return events
316
+
317
+
318
+ # =============================================================================
319
+ # Continuous Audio Capture
320
+ # =============================================================================
321
+
322
+
323
class ContinuousAudioCapture:
    """Continuous audio capture with streaming transcription support.

    For long captures, provides periodic callbacks with audio chunks.
    Full-length chunks are handed to the callback from short-lived worker
    threads; the final, possibly shorter, chunk is delivered from the thread
    that calls ``stop()``.

    Usage:
        def on_chunk(event, audio_bytes):
            print(f"Chunk: {event.start_time} - {event.end_time}")

        capture = ContinuousAudioCapture(on_chunk, chunk_duration=30.0)
        capture.start()
        # ... capture audio ...
        capture.stop()
    """

    # Whisper treats raw sample arrays as 16 kHz mono.
    _WHISPER_SAMPLE_RATE = 16000

    def __init__(
        self,
        callback: Callable[[AudioChunkEvent, bytes], None],
        chunk_duration: float = 30.0,
        sample_rate: int = 16000,
        channels: int = 1,
        transcribe: bool = False,
        whisper_model: str = "base",
    ) -> None:
        """Initialize continuous audio capture.

        Args:
            callback: Function called with (event, audio_bytes) for each
                chunk; audio_bytes is the chunk encoded as FLAC.
            chunk_duration: Duration of each chunk in seconds.
            sample_rate: Audio sample rate in Hz.
            channels: Number of channels.
            transcribe: Whether to transcribe each chunk.
            whisper_model: Whisper model to use for transcription.

        Raises:
            ImportError: If the audio dependencies (or Whisper, when
                ``transcribe`` is set) are not installed.
        """
        _import_audio_deps()

        self.callback = callback
        self.chunk_duration = chunk_duration
        self.sample_rate = sample_rate
        self.channels = channels
        self.transcribe = transcribe
        self.whisper_model = whisper_model

        self._recorder: AudioRecorder | None = None
        self._running = False
        self._thread: threading.Thread | None = None
        self._stop_event = threading.Event()
        # Timestamp of the first frame that has not been emitted yet.
        self._next_chunk_start: float | None = None

        if transcribe:
            _import_whisper()
            # Load once up front so every chunk reuses the same model.
            self._whisper_model = _whisper.load_model(whisper_model)
        else:
            self._whisper_model = None

    def _to_whisper_input(self, audio: "np.ndarray") -> "np.ndarray":
        """Convert a chunk to the 1-D, 16 kHz float32 form Whisper assumes."""
        mono = audio if audio.ndim == 1 else audio.mean(axis=1)
        mono = mono.astype(_np.float32, copy=False)
        target_rate = self._WHISPER_SAMPLE_RATE
        if self.sample_rate == target_rate or len(mono) == 0:
            return mono
        # Linear interpolation onto the 16 kHz time grid.
        target_len = max(1, int(round(len(mono) * target_rate / self.sample_rate)))
        source_times = _np.arange(len(mono)) / self.sample_rate
        target_times = _np.arange(target_len) / target_rate
        return _np.interp(target_times, source_times, mono).astype(_np.float32)

    def _process_chunk(self, audio: "np.ndarray", start_time: float) -> None:
        """Transcribe (if enabled), FLAC-encode and deliver one chunk.

        Args:
            audio: Chunk samples, either 1-D or shaped (frames, channels).
            start_time: Timestamp of the chunk's first frame.
        """
        transcription_text = None
        if self._whisper_model is not None:
            try:
                result = self._whisper_model.transcribe(
                    self._to_whisper_input(audio),
                    word_timestamps=True,
                    fp16=False,
                )
                transcription_text = result.get("text", "").strip()
            except Exception as exc:
                # Transcription is best-effort: still deliver the audio, but
                # report the failure instead of hiding it.
                import sys

                print(f"Audio transcription failed: {exc}", file=sys.stderr)

        # len() counts frames for both 1-D and (frames, channels) input.
        end_time = start_time + (len(audio) / self.sample_rate)

        event = AudioChunkEvent(
            timestamp=start_time,
            start_time=start_time,
            end_time=end_time,
            transcription=transcription_text,
        )

        # Convert to FLAC bytes
        buffer = io.BytesIO()
        _soundfile.write(buffer, audio, self.sample_rate, format="FLAC")
        audio_bytes = buffer.getvalue()

        self.callback(event, audio_bytes)

    def _take_chunk(
        self,
        recorder: AudioRecorder,
        frames_per_chunk: int,
    ) -> "np.ndarray | None":
        """Remove and return one full chunk from the recorder's buffer.

        The snapshot and the buffer rewrite happen under a single hold of
        the recorder's lock, so frames appended by the stream callback in
        the meantime cannot be lost. Returns None when less than a full
        chunk is buffered.
        """
        with recorder._lock:
            if not recorder._frames:
                return None
            buffered = _np.concatenate(recorder._frames, axis=0)
            if len(buffered) < frames_per_chunk:
                return None
            leftover = buffered[frames_per_chunk:]
            # Keep the leftover in its original (frames, channels) layout so
            # it concatenates cleanly with blocks the callback adds later.
            recorder._frames = [leftover] if len(leftover) > 0 else []
            # astype() copies, so the chunk does not alias the leftover.
            return buffered[:frames_per_chunk].astype(_np.float32)

    def _capture_loop(self) -> None:
        """Main capture loop: wake once per chunk duration and emit chunks."""
        # At least one frame per chunk so draining always makes progress.
        frames_per_chunk = max(1, int(self.chunk_duration * self.sample_rate))

        while not self._stop_event.is_set():
            # Wait for chunk duration (returns early when stop is requested)
            self._stop_event.wait(self.chunk_duration)

            recorder = self._recorder
            if recorder is None:
                continue

            while True:
                chunk = self._take_chunk(recorder, frames_per_chunk)
                if chunk is None:
                    break

                # Derive timestamps from the number of frames emitted so
                # consecutive chunks are contiguous and free of scheduling
                # jitter.
                chunk_start = self._next_chunk_start
                self._next_chunk_start = chunk_start + (
                    len(chunk) / self.sample_rate
                )

                # Process in separate thread to not block capture
                threading.Thread(
                    target=self._process_chunk,
                    args=(chunk, chunk_start),
                    daemon=True,
                ).start()

    def start(self) -> None:
        """Start continuous audio capture. Does nothing if already running."""
        if self._running:
            return

        recorder = AudioRecorder(
            sample_rate=self.sample_rate,
            channels=self.channels,
        )
        recorder.start()
        self._recorder = recorder
        self._next_chunk_start = recorder.start_time

        self._stop_event.clear()
        self._thread = threading.Thread(target=self._capture_loop, daemon=True)
        self._thread.start()
        self._running = True

    def stop(self) -> None:
        """Stop continuous audio capture and process remaining audio.

        Any audio that has not yet been emitted is delivered to the
        callback as one final chunk. Exceptions raised by the callback for
        that chunk propagate, but only after the recorder has been stopped.
        """
        if not self._running:
            return

        self._stop_event.set()
        if self._thread is not None:
            self._thread.join(timeout=2.0)
            self._thread = None

        recorder, self._recorder = self._recorder, None
        self._running = False
        if recorder is None:
            return

        # Stop the stream first so no frames arrive after the final snapshot
        # and the device is released even if the callback below raises.
        recorder.stop()

        with recorder._lock:
            pending = recorder._frames
            recorder._frames = []
        if not pending:
            return

        remainder = _np.concatenate(pending, axis=0).astype(_np.float32)
        if len(remainder) > 0:
            start_time = self._next_chunk_start
            if start_time is None:
                start_time = recorder.start_time
            self._next_chunk_start = start_time + (
                len(remainder) / self.sample_rate
            )
            self._process_chunk(remainder, start_time)

    def __enter__(self) -> "ContinuousAudioCapture":
        """Context manager entry: start capturing."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit: stop capturing."""
        self.stop()