escribano 0.1.4 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "torch>=2.0",
6
+ # "soundfile",
7
+ # "numpy",
8
+ # "silero-vad",
9
+ # ]
10
+ # ///
11
+ """
12
+ Audio Preprocessor - Silero VAD for speech segment extraction.
13
+ Uses soundfile for I/O to avoid torchaudio/torchcodec native dependency issues.
14
+
15
+ Usage:
16
+ uv run audio_preprocessor.py --audio /path/to/audio.wav --output-dir /tmp/segments --output-json /path/to/segments.json
17
+ """
18
+
19
+ import argparse
20
+ import json
21
+ import os
22
+ from pathlib import Path
23
+ import torch
24
+ import soundfile as sf
25
+ import numpy as np
26
+
27
def parse_args():
    """Parse the command-line options of the audio preprocessor.

    Returns:
        argparse.Namespace carrying the three required ``Path`` options
        (``audio``, ``output_dir``, ``output_json``) and the VAD tuning
        options (``threshold``, ``min_speech_duration_ms``,
        ``min_silence_duration_ms``).
    """
    cli = argparse.ArgumentParser(description="Audio Preprocessor with Silero VAD")

    # Required locations: (flag, help text). All are parsed as Path objects.
    required_paths = (
        ("--audio", "Path to input audio file"),
        ("--output-dir", "Directory to save segment WAV files"),
        ("--output-json", "Path to save segments manifest JSON"),
    )
    for flag, description in required_paths:
        cli.add_argument(flag, type=Path, required=True, help=description)

    # Optional VAD parameters: (flag, value type, default, help text).
    tunables = (
        ("--threshold", float, 0.5, "VAD threshold (default: 0.5)"),
        ("--min-speech-duration-ms", int, 250, "Min speech duration in ms"),
        ("--min-silence-duration-ms", int, 1000, "Min silence duration in ms"),
    )
    for flag, caster, fallback, description in tunables:
        cli.add_argument(flag, type=caster, default=fallback, help=description)

    return cli.parse_args()
36
+
37
def read_audio_sf(path: str, sampling_rate: int = 16000):
    """Load an audio file as a mono float32 torch tensor.

    Multi-channel input is down-mixed by averaging across axis 1. No
    resampling is performed: the file is expected to have been converted
    (e.g. by ffmpeg) to ``sampling_rate`` beforehand.

    Args:
        path: Location of the audio file, read with ``soundfile``.
        sampling_rate: Sample rate in Hz that the file must already have.

    Returns:
        ``torch.Tensor`` of dtype float32 holding the (down-mixed) samples.

    Raises:
        ValueError: If the file's sample rate differs from ``sampling_rate``.
            Previously the mismatch was ignored, which makes any later
            conversion of sample indices to seconds (and any file written
            back at ``sampling_rate``) silently wrong.
    """
    wav, sr = sf.read(path)

    # Fail loudly instead of pretending the audio is at the expected rate.
    if sr != sampling_rate:
        raise ValueError(
            f"Unexpected sample rate for {path}: got {sr} Hz, expected "
            f"{sampling_rate} Hz. Convert the file first "
            f"(e.g. ffmpeg -ar {sampling_rate})."
        )

    # More than one dimension means one column per channel: average to mono.
    if wav.ndim > 1:
        wav = np.mean(wav, axis=1)

    return torch.from_numpy(wav.astype(np.float32))
46
+
47
def main():
    """Run Silero VAD over the input audio and export every speech segment.

    Steps: parse the CLI, load the Silero VAD model through ``torch.hub``,
    read the audio with ``read_audio_sf``, detect speech timestamps, write
    each detected span to ``segment_NNNN.wav`` inside the output directory,
    and finally write a JSON manifest listing ``start``/``end`` (seconds)
    and ``audioPath`` for every segment.

    Returns:
        Process exit code: 0 on success, 1 when the input file is missing.
    """
    args = parse_args()

    if not args.audio.exists():
        print(f"Error: Audio file not found: {args.audio}")
        return 1

    # Create both output locations up front. The manifest's parent directory
    # was previously never created, so a fresh path only failed at the very
    # end, after all the model work and segment writing had been done.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.output_json.parent.mkdir(parents=True, exist_ok=True)

    # Load Silero VAD model
    model, utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad',
        model='silero_vad',
        force_reload=False,
        onnx=False,
    )

    # Only the first helper is needed; indexing avoids depending on the
    # exact number of helpers the hub entry point returns.
    get_speech_timestamps = utils[0]

    # Load audio
    sampling_rate = 16000
    wav = read_audio_sf(str(args.audio), sampling_rate=sampling_rate)

    # Get speech timestamps (start/end are used below as sample indices)
    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        sampling_rate=sampling_rate,
        threshold=args.threshold,
        min_speech_duration_ms=args.min_speech_duration_ms,
        min_silence_duration_ms=args.min_silence_duration_ms,
    )

    segments = []

    for i, ts in enumerate(speech_timestamps):
        # Convert sample indices to seconds for the manifest.
        start_sec = ts['start'] / sampling_rate
        end_sec = ts['end'] / sampling_rate

        # Extract segment
        segment_wav = wav[ts['start']:ts['end']].numpy()

        # Save segment to WAV using soundfile
        segment_path = args.output_dir / f"segment_{i:04d}.wav"
        sf.write(str(segment_path), segment_wav, sampling_rate)

        segments.append({
            "start": float(start_sec),
            "end": float(end_sec),
            "audioPath": str(segment_path),
        })

    # Write manifest
    with open(args.output_json, "w", encoding="utf-8") as f:
        json.dump(segments, f, indent=2)

    print(f"Extracted {len(segments)} speech segments to {args.output_dir}")
    print(f"Manifest written to {args.output_json}")

    return 0
107
+
108
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the ``exit`` helper is installed by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (e.g. under ``python -S``).
    raise SystemExit(main())
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Visual Observer Base - OCR + CLIP indexing for screen recordings.
4
+
5
+ Usage:
6
+ uv run visual_observer_base.py --frames-dir /path/to/frames --output /path/to/visual-index.json
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import os
12
+ import time
13
+ from concurrent.futures import ProcessPoolExecutor, as_completed
14
+ from pathlib import Path
15
+ from typing import TypedDict
16
+
17
+ import open_clip
18
+ import pytesseract
19
+ import torch
20
+ from PIL import Image
21
+ from sklearn.cluster import AgglomerativeClustering
22
+
23
+
24
+ # Type definitions
25
class FrameData(TypedDict):
    """One entry of the ``frames`` list in the visual index JSON."""

    # Position of the frame in the sorted list of frame files.
    index: int
    # Seconds from the first frame (frame position * frame interval).
    timestamp: float
    # Path of the frame image, stored as a string.
    imagePath: str
    # Text extracted from the frame by OCR; empty when OCR failed.
    ocrText: str
    # Cluster label of the frame; -1 until clustering has run.
    clusterId: int
    # Currently always 0.0 (pixel-delta scoring is still a TODO).
    changeScore: float
32
+
33
+
34
class ClusterData(TypedDict):
    """One entry of the ``clusters`` list in the visual index JSON."""

    # Cluster label shared by all member frames.
    id: int
    # Category label inferred for the cluster's representative frame.
    heuristicLabel: str
    # (earliest, latest) timestamp among the member frames, in seconds.
    timeRange: tuple[float, float]
    # Number of frames in the cluster.
    frameCount: int
    # ``index`` of the frame chosen as representative.
    representativeIdx: int
    # Mean OCR text length over the member frames.
    avgOcrCharacters: float
    # Media hints found in the members' OCR text.
    mediaIndicators: list[str]
42
+
43
+
44
class VisualIndex(TypedDict):
    """Top-level document written to the output JSON file."""

    # Per-frame records, in frame order.
    frames: list[FrameData]
    # Per-cluster summaries.
    clusters: list[ClusterData]
    # Phase name -> elapsed milliseconds.
    processingTime: dict[str, int]
48
+
49
+
50
# Constants

# Run CLIP on the MPS backend (Apple Silicon) when torch reports it as
# available; otherwise stay on the CPU.
DEVICE = "cpu"
if torch.backends.mps.is_available():
    DEVICE = "mps"

CLIP_MODEL = "ViT-B-32"
CLIP_PRETRAINED = "laion2b_s34b_b79k"

# Distance threshold used for clustering: 1 - 0.85 similarity.
CLUSTER_DISTANCE_THRESHOLD = 0.15

# (short label, zero-shot prompt) pairs. Keeping each label next to its
# prompt guarantees the two lists derived below stay index-aligned.
_UI_CATEGORY_TABLE = (
    ("code-editor", "A screenshot of a code editor showing programming code"),
    ("terminal", "A screenshot of a terminal with command line interface"),
    ("browser", "A screenshot of a web browser showing a website"),
    ("video-player", "A screenshot of a video player with playback controls"),
    ("document", "A screenshot of a document or PDF viewer"),
    ("image-viewer", "A screenshot of an image viewer or photo application"),
    ("chat", "A screenshot of a chat or messaging application"),
    ("file-manager", "A screenshot of a file manager or finder window"),
)

# Prompts handed to the CLIP tokenizer, and the label reported for each.
UI_CATEGORIES = [prompt for _, prompt in _UI_CATEGORY_TABLE]
CATEGORY_LABELS = [label for label, _ in _UI_CATEGORY_TABLE]
78
+
79
+
80
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the visual observer.

    Returns:
        Namespace with ``frames_dir`` and ``output`` (required ``Path``
        values), ``frame_interval`` (float seconds, default 2.0) and
        ``workers`` (int, defaults to the CPU count).
    """
    parser = argparse.ArgumentParser(description="Visual Observer Base")
    parser.add_argument("--frames-dir", type=Path, required=True,
                        help="Directory containing the extracted frame images")
    parser.add_argument("--output", type=Path, required=True,
                        help="Path of the visual index JSON to write")
    parser.add_argument("--frame-interval", type=float, default=2.0,
                        help="Seconds between frames (default: 2)")
    # os.cpu_count() returns None when the count cannot be determined; fall
    # back to a single worker so the default is always a usable integer.
    parser.add_argument("--workers", type=int, default=os.cpu_count() or 1,
                        help="Number of parallel OCR workers (default: CPU count)")
    return parser.parse_args()
89
+
90
+
91
def load_frames(frames_dir: Path, frame_interval: float) -> list[tuple[int, float, Path]]:
    """List the ``*.jpg`` frames of a directory with their timestamps.

    Files are ordered by sorting their paths, which is chronological for
    zero-padded names such as ``scene_0001.jpg``.

    Args:
        frames_dir: Directory containing frame images.
        frame_interval: Seconds between consecutive frames (with 2.0,
            frame 0 is at 0s and frame 1 at 2s).

    Returns:
        ``(index, timestamp, path)`` tuples in sorted order; empty when the
        directory holds no ``*.jpg`` files.
    """
    ordered_paths = sorted(frames_dir.glob("*.jpg"))
    return [
        (position, position * frame_interval, jpg_path)
        for position, jpg_path in enumerate(ordered_paths)
    ]
108
+
109
+
110
def extract_ocr(image_path: Path) -> str:
    """Extract text from an image using Tesseract.

    Uses PSM 11 (sparse text), which works better for UI screenshots where
    text is scattered across the screen (menus, buttons, tabs, URLs).

    Args:
        image_path: Path of the image to run OCR on.

    Returns:
        The recognised text with surrounding whitespace stripped, or an
        empty string when opening the image or running OCR fails (a warning
        is printed in that case).
    """
    # PSM 11: Sparse text - finds text scattered anywhere (UI elements)
    # OEM 3: Default OCR engine mode (LSTM if available)
    custom_config = r'--psm 11 --oem 3'
    try:
        # The context manager closes the image file as soon as OCR is done;
        # previously the handle was left open until garbage collection.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, config=custom_config)
        return text.strip()
    except Exception as e:
        # Best effort: a single unreadable frame must not abort the run.
        print(f" Warning: OCR failed for {image_path.name}: {e}")
        return ""
126
+
127
+
128
def extract_ocr_parallel(
    frames: list[tuple[int, float, Path]],
    max_workers: int
) -> dict[int, str]:
    """Run ``extract_ocr`` over all frames in a process pool.

    Args:
        frames: ``(index, timestamp, path)`` tuples.
        max_workers: Size of the process pool.

    Returns:
        Mapping of frame index to OCR text. A frame whose task raised maps
        to an empty string.
    """
    frame_count = len(frames)
    # Report about every 10% of the work (every frame for tiny inputs).
    report_every = max(1, frame_count // 10)
    texts_by_index = {}

    print(f" Using {max_workers} parallel workers...")

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        # Queue everything first, remembering which frame each future serves.
        pending = {}
        for frame_index, _, frame_path in frames:
            pending[pool.submit(extract_ocr, frame_path)] = frame_index

        # Harvest in completion order, not submission order.
        for done_count, finished in enumerate(as_completed(pending), start=1):
            frame_index = pending[finished]
            try:
                text = finished.result()
            except Exception as e:
                print(f" Warning: OCR failed for frame {frame_index}: {e}")
                text = ""
            texts_by_index[frame_index] = text

            if done_count % report_every == 0:
                pct = (done_count / frame_count) * 100
                print(f" OCR progress: {done_count}/{frame_count} ({pct:.0f}%)")

    return texts_by_index
170
+
171
+
172
def compute_clip_embeddings(
    frames: list[tuple[int, float, Path]],
    model,
    preprocess,
) -> torch.Tensor:
    """Compute unit-length CLIP image embeddings, one row per frame.

    Args:
        frames: ``(index, timestamp, path)`` tuples; only the path is used.
        model: Object exposing ``encode_image``.
        preprocess: Callable turning a PIL image into a tensor for ``model``.

    Returns:
        Tensor with one row per input frame, in input order. A frame that
        cannot be loaded or encoded contributes an all-zero row so row ``i``
        always corresponds to ``frames[i]``. With no frames the result has
        shape ``(0, 512)``.
    """
    # Width of the fallback rows. NOTE(review): must equal the width that
    # model.encode_image produces for torch.cat to succeed - confirm if the
    # CLIP model is ever changed.
    embedding_dim = 512

    embeddings = []

    # Inference only: no autograd bookkeeping needed for any frame.
    with torch.no_grad():
        for _, _, path in frames:
            try:
                # Close the image file as soon as it has been converted to
                # a tensor; previously the handle stayed open until GC.
                with Image.open(path) as img:
                    image = preprocess(img).unsqueeze(0).to(DEVICE)

                embedding = model.encode_image(image)
                embedding = embedding / embedding.norm(dim=-1, keepdim=True)

                embeddings.append(embedding.cpu())
            except Exception as e:
                print(f" Warning: CLIP embedding failed for {path.name}: {e}")
                # Use zero vector as fallback to maintain alignment
                embeddings.append(torch.zeros((1, embedding_dim)))

    if not embeddings:
        return torch.zeros((0, embedding_dim))

    return torch.cat(embeddings, dim=0)
198
+
199
+
200
def cluster_frames(embeddings: torch.Tensor) -> list[int]:
    """Group frames whose CLIP embeddings are similar.

    Args:
        embeddings: One embedding row per frame.

    Returns:
        One cluster label per row. With fewer than two rows no clustering
        is run and every row (if any) gets label 0.
    """
    frame_total = len(embeddings)

    if frame_total >= 2:
        # n_clusters=None: the number of clusters is determined by the
        # distance threshold rather than fixed in advance.
        grouper = AgglomerativeClustering(
            n_clusters=None,  # type: ignore
            distance_threshold=CLUSTER_DISTANCE_THRESHOLD,
            metric="cosine",
            linkage="average",
        )
        assigned = grouper.fit_predict(embeddings.numpy())
        return assigned.tolist()

    return [0] * frame_total
214
+
215
+
216
def infer_label_with_clip(
    image_path: Path,
    model,
    preprocess,
    tokenizer,
) -> str:
    """Use CLIP zero-shot classification to assign a UI category to a frame.

    The image is compared against every prompt in ``UI_CATEGORIES`` and the
    label at the best-scoring position of ``CATEGORY_LABELS`` is returned.

    Args:
        image_path: Frame to classify.
        model: Object exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable turning a PIL image into a tensor for ``model``.
        tokenizer: Callable turning the prompt list into text tokens.

    Returns:
        One of ``CATEGORY_LABELS``, or ``"unknown"`` when anything in the
        pipeline raises (a warning is printed in that case).
    """
    try:
        # Close the image file once it has been converted to a tensor;
        # previously the handle stayed open until garbage collection.
        with Image.open(image_path) as img:
            image = preprocess(img).unsqueeze(0).to(DEVICE)
        text_tokens = tokenizer(UI_CATEGORIES).to(DEVICE)

        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text_tokens)

            # Normalise both sides so the dot product is a cosine similarity.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            best_idx = similarity.argmax().item()

        return CATEGORY_LABELS[best_idx]
    except Exception as e:
        # Best effort: an unclassifiable frame yields a placeholder label.
        print(f" Warning: Zero-shot classification failed for {image_path.name}: {e}")
        return "unknown"
241
+
242
+
243
def detect_media_indicators(ocr_text: str) -> list[str]:
    """Report hints in OCR text that a frame shows media content.

    Matching is a case-insensitive substring search.

    Args:
        ocr_text: Text extracted from a frame.

    Returns:
        Matching indicators in a fixed order: ``"youtube"``, ``"vimeo"``,
        ``"netflix"``, then ``"image-file"`` (at most once) when any known
        image extension appears in the text.
    """
    haystack = ocr_text.lower()

    # Video platforms
    found = [site for site in ("youtube", "vimeo", "netflix") if site in haystack]

    # Image files: one hit on any extension is enough.
    for suffix in (".png", ".jpg", ".jpeg", ".gif", ".webp"):
        if suffix in haystack:
            found.append("image-file")
            break

    # TODO: Expand patterns based on real-world testing:
    # - Video platforms: Twitch, Disney+
    # - Image formats: .svg, .bmp
    # - Media players: VLC, QuickTime, IINA, mpv
    # - Streaming: Spotify, Apple Music, SoundCloud
    # - Social media: Twitter/X, Instagram, TikTok

    return found
275
+
276
+
277
def build_cluster_metadata(
    frames_data: list[FrameData],
    cluster_labels: list[int],
    model,
    preprocess,
    tokenizer,
) -> list[ClusterData]:
    """Summarise every cluster of frames.

    Args:
        frames_data: Per-frame records.
        cluster_labels: Cluster label for each entry of ``frames_data``,
            paired positionally.
        model, preprocess, tokenizer: CLIP objects forwarded to
            ``infer_label_with_clip`` to label each cluster.

    Returns:
        One summary per distinct label, in order of first appearance.
        ``mediaIndicators`` is sorted so the output is identical from run
        to run (iterating a set of strings has no stable order).
    """
    # Group frames by label; dict insertion order keeps first-seen order.
    members_by_cluster: dict[int, list[FrameData]] = {}
    for frame, label in zip(frames_data, cluster_labels):
        members_by_cluster.setdefault(label, []).append(frame)

    result = []
    # "members" rather than "cluster_frames" so the module-level function of
    # that name is not shadowed inside this function.
    for cluster_id, members in members_by_cluster.items():
        # Find representative (middle frame)
        representative = members[len(members) // 2]

        # Compute average OCR characters
        avg_chars = sum(len(f["ocrText"]) for f in members) / len(members)

        # Get time range
        timestamps = [f["timestamp"] for f in members]
        time_range = (float(min(timestamps)), float(max(timestamps)))

        # Aggregate media indicators across all member frames
        all_indicators = set()
        for f in members:
            all_indicators.update(detect_media_indicators(f["ocrText"]))

        # Infer label using CLIP on representative
        rep_path = Path(representative["imagePath"])
        label = infer_label_with_clip(rep_path, model, preprocess, tokenizer)

        result.append({
            "id": cluster_id,
            "heuristicLabel": label,
            "timeRange": time_range,
            "frameCount": len(members),
            "representativeIdx": representative["index"],
            "avgOcrCharacters": avg_chars,
            "mediaIndicators": sorted(all_indicators),
        })

    return result
324
+
325
+
326
def main():
    """Build the visual index for a directory of frames and write it as JSON.

    Phases: OCR over all frames in parallel, CLIP embeddings, clustering of
    the embeddings, and per-cluster metadata. The elapsed time of the first
    three phases and of the whole run is recorded in milliseconds under
    ``processingTime``.

    Returns:
        Process exit code: 0 on success, 1 when no frames were found.
    """
    args = parse_args()

    print(f"Loading frames from {args.frames_dir}...")
    frames = load_frames(args.frames_dir, args.frame_interval)

    if not frames:
        print("Error: No frames found")
        return 1

    print(f"Found {len(frames)} frames")

    # Initialize timing. perf_counter is monotonic, so the measured durations
    # cannot be skewed (or go negative) if the system clock is adjusted
    # during a long run, unlike time.time().
    timing = {"ocrMs": 0, "clipMs": 0, "clusterMs": 0, "totalMs": 0}
    total_start = time.perf_counter()

    # Phase 1: OCR (Parallel)
    print(f"Phase 1: Extracting text with OCR ({args.workers} workers)...")
    ocr_start = time.perf_counter()

    ocr_results = extract_ocr_parallel(frames, args.workers)

    frames_data: list[FrameData] = []
    for idx, timestamp, path in frames:
        frames_data.append({
            "index": idx,
            "timestamp": timestamp,
            "imagePath": str(path),
            "ocrText": ocr_results.get(idx, ""),
            "clusterId": -1,  # Set later
            "changeScore": 0.0,  # TODO: Implement pixel delta if needed
        })

    timing["ocrMs"] = int((time.perf_counter() - ocr_start) * 1000)
    print(f" OCR complete: {timing['ocrMs']}ms")

    # Phase 2: CLIP embeddings (the timing includes loading the model)
    print(f"Phase 2: Computing CLIP embeddings on {DEVICE}...")
    clip_start = time.perf_counter()

    model, _, preprocess = open_clip.create_model_and_transforms(
        CLIP_MODEL, pretrained=CLIP_PRETRAINED
    )
    model.eval()
    model.to(DEVICE)
    tokenizer = open_clip.get_tokenizer(CLIP_MODEL)

    embeddings = compute_clip_embeddings(frames, model, preprocess)
    timing["clipMs"] = int((time.perf_counter() - clip_start) * 1000)
    print(f" CLIP complete: {timing['clipMs']}ms")

    # Phase 3: Clustering
    print("Phase 3: Clustering frames...")
    cluster_start = time.perf_counter()

    cluster_labels = cluster_frames(embeddings)

    # Update frames with cluster IDs
    for frame, label in zip(frames_data, cluster_labels):
        frame["clusterId"] = label

    timing["clusterMs"] = int((time.perf_counter() - cluster_start) * 1000)
    print(f" Clustering complete: {timing['clusterMs']}ms")

    # Phase 4: Build cluster metadata
    print("Phase 4: Building cluster metadata...")
    clusters = build_cluster_metadata(
        frames_data, cluster_labels, model, preprocess, tokenizer
    )
    print(f" Found {len(clusters)} clusters")

    timing["totalMs"] = int((time.perf_counter() - total_start) * 1000)

    # Output
    result: VisualIndex = {
        "frames": frames_data,
        "clusters": clusters,
        "processingTime": timing,
    }

    args.output.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the file does not depend on the platform default.
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    print(f"\nOutput written to {args.output}")
    print(f"Total processing time: {timing['totalMs']}ms")

    return 0
414
+
415
+
416
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the ``exit`` helper is installed by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (e.g. under ``python -S``).
    raise SystemExit(main())