escribano 0.1.4 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/actions/generate-artifact-v3.js +4 -2
- package/dist/actions/generate-summary-v3.js +4 -2
- package/dist/adapters/audio.silero.adapter.js +50 -3
- package/dist/adapters/intelligence.ollama.adapter.js +9 -7
- package/dist/adapters/video.ffmpeg.adapter.js +9 -5
- package/dist/index.js +10 -0
- package/dist/services/subject-grouping.js +4 -2
- package/dist/tests/utils/env-logger.test.js +262 -0
- package/dist/utils/env-logger.js +166 -0
- package/package.json +6 -3
- package/scripts/backfill-releases.mjs +207 -0
- package/scripts/create-release.mjs +201 -0
- package/src/scripts/audio_preprocessor.py +109 -0
- package/src/scripts/visual_observer_base.py +417 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# /// script
|
|
3
|
+
# requires-python = ">=3.10"
|
|
4
|
+
# dependencies = [
|
|
5
|
+
# "torch>=2.0",
|
|
6
|
+
# "soundfile",
|
|
7
|
+
# "numpy",
|
|
8
|
+
# "silero-vad",
|
|
9
|
+
# ]
|
|
10
|
+
# ///
|
|
11
|
+
"""
|
|
12
|
+
Audio Preprocessor - Silero VAD for speech segment extraction.
|
|
13
|
+
Uses soundfile for I/O to avoid torchaudio/torchcodec native dependency issues.
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
uv run audio_preprocessor.py --audio /path/to/audio.wav --output-dir /tmp/segments --output-json /path/to/segments.json
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
import torch
|
|
24
|
+
import soundfile as sf
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
def parse_args():
    """Parse the command-line options of the audio preprocessor.

    Returns:
        argparse.Namespace with ``audio``, ``output_dir`` and ``output_json``
        (all ``Path`` objects) plus the VAD tuning values ``threshold``,
        ``min_speech_duration_ms`` and ``min_silence_duration_ms``.
    """
    cli = argparse.ArgumentParser(description="Audio Preprocessor with Silero VAD")

    # Mandatory locations: the input recording and where results are written.
    required_paths = (
        ("--audio", "Path to input audio file"),
        ("--output-dir", "Directory to save segment WAV files"),
        ("--output-json", "Path to save segments manifest JSON"),
    )
    for flag, description in required_paths:
        cli.add_argument(flag, type=Path, required=True, help=description)

    # Optional VAD tuning knobs with their defaults.
    tunables = (
        ("--threshold", float, 0.5, "VAD threshold (default: 0.5)"),
        ("--min-speech-duration-ms", int, 250, "Min speech duration in ms"),
        ("--min-silence-duration-ms", int, 1000, "Min silence duration in ms"),
    )
    for flag, kind, fallback, description in tunables:
        cli.add_argument(flag, type=kind, default=fallback, help=description)

    return cli.parse_args()
|
|
36
|
+
|
|
37
|
+
def read_audio_sf(path: str, sampling_rate: int = 16000):
    """Load an audio file as a mono float32 tensor sampled at ``sampling_rate``.

    The file is read with soundfile. Multi-channel audio is averaged down to a
    single channel. The input is expected to be pre-converted (by ffmpeg) to
    ``sampling_rate``; if it is not, the signal is resampled by linear
    interpolation and a warning is written to stderr, instead of silently
    treating the samples as if they were already at the target rate (which
    would skew every timestamp and every exported segment).

    Args:
        path: Path of the audio file to read.
        sampling_rate: Sample rate the returned tensor must have.

    Returns:
        1-D ``torch.Tensor`` of dtype float32.
    """
    import sys  # local import: only needed for the resampling warning

    wav, sr = sf.read(path)

    # More than one dimension means (frames, channels): mix down to mono.
    if wav.ndim > 1:
        wav = wav.mean(axis=1)

    if sr != sampling_rate and len(wav) > 0:
        print(
            f"Warning: {path} is sampled at {sr} Hz, expected {sampling_rate} Hz; "
            "resampling with linear interpolation",
            file=sys.stderr,
        )
        target_len = max(1, int(round(len(wav) * sampling_rate / sr)))
        # Position of every output sample expressed in input-sample units.
        source_positions = np.arange(target_len, dtype=np.float64) * (sr / sampling_rate)
        wav = np.interp(source_positions, np.arange(len(wav), dtype=np.float64), wav)

    return torch.from_numpy(wav.astype(np.float32))
|
|
46
|
+
|
|
47
|
+
def main():
    """Run Silero VAD on the input audio and export every speech segment.

    Each detected segment is written as ``segment_NNNN.wav`` into the output
    directory, and a JSON manifest listing ``start``/``end`` (seconds) and
    ``audioPath`` for every segment is written to the manifest path.

    Returns:
        Process exit code: 0 on success, 1 when the input audio is missing.
    """
    args = parse_args()

    if not args.audio.exists():
        print(f"Error: Audio file not found: {args.audio}")
        return 1

    args.output_dir.mkdir(parents=True, exist_ok=True)
    # The manifest may live outside the segment directory; make sure its
    # folder exists too so the final write cannot fail on a missing parent.
    args.output_json.parent.mkdir(parents=True, exist_ok=True)

    # Load Silero VAD model
    model, utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad',
        model='silero_vad',
        force_reload=False,
        onnx=False,
    )
    # Only the first helper of the utils tuple (get_speech_timestamps) is
    # needed; indexing avoids depending on the exact length of the tuple.
    get_speech_timestamps = utils[0]

    # Load audio
    sampling_rate = 16000
    wav = read_audio_sf(str(args.audio), sampling_rate=sampling_rate)

    # Get speech timestamps (expressed in samples)
    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        sampling_rate=sampling_rate,
        threshold=args.threshold,
        min_speech_duration_ms=args.min_speech_duration_ms,
        min_silence_duration_ms=args.min_silence_duration_ms,
    )

    segments = []
    for i, ts in enumerate(speech_timestamps):
        # Slice the samples of this segment and save them with soundfile.
        segment_path = args.output_dir / f"segment_{i:04d}.wav"
        segment_wav = wav[ts['start']:ts['end']].numpy()
        sf.write(str(segment_path), segment_wav, sampling_rate)

        # Manifest entries use seconds, not sample offsets.
        segments.append({
            "start": float(ts['start'] / sampling_rate),
            "end": float(ts['end'] / sampling_rate),
            "audioPath": str(segment_path),
        })

    # Write manifest
    with open(args.output_json, "w", encoding="utf-8") as f:
        json.dump(segments, f, indent=2)

    print(f"Extracted {len(segments)} speech segments to {args.output_dir}")
    print(f"Manifest written to {args.output_json}")

    return 0
|
|
107
|
+
|
|
108
|
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code. SystemExit is
    # raised directly because the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (e.g. under ``python -S``).
    raise SystemExit(main())
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Visual Observer Base - OCR + CLIP indexing for screen recordings.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
uv run visual_observer_base.py --frames-dir /path/to/frames --output /path/to/visual-index.json
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import time
|
|
13
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import TypedDict
|
|
16
|
+
|
|
17
|
+
import open_clip
|
|
18
|
+
import pytesseract
|
|
19
|
+
import torch
|
|
20
|
+
from PIL import Image
|
|
21
|
+
from sklearn.cluster import AgglomerativeClustering
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Type definitions
|
|
25
|
+
class FrameData(TypedDict):
    """Per-frame record stored in the ``frames`` list of the visual index."""

    index: int  # position of the frame in the sorted frame list
    timestamp: float  # index multiplied by the frame interval (seconds)
    imagePath: str  # path of the frame image, stored as a string
    ocrText: str  # OCR text of the frame ("" when OCR failed)
    clusterId: int  # cluster label; -1 until clustering assigns one
    changeScore: float  # currently always 0.0 (pixel delta not implemented)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ClusterData(TypedDict):
    """Summary of one cluster of visually similar frames."""

    id: int  # cluster label shared by the member frames
    heuristicLabel: str  # CLIP zero-shot category of the representative frame, or "unknown"
    timeRange: tuple[float, float]  # (smallest, largest) timestamp among the member frames
    frameCount: int  # number of frames in the cluster
    representativeIdx: int  # ``index`` of the middle frame of the cluster
    avgOcrCharacters: float  # mean length of the members' OCR text
    mediaIndicators: list[str]  # media tags detected in the members' OCR text
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class VisualIndex(TypedDict):
    """Top-level structure serialized to the output JSON file."""

    frames: list[FrameData]  # one record per input frame
    clusters: list[ClusterData]  # one summary per cluster
    processingTime: dict[str, int]  # milliseconds under ocrMs / clipMs / clusterMs / totalMs
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Constants
|
|
51
|
+
# Pick the best available torch backend: MPS on Apple Silicon first, then
# CUDA, otherwise fall back to CPU.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

CLIP_MODEL = "ViT-B-32"
CLIP_PRETRAINED = "laion2b_s34b_b79k"
# Cosine-distance cut-off used when clustering frames (1 - 0.85 similarity).
CLUSTER_DISTANCE_THRESHOLD = 0.15

# Zero-shot categories as (short label, CLIP text prompt) pairs. Keeping each
# label next to its prompt guarantees that the two lists derived below always
# have the same length and the same order.
_UI_CATEGORY_PROMPTS = (
    ("code-editor", "A screenshot of a code editor showing programming code"),
    ("terminal", "A screenshot of a terminal with command line interface"),
    ("browser", "A screenshot of a web browser showing a website"),
    ("video-player", "A screenshot of a video player with playback controls"),
    ("document", "A screenshot of a document or PDF viewer"),
    ("image-viewer", "A screenshot of an image viewer or photo application"),
    ("chat", "A screenshot of a chat or messaging application"),
    ("file-manager", "A screenshot of a file manager or finder window"),
)

# Text prompts fed to CLIP, and the label reported for the matching prompt.
UI_CATEGORIES = [prompt for _, prompt in _UI_CATEGORY_PROMPTS]
CATEGORY_LABELS = [label for label, _ in _UI_CATEGORY_PROMPTS]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the visual observer.

    Returns:
        Namespace with ``frames_dir`` and ``output`` (``Path``),
        ``frame_interval`` (seconds between frames, default 2.0) and
        ``workers`` (number of parallel OCR workers).
    """
    parser = argparse.ArgumentParser(description="Visual Observer Base")
    parser.add_argument("--frames-dir", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    parser.add_argument("--frame-interval", type=float, default=2.0,
                        help="Seconds between frames (default: 2)")
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker so the default is always an int.
    parser.add_argument("--workers", type=int, default=os.cpu_count() or 1,
                        help="Number of parallel OCR workers (default: CPU count)")
    return parser.parse_args()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def load_frames(frames_dir: Path, frame_interval: float) -> list[tuple[int, float, Path]]:
    """Load frame paths and compute timestamps.

    Frames are the ``*.jpg`` files of ``frames_dir`` (e.g. ``scene_0001.jpg``),
    ordered with a natural sort so that numeric parts compare as numbers.
    A plain lexicographic sort would place ``scene_10`` before ``scene_2``
    whenever the numbering is not zero-padded to a common width.

    Args:
        frames_dir: Directory containing frame images
        frame_interval: Seconds between frames (e.g., 2.0 means frame 0 at 0s, frame 1 at 2s)

    Returns:
        List of ``(index, timestamp, path)`` tuples in chronological order.
    """
    import re  # local import: only used to build the natural-sort key

    def natural_key(path: Path):
        # re.split with a capturing group alternates text (even positions)
        # and digit runs (odd positions), so element types always line up.
        tokens = re.split(r"(\d+)", path.name)
        parts = [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]
        # The raw name breaks ties such as "a_01.jpg" vs "a_1.jpg".
        return parts, path.name

    ordered = sorted(frames_dir.glob("*.jpg"), key=natural_key)
    return [(i, i * frame_interval, path) for i, path in enumerate(ordered)]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def extract_ocr(image_path: Path) -> str:
    """Extract text from image using Tesseract.

    Uses PSM 11 (sparse text) which works better for UI screenshots
    where text is scattered across the screen (menus, buttons, tabs, URLs).

    Args:
        image_path: Path of the frame image to read.

    Returns:
        The recognized text with surrounding whitespace stripped, or an empty
        string when the image cannot be opened or OCR fails.
    """
    # PSM 11: Sparse text - finds text scattered anywhere (UI elements)
    # OEM 3: Default OCR engine mode (LSTM if available)
    custom_config = r'--psm 11 --oem 3'
    try:
        # The context manager closes the image file as soon as OCR is done,
        # instead of leaving the handle open until garbage collection.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, config=custom_config)
        return text.strip()
    except Exception as e:
        # Deliberate best-effort: one unreadable frame must not abort the run.
        print(f" Warning: OCR failed for {image_path.name}: {e}")
        return ""
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def extract_ocr_parallel(
    frames: list[tuple[int, float, Path]],
    max_workers: int
) -> dict[int, str]:
    """Extract OCR in parallel using multiprocessing.

    Args:
        frames: List of (index, timestamp, path) tuples
        max_workers: Number of parallel workers. ``None`` or a non-positive
            value is tolerated; the effective count is always between 1 and
            the number of frames.

    Returns:
        Dictionary mapping frame index to OCR text ("" for frames whose OCR
        task raised).
    """
    total = len(frames)
    if total == 0:
        # Nothing to do: avoid spinning up a process pool for no work.
        return {}

    # Never start more processes than there are frames, and never fewer than
    # one (ProcessPoolExecutor rejects a worker count <= 0).
    workers = max(1, min(max_workers or os.cpu_count() or 1, total))
    print(f" Using {workers} parallel workers...")

    results: dict[int, str] = {}
    # Report progress roughly every 10% of the frames.
    progress_step = max(1, total // 10)

    with ProcessPoolExecutor(max_workers=workers) as executor:
        # Submit all tasks
        future_to_idx = {
            executor.submit(extract_ocr, path): idx
            for idx, _, path in frames
        }

        # Collect results as they complete
        for completed, future in enumerate(as_completed(future_to_idx), start=1):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception as e:
                # Best-effort: a failed task yields empty text for that frame.
                print(f" Warning: OCR failed for frame {idx}: {e}")
                results[idx] = ""

            if completed % progress_step == 0:
                pct = (completed / total) * 100
                print(f" OCR progress: {completed}/{total} ({pct:.0f}%)")

    return results
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def compute_clip_embeddings(
    frames: list[tuple[int, float, Path]],
    model,
    preprocess,
) -> torch.Tensor:
    """Compute CLIP embeddings for all frames.

    Args:
        frames: List of (index, timestamp, path) tuples.
        model: CLIP model exposing ``encode_image``.
        preprocess: Callable turning a PIL image into a model input tensor.

    Returns:
        Tensor with one L2-normalized embedding row per frame, in input
        order. Frames that could not be embedded get an all-zero row so rows
        stay aligned with ``frames``. The row width follows the model output;
        512 is used only when no frame could be embedded at all.
    """
    fallback_dim = 512  # width used when there is no real embedding to copy from
    per_frame = []  # embedding tensor, or None where embedding failed

    for _, _, path in frames:
        try:
            # Close the image file once it has been converted to a tensor.
            with Image.open(path) as raw:
                image = preprocess(raw).unsqueeze(0).to(DEVICE)

            with torch.no_grad():
                embedding = model.encode_image(image)
                embedding = embedding / embedding.norm(dim=-1, keepdim=True)

            per_frame.append(embedding.cpu())
        except Exception as e:
            # Deliberate best-effort: keep going and fill the gap afterwards.
            print(f" Warning: CLIP embedding failed for {path.name}: {e}")
            per_frame.append(None)

    if not per_frame:
        return torch.zeros((0, fallback_dim))

    # Size the zero placeholders after a real embedding so torch.cat never
    # fails on a width mismatch if the model is not 512-wide.
    reference = next((emb for emb in per_frame if emb is not None), None)
    if reference is None:
        placeholder = torch.zeros((1, fallback_dim))
    else:
        placeholder = torch.zeros((1, reference.shape[-1]), dtype=reference.dtype)

    rows = [placeholder if emb is None else emb for emb in per_frame]
    return torch.cat(rows, dim=0)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def cluster_frames(embeddings: torch.Tensor) -> list[int]:
    """Cluster frames by CLIP embedding similarity.

    Frames are grouped with average-linkage agglomerative clustering on
    cosine distance, cut at ``CLUSTER_DISTANCE_THRESHOLD``.

    All-zero rows (the placeholder used for frames whose embedding failed)
    are excluded from the clustering, because cosine distance is undefined
    for zero vectors and scikit-learn refuses such input. Those frames are
    collected in one extra cluster of their own.

    Args:
        embeddings: 2-D tensor with one embedding row per frame.

    Returns:
        One integer cluster label per row, in row order.
    """
    count = len(embeddings)
    if count < 2:
        return [0] * count

    usable = embeddings.ne(0).any(dim=1)
    usable_rows = torch.nonzero(usable).flatten().tolist()
    failed_rows = torch.nonzero(~usable).flatten().tolist()

    labels = [0] * count
    if len(usable_rows) >= 2:
        clustering = AgglomerativeClustering(
            n_clusters=None,  # type: ignore
            distance_threshold=CLUSTER_DISTANCE_THRESHOLD,
            metric="cosine",
            linkage="average",
        )
        sub_labels = clustering.fit_predict(embeddings[usable].numpy()).tolist()
        for row, sub_label in zip(usable_rows, sub_labels):
            labels[row] = sub_label
        failed_label = max(sub_labels) + 1
    else:
        # Zero or one usable row: nothing to cluster. A lone usable row keeps
        # label 0 and the failed rows take the next free label.
        failed_label = len(usable_rows)

    for row in failed_rows:
        labels[row] = failed_label

    return labels
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def infer_label_with_clip(
    image_path: Path,
    model,
    preprocess,
    tokenizer,
) -> str:
    """Use CLIP zero-shot to classify frame into UI category.

    The image embedding is compared with the embeddings of the
    ``UI_CATEGORIES`` prompts and the label of the best match is returned.

    Args:
        image_path: Path of the frame image to classify.
        model: CLIP model exposing ``encode_image`` and ``encode_text``.
        preprocess: Callable turning a PIL image into a model input tensor.
        tokenizer: Callable turning the prompt list into text tokens.

    Returns:
        The matching entry of ``CATEGORY_LABELS``, or ``"unknown"`` when the
        image cannot be read or classification fails.
    """
    try:
        # Close the image file once it has been converted to a tensor.
        with Image.open(image_path) as raw:
            image = preprocess(raw).unsqueeze(0).to(DEVICE)
        text_tokens = tokenizer(UI_CATEGORIES).to(DEVICE)

        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text_tokens)

            # Normalize both sides so the dot product is a cosine similarity.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            best_idx = similarity.argmax().item()

        return CATEGORY_LABELS[best_idx]
    except Exception as e:
        # Deliberate best-effort: an unclassifiable frame is labelled "unknown".
        print(f" Warning: Zero-shot classification failed for {image_path.name}: {e}")
        return "unknown"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def detect_media_indicators(ocr_text: str) -> list[str]:
    """
    Return tags for media-related hints found in a frame's OCR text.

    Matching is a case-insensitive substring search. Video platform names
    (youtube, vimeo, netflix) are reported under their own name, in that
    order, followed by a single "image-file" tag when any known image file
    extension occurs in the text.

    TODO: Expand patterns based on real-world testing:
    - Video platforms: Twitch, Disney+
    - Image formats: .svg, .bmp
    - Media players: VLC, QuickTime, IINA, mpv
    - Streaming: Spotify, Apple Music, SoundCloud
    - Social media: Twitter/X, Instagram, TikTok
    """
    haystack = ocr_text.lower()

    # Video platforms: the tag is the platform name itself.
    platform_names = ("youtube", "vimeo", "netflix")
    found = [name for name in platform_names if name in haystack]

    # Image files: one shared tag, added at most once.
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".webp")
    for suffix in image_suffixes:
        if suffix in haystack:
            found.append("image-file")
            break

    # TODO: Add more patterns after dry-run testing

    return found
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def build_cluster_metadata(
    frames_data: list[FrameData],
    cluster_labels: list[int],
    model,
    preprocess,
    tokenizer,
) -> list[ClusterData]:
    """Build metadata for each cluster.

    Args:
        frames_data: Frame records, in the same order as ``cluster_labels``.
        cluster_labels: Cluster label of each frame.
        model: CLIP model handed to ``infer_label_with_clip``.
        preprocess: CLIP preprocessing callable handed to ``infer_label_with_clip``.
        tokenizer: CLIP tokenizer handed to ``infer_label_with_clip``.

    Returns:
        One summary per cluster, ordered by first appearance of the cluster
        label among the frames.
    """
    # Group frames by label. The local is deliberately not called
    # ``cluster_frames`` so it cannot shadow the module-level function.
    grouped: dict[int, list[FrameData]] = {}
    for frame, label in zip(frames_data, cluster_labels):
        grouped.setdefault(label, []).append(frame)

    result: list[ClusterData] = []
    for cluster_id, members in grouped.items():
        # Representative: the middle frame of the cluster.
        representative = members[len(members) // 2]

        # Average number of OCR characters per frame.
        avg_chars = sum(len(f["ocrText"]) for f in members) / len(members)

        # Time range covered by the member frames.
        timestamps = [f["timestamp"] for f in members]
        time_range = (float(min(timestamps)), float(max(timestamps)))

        # Union of the media indicators of every member frame.
        all_indicators: set[str] = set()
        for f in members:
            all_indicators.update(detect_media_indicators(f["ocrText"]))

        # Infer label using CLIP on representative
        rep_path = Path(representative["imagePath"])
        label = infer_label_with_clip(rep_path, model, preprocess, tokenizer)

        result.append({
            "id": cluster_id,
            "heuristicLabel": label,
            "timeRange": time_range,
            "frameCount": len(members),
            "representativeIdx": representative["index"],
            "avgOcrCharacters": avg_chars,
            # Sorted so the JSON output is identical from run to run; plain
            # set iteration order varies with string hash randomization.
            "mediaIndicators": sorted(all_indicators),
        })

    return result
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def main():
    """Index a directory of frames and write the visual index as JSON.

    Runs four phases: parallel OCR, CLIP embedding, clustering of the
    embeddings, and per-cluster metadata. The result, including the time
    spent in each phase in milliseconds, is written to the output path.

    Returns:
        Process exit code: 0 on success, 1 when no frames were found.
    """
    args = parse_args()

    print(f"Loading frames from {args.frames_dir}...")
    frames = load_frames(args.frames_dir, args.frame_interval)
    if not frames:
        print("Error: No frames found")
        return 1
    print(f"Found {len(frames)} frames")

    def elapsed_ms(since: float) -> int:
        # Whole milliseconds elapsed since ``since`` (a time.time() value).
        return int((time.time() - since) * 1000)

    timing = {"ocrMs": 0, "clipMs": 0, "clusterMs": 0, "totalMs": 0}
    run_started = time.time()

    # Phase 1: OCR (Parallel)
    print(f"Phase 1: Extracting text with OCR ({args.workers} workers)...")
    phase_started = time.time()
    text_by_index = extract_ocr_parallel(frames, args.workers)
    frames_data: list[FrameData] = [
        {
            "index": idx,
            "timestamp": timestamp,
            "imagePath": str(path),
            "ocrText": text_by_index.get(idx, ""),
            "clusterId": -1,  # Set later
            "changeScore": 0.0,  # TODO: Implement pixel delta if needed
        }
        for idx, timestamp, path in frames
    ]
    timing["ocrMs"] = elapsed_ms(phase_started)
    print(f" OCR complete: {timing['ocrMs']}ms")

    # Phase 2: CLIP embeddings (the measured time includes model loading)
    print(f"Phase 2: Computing CLIP embeddings on {DEVICE}...")
    phase_started = time.time()
    model, _, preprocess = open_clip.create_model_and_transforms(
        CLIP_MODEL, pretrained=CLIP_PRETRAINED
    )
    model.eval()
    model.to(DEVICE)
    tokenizer = open_clip.get_tokenizer(CLIP_MODEL)
    embeddings = compute_clip_embeddings(frames, model, preprocess)
    timing["clipMs"] = elapsed_ms(phase_started)
    print(f" CLIP complete: {timing['clipMs']}ms")

    # Phase 3: Clustering
    print("Phase 3: Clustering frames...")
    phase_started = time.time()
    cluster_labels = cluster_frames(embeddings)
    # Record on every frame the cluster it was assigned to.
    for record, assigned in zip(frames_data, cluster_labels):
        record["clusterId"] = assigned
    timing["clusterMs"] = elapsed_ms(phase_started)
    print(f" Clustering complete: {timing['clusterMs']}ms")

    # Phase 4: Build cluster metadata
    print("Phase 4: Building cluster metadata...")
    clusters = build_cluster_metadata(
        frames_data, cluster_labels, model, preprocess, tokenizer
    )
    print(f" Found {len(clusters)} clusters")

    timing["totalMs"] = elapsed_ms(run_started)

    # Output
    result: VisualIndex = {
        "frames": frames_data,
        "clusters": clusters,
        "processingTime": timing,
    }
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w") as out_file:
        json.dump(result, out_file, indent=2)

    print(f"\nOutput written to {args.output}")
    print(f"Total processing time: {timing['totalMs']}ms")

    return 0
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code. SystemExit is
    # raised directly because the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (e.g. under ``python -S``).
    raise SystemExit(main())
|