vidclaude 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,285 @@
1
+ """Layer J: Evidence assembly and output generation.
2
+
3
+ Two modes:
4
+ 1. Skill mode (--extract): Generates evidence.md for Claude Code to read + reason over.
5
+ 2. API mode (--api): Builds Anthropic API message with base64 frames + evidence.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ from .models import (
14
+ Evidence, VideoMeta, Frame, TranscriptChunk, OCRResult, TimelineEvent,
15
+ save_json,
16
+ )
17
+ from .util import ms_to_hhmmss, image_to_base64
18
+
19
# Shared package-level logger (same logger name is used across vidclaude modules).
logger = logging.getLogger("vidclaude")

# Maximum frames to include in evidence.md frame listing
# (keeps the report readable; also caps base64 frames in API mode).
MAX_EVIDENCE_FRAMES = 20
23
+
24
+
25
def generate_evidence_md(evidence: Evidence, cache_dir: Path) -> str:
    """Generate evidence.md — a structured report for Claude Code skill mode.

    This file is read by Claude in the conversation to reason over the video.
    Frame images are referenced by path so Claude Code can read them directly.

    Args:
        evidence: Aggregated multi-modal evidence (metadata, frames,
            transcript, OCR, timeline, summaries).
        cache_dir: Directory where ``evidence.md`` is written. Assumed to
            already exist — TODO confirm callers create it.

    Returns:
        The full markdown content (also written to ``cache_dir/evidence.md``).
    """
    lines: list[str] = []

    # Header
    lines.append("# Video Analysis Evidence Report")
    lines.append("")

    # Video metadata
    meta = evidence.video_meta
    lines.append("## Video Information")
    lines.append(f"- **File**: `{meta.path}`")
    lines.append(f"- **Duration**: {ms_to_hhmmss(int(meta.duration_sec * 1000))}")
    lines.append(f"- **Resolution**: {meta.resolution[0]}x{meta.resolution[1]}")
    lines.append(f"- **FPS**: {meta.fps:.1f}")
    lines.append(f"- **Audio tracks**: {meta.audio_tracks}")
    lines.append(f"- **Format**: {meta.format}")
    lines.append("")

    # Question and intent (section only present when the user asked something)
    if evidence.question:
        lines.append("## Question")
        lines.append(evidence.question)
        lines.append(f"- **Detected intent**: {evidence.intent}")
        lines.append("")

    # Global summary
    if evidence.global_summary:
        lines.append("## Global Summary")
        lines.append(evidence.global_summary)
        lines.append("")

    # Scene summaries
    if evidence.scene_summaries:
        lines.append("## Scene Summaries")
        for summary in evidence.scene_summaries:
            lines.append(summary)
        lines.append("")

    # Extracted frames listing
    lines.append("## Extracted Frames")
    lines.append(f"Total frames: {len(evidence.frames)}")
    lines.append("")

    # Select frames to list (cap at MAX_EVIDENCE_FRAMES for readability)
    display_frames = _select_display_frames(evidence.frames, MAX_EVIDENCE_FRAMES)
    for frame in display_frames:
        ts = ms_to_hhmmss(frame.timestamp_ms)
        reasons = ", ".join(frame.sampling_reason)
        lines.append(f"- **{frame.frame_id}** @ {ts} [{reasons}]: `{frame.image_path}`")
    lines.append("")

    # Transcript
    if evidence.transcript_chunks:
        lines.append("## Audio Transcript")
        for chunk in evidence.transcript_chunks:
            ts_start = ms_to_hhmmss(chunk.start_ms)
            ts_end = ms_to_hhmmss(chunk.end_ms)
            speaker = f"[{chunk.speaker}] " if chunk.speaker else ""
            lines.append(f"- [{ts_start} → {ts_end}] {speaker}{chunk.text}")
        lines.append("")

    # OCR results
    if evidence.ocr_results:
        lines.append("## On-Screen Text (OCR)")
        for ocr in evidence.ocr_results:
            ts = ms_to_hhmmss(ocr.timestamp_ms)
            lines.append(f"- [{ts}] (conf: {ocr.confidence:.0%}) {ocr.text}")
        lines.append("")

    # Timeline
    if evidence.timeline_events:
        lines.append("## Timeline")
        for event in evidence.timeline_events:
            ts = ms_to_hhmmss(event.start_ms)
            # BUGFIX: compare against None explicitly — an end_ms of 0 is a
            # valid timestamp and must not be suppressed by truthiness.
            end = f" → {ms_to_hhmmss(event.end_ms)}" if event.end_ms is not None else ""
            lines.append(f"- [{ts}{end}] **{event.modality}**: {event.summary}")
        lines.append("")

    # Frame paths for Claude Code to read
    lines.append("## Frame Paths (for visual analysis)")
    lines.append("The following frame images can be viewed for visual analysis:")
    lines.append("")
    for frame in display_frames:
        lines.append(f"- `{frame.image_path}`")
    lines.append("")

    content = "\n".join(lines)

    # Write to cache
    evidence_path = cache_dir / "evidence.md"
    evidence_path.write_text(content, encoding="utf-8")
    logger.info("Evidence report written to %s", evidence_path)

    return content
124
+
125
+
126
def build_api_message(evidence: Evidence) -> dict:
    """Build the Anthropic API message for standalone mode.

    Returns dict with 'system' and 'messages' for the API call. The user
    message interleaves text context blocks (metadata, summary, timeline,
    transcript, OCR) with base64-encoded frame images, and ends with the
    user's question.
    """
    system_prompt = (
        "You are analyzing a video. You will be shown frames extracted from the video "
        "at regular intervals, along with an audio transcript and other evidence if available. "
        "Analyze the visual and audio content to answer the user's question.\n\n"
        "Guidelines:\n"
        "- Ground every claim in timestamps\n"
        "- Distinguish observation from inference\n"
        "- If evidence is insufficient or ambiguous, say so clearly\n"
        "- Rate your confidence as high/medium/low"
    )

    content: list[dict] = []

    # Video info header
    meta = evidence.video_meta
    content.append({
        "type": "text",
        "text": (
            f"Video: {meta.duration_sec:.1f}s, {meta.resolution[0]}x{meta.resolution[1]}, "
            f"{meta.audio_tracks} audio track(s)\n"
            f"Sharing {len(evidence.frames)} frames. Intent: {evidence.intent}"
        ),
    })

    # Global summary
    if evidence.global_summary:
        content.append({
            "type": "text",
            "text": f"<global_summary>\n{evidence.global_summary}\n</global_summary>",
        })

    # Frames as base64 images (frames are written as JPEG by the extractor)
    display_frames = _select_display_frames(evidence.frames, MAX_EVIDENCE_FRAMES)
    for frame in display_frames:
        ts = ms_to_hhmmss(frame.timestamp_ms)
        # BUGFIX: encode first and append label + image together. Previously
        # the "Frame ... @ ...:" label was appended before the try block, so a
        # failed encode left an orphan label text block with no image after it.
        try:
            b64 = image_to_base64(frame.image_path)
        except Exception as e:
            logger.warning("Could not encode frame %s: %s", frame.frame_id, e)
            continue
        content.append({
            "type": "text",
            "text": f"Frame {frame.frame_id} @ {ts}:",
        })
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": b64,
            },
        })

    # Timeline
    if evidence.timeline_events:
        timeline_text = "\n".join(
            f"[{ms_to_hhmmss(e.start_ms)}] {e.modality}: {e.summary}"
            for e in evidence.timeline_events
        )
        content.append({
            "type": "text",
            "text": f"<timeline>\n{timeline_text}\n</timeline>",
        })

    # Transcript
    if evidence.transcript_chunks:
        transcript_text = "\n".join(
            f"[{ms_to_hhmmss(c.start_ms)} → {ms_to_hhmmss(c.end_ms)}] {c.text}"
            for c in evidence.transcript_chunks
        )
        content.append({
            "type": "text",
            "text": f"<transcript>\n{transcript_text}\n</transcript>",
        })

    # OCR
    if evidence.ocr_results:
        ocr_text = "\n".join(
            f"[{ms_to_hhmmss(o.timestamp_ms)}] {o.text}"
            for o in evidence.ocr_results
        )
        content.append({
            "type": "text",
            "text": f"<ocr>\n{ocr_text}\n</ocr>",
        })

    # Question (falls back to a generic description request)
    question = evidence.question or "Describe what's happening in this video in detail."
    content.append({
        "type": "text",
        "text": question,
    })

    return {
        "system": system_prompt,
        "messages": [{"role": "user", "content": content}],
    }
227
+
228
+
229
def call_claude_api(
    evidence: Evidence,
    *,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 4096,
) -> str:
    """Make the Claude API call for standalone mode. Requires ANTHROPIC_API_KEY.

    Args:
        evidence: Assembled evidence to send (see ``build_api_message``).
        model: Anthropic model identifier. Default preserves the previously
            hard-coded value; now overridable by callers.
        max_tokens: Response token budget for the API call.

    Returns:
        The text of the first content block of the response.

    Raises:
        RuntimeError: If the ``anthropic`` package is not installed or
            ANTHROPIC_API_KEY is not set in the environment.
    """
    try:
        import anthropic
    except ImportError:
        raise RuntimeError(
            "anthropic package not installed. Install with: pip install anthropic\n"
            "Or use skill mode (--extract) which doesn't need the API."
        )

    import os
    if not os.environ.get("ANTHROPIC_API_KEY"):
        raise RuntimeError(
            "ANTHROPIC_API_KEY not set. Set it with:\n"
            "  export ANTHROPIC_API_KEY=sk-...\n"
            "Or use skill mode (--extract) which doesn't need the API."
        )

    msg = build_api_message(evidence)
    client = anthropic.Anthropic()

    logger.info("Calling Claude API (model=%s)...", model)
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=msg["system"],
        messages=msg["messages"],
    )

    # NOTE(review): assumes the first response block is text — true for plain
    # text completions; would break on tool-use responses.
    return response.content[0].text
259
+
260
+
261
+ def _select_display_frames(frames: list[Frame], max_count: int) -> list[Frame]:
262
+ """Select the most representative frames for display.
263
+
264
+ Prioritizes shot boundary frames, then evenly spaces base frames.
265
+ """
266
+ if len(frames) <= max_count:
267
+ return frames
268
+
269
+ # Separate boundary and base frames
270
+ boundary = [f for f in frames if "shot_boundary" in f.sampling_reason]
271
+ base = [f for f in frames if "shot_boundary" not in f.sampling_reason]
272
+
273
+ # Keep all boundary frames up to half the budget
274
+ max_boundary = min(len(boundary), max_count // 2)
275
+ selected = boundary[:max_boundary]
276
+
277
+ # Fill remaining with evenly spaced base frames
278
+ remaining = max_count - len(selected)
279
+ if remaining > 0 and base:
280
+ step = max(1, len(base) // remaining)
281
+ selected.extend(base[::step][:remaining])
282
+
283
+ # Sort by timestamp
284
+ selected.sort(key=lambda f: f.timestamp_ms)
285
+ return selected
@@ -0,0 +1,239 @@
1
+ """Layers B+C: Shot boundary detection and adaptive visual sampling.
2
+
3
+ Shot detection uses ffmpeg's scene change filter.
4
+ Adaptive sampling adjusts frame rate based on content and mode.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import re
11
+ from pathlib import Path
12
+
13
+ from .models import VideoMeta, Shot, Frame
14
+ from .util import run_ffmpeg
15
+
16
# Shared package-level logger (same logger name is used across vidclaude modules).
logger = logging.getLogger("vidclaude")

# Mode-specific sampling parameters.
#   base_fps:     uniform sampling rate in frames per second of video
#   max_frames:   hard cap on total frames extracted for the mode
#   burst_frames: extra frames clustered around each detected shot boundary
MODE_CONFIG = {
    "quick": {
        "base_fps": 0.2,
        "max_frames": 20,
        "burst_frames": 0,  # no extra frames at transitions
    },
    "standard": {
        "base_fps": 0.5,
        "max_frames": 60,
        "burst_frames": 2,  # +2 frames around each shot boundary
    },
    "deep": {
        "base_fps": 1.0,
        "max_frames": 150,
        "burst_frames": 4,  # +4 frames around transitions
    },
}
36
+
37
+
38
def detect_shots(meta: VideoMeta, threshold: float = 0.3) -> list[Shot]:
    """Detect shot boundaries using ffmpeg scene change filter.

    Runs ffmpeg's ``select='gt(scene,THRESH)',showinfo`` filter chain and
    parses detected scene-change timestamps (and scene scores, where the
    ffmpeg build reports them) from stderr.

    Args:
        meta: Probed video metadata (path and duration are used).
        threshold: Scene-change score threshold for the select filter.

    Returns:
        Shot objects covering the video contiguously. If no scene changes
        are detected, the whole video is returned as a single shot.
    """
    logger.info("Detecting shots (threshold=%.2f)...", threshold)

    # Use select filter with scene detection + showinfo to get timestamps
    result = run_ffmpeg([
        "-i", meta.path,
        "-vf", f"select='gt(scene,{threshold})',showinfo",
        "-vsync", "vfr",
        "-f", "null",
        "-",
    ])

    # Parse showinfo output from stderr for timestamps
    # Pattern: [Parsed_showinfo...] n:... pts:... pts_time:12.345 ...
    detected: list[float] = []
    for line in result.stderr.split("\n"):
        match = re.search(r"pts_time:\s*([\d.]+)", line)
        if match:
            t = float(match.group(1))
            if t > 0:
                detected.append(t)

    # Scene scores, where available — assumed emitted in the same order as
    # the pts_time lines (one per detected boundary). TODO confirm against
    # the ffmpeg build in use.
    scores: list[float] = []
    for line in result.stderr.split("\n"):
        match = re.search(r"scene_score=\s*([\d.]+)", line)
        if match:
            scores.append(float(match.group(1)))

    # BUGFIX: the old code indexed scores by shot index, which was off by one
    # (scores[j] belongs to the j-th DETECTED boundary, i.e. the start of shot
    # j+1 once t=0 is prepended) and broke entirely after deduplication below
    # shifted positions. Map each boundary time to its score instead.
    score_by_time = dict(zip(detected, scores))

    # Full boundary list: video start, detected changes, video end —
    # deduplicated and sorted.
    boundary_times = sorted({0.0, *detected, meta.duration_sec})

    # Build shots from consecutive boundary pairs
    shots: list[Shot] = []
    for i in range(len(boundary_times) - 1):
        start_ms = int(boundary_times[i] * 1000)
        end_ms = int(boundary_times[i + 1] * 1000)
        if end_ms <= start_ms:
            continue  # zero-length segment after ms rounding — skip
        # Score of the scene change that STARTS this shot. The first shot
        # starts at t=0, which is not a detected change, so it scores 0.0.
        score = score_by_time.get(boundary_times[i], 0.0)
        shots.append(Shot(
            shot_id=f"s_{i:04d}",
            start_ms=start_ms,
            end_ms=end_ms,
            score=score,
        ))

    # If no shots detected (no scene changes), treat entire video as one shot
    if not shots:
        shots = [Shot(
            shot_id="s_0000",
            start_ms=0,
            end_ms=int(meta.duration_sec * 1000),
            score=0.0,
        )]

    logger.info("Detected %d shot(s)", len(shots))
    return shots
103
+
104
+
105
def compute_sample_timestamps(
    meta: VideoMeta,
    shots: list[Shot],
    mode: str,
    fps_override: float | None = None,
    max_frames_override: int | None = None,
) -> list[tuple[int, list[str]]]:
    """Compute which timestamps to sample frames at.

    Combines uniform base sampling with burst sampling around shot
    boundaries, then trims the result to the mode's frame budget.

    Returns list of (timestamp_ms, [sampling_reasons]) sorted by time.
    """
    config = MODE_CONFIG[mode]
    base_fps = config["base_fps"] if fps_override is None else fps_override
    max_frames = config["max_frames"] if max_frames_override is None else max_frames_override
    burst = config["burst_frames"]

    duration_ms = int(meta.duration_sec * 1000)

    # Frame budget: lower the rate up front when uniform sampling alone
    # would already blow past the cap.
    if int(meta.duration_sec * base_fps) > max_frames:
        base_fps = max_frames / meta.duration_sec
        logger.info("Auto-reduced fps to %.3f to stay under %d frames", base_fps, max_frames)

    # Step 1: uniform base sampling.
    samples: dict[int, list[str]] = {}  # timestamp_ms -> reasons
    if base_fps > 0 and meta.duration_sec > 0:
        step_ms = int(1000 / base_fps)
        cursor = 0
        while cursor < duration_ms:
            samples[cursor] = ["base"]
            cursor += step_ms

    # Guarantee at least one sample even for degenerate inputs.
    if not samples:
        samples = {0: ["base"]}

    # Step 2: burst frames around each shot boundary (first shot starts at
    # t=0, which is not a transition, so it is skipped).
    if burst > 0 and len(shots) > 1:
        for shot in shots[1:]:
            for offset in range(-burst // 2, burst // 2 + 1):
                t = shot.start_ms + offset * 200  # 200 ms burst spacing
                if not (0 <= t < duration_ms):
                    continue
                reasons = samples.setdefault(t, [])
                if "shot_boundary" not in reasons:
                    reasons.append("shot_boundary")

    # Step 3: enforce the frame budget.
    ordered = sorted(samples.items())
    if len(ordered) > max_frames:
        # Shot-boundary frames win, capped at half the budget; the remainder
        # is filled with evenly spaced base frames.
        with_boundary = [item for item in ordered if "shot_boundary" in item[1]]
        plain = [item for item in ordered if "shot_boundary" not in item[1]]

        boundary_budget = min(len(with_boundary), max_frames // 2)
        kept = with_boundary[:boundary_budget]

        slots_left = max_frames - len(kept)
        if slots_left > 0 and plain:
            stride = max(1, len(plain) // slots_left)
            kept += plain[::stride][:slots_left]

        ordered = sorted(kept)

    logger.debug("Computed %d sample timestamps", len(ordered))
    return ordered
179
+
180
+
181
def extract_frames(
    meta: VideoMeta,
    shots: list[Shot],
    mode: str,
    cache_dir: Path,
    fps_override: float | None = None,
    max_frames_override: int | None = None,
) -> list[Frame]:
    """Extract frames from video based on adaptive sampling.

    Saves frames as JPEG to cache_dir/frames/ and returns Frame objects.

    Args:
        meta: Probed video metadata.
        shots: Shot list used to tag each frame with its shot_id.
        mode: Sampling mode key ("quick" / "standard" / "deep").
        cache_dir: Cache root; images land in ``cache_dir/frames``.
        fps_override: Optional explicit base sampling rate.
        max_frames_override: Optional explicit frame cap.

    Returns:
        Frame objects for every successfully extracted image; failed
        extractions are logged and skipped.
    """
    frames_dir = cache_dir / "frames"
    # BUGFIX: create intermediate directories too — mkdir(exist_ok=True)
    # alone raises FileNotFoundError when cache_dir itself does not exist.
    frames_dir.mkdir(parents=True, exist_ok=True)

    timestamps = compute_sample_timestamps(
        meta, shots, mode, fps_override, max_frames_override
    )

    logger.info("Extracting %d frames...", len(timestamps))

    frames: list[Frame] = []
    # Advances monotonically across the loop — timestamps are sorted, so each
    # frame's shot is found without rescanning from the start.
    shot_index = 0

    for seq, (ts_ms, reasons) in enumerate(timestamps):
        # Find which shot this timestamp belongs to
        while shot_index < len(shots) - 1 and ts_ms >= shots[shot_index].end_ms:
            shot_index += 1
        shot_id = shots[shot_index].shot_id if shot_index < len(shots) else "s_unknown"

        # Output filename: frame_NNNN_TTTTTTTT.jpg
        fname = f"frame_{seq:04d}_{ts_ms:08d}.jpg"
        out_path = frames_dir / fname

        # Extract single frame at this timestamp (-ss before -i = fast seek)
        ts_sec = ts_ms / 1000.0
        result = run_ffmpeg([
            "-ss", f"{ts_sec:.3f}",
            "-i", meta.path,
            "-frames:v", "1",
            "-q:v", "2",
            "-y",
            str(out_path),
        ])

        if result.returncode != 0 or not out_path.exists():
            logger.warning("Failed to extract frame at %.3fs", ts_sec)
            continue

        frames.append(Frame(
            frame_id=f"f_{seq:04d}",
            timestamp_ms=ts_ms,
            shot_id=shot_id,
            sampling_reason=reasons,
            image_path=str(out_path),
        ))

    logger.info("Extracted %d frames successfully", len(frames))
    return frames
@@ -0,0 +1,95 @@
1
+ """Layer G: Temporal event graph (simplified as sorted event list).
2
+
3
+ Merges evidence from all modalities into a unified, time-sorted event list
4
+ that gives Claude explicit temporal ordering for reasoning.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+
11
+ from .models import Shot, Frame, TranscriptChunk, OCRResult, TimelineEvent
12
+
13
+ logger = logging.getLogger("vidclaude")
14
+
15
+
16
def build_timeline(
    shots: list[Shot],
    frames: list[Frame],
    transcript: list[TranscriptChunk],
    ocr_results: list[OCRResult],
) -> list[TimelineEvent]:
    """Merge all modality outputs into one time-sorted event list.

    Shot boundaries become ``scene_change`` events, transcript chunks become
    ``speech`` events, OCR hits become ``ocr`` events, and shot-boundary
    keyframes become ``visual`` markers. Events are ordered by start time,
    with a fixed per-modality rank breaking ties.
    """
    events: list[TimelineEvent] = []

    def next_id() -> str:
        # Sequential ids: the counter is simply how many events exist so far.
        return f"te_{len(events):04d}"

    # Scene changes: every shot start except the first (video start is not
    # a transition).
    for shot in shots[1:]:
        events.append(TimelineEvent(
            event_id=next_id(),
            start_ms=shot.start_ms,
            end_ms=None,
            modality="scene_change",
            summary=f"Scene change (confidence: {shot.score:.2f})",
            source_ids=[shot.shot_id],
        ))

    # Speech: one event per transcript chunk, keeping its span.
    for chunk in transcript:
        events.append(TimelineEvent(
            event_id=next_id(),
            start_ms=chunk.start_ms,
            end_ms=chunk.end_ms,
            modality="speech",
            summary=chunk.text,
            source_ids=[chunk.chunk_id],
        ))

    # On-screen text: instantaneous events at the OCR'd frame's timestamp.
    for ocr in ocr_results:
        events.append(TimelineEvent(
            event_id=next_id(),
            start_ms=ocr.timestamp_ms,
            end_ms=None,
            modality="ocr",
            summary=f"On-screen text: {ocr.text}",
            source_ids=[ocr.frame_id],
        ))

    # Visual markers: only shot-boundary keyframes, so base-rate frames do
    # not flood the timeline.
    for frame in frames:
        if "shot_boundary" not in frame.sampling_reason:
            continue
        events.append(TimelineEvent(
            event_id=next_id(),
            start_ms=frame.timestamp_ms,
            end_ms=None,
            modality="visual",
            summary=f"Keyframe captured ({', '.join(frame.sampling_reason)})",
            source_ids=[frame.frame_id],
        ))

    # Stable sort: start time first, then a fixed modality rank for ties.
    modality_rank = {"scene_change": 0, "visual": 1, "speech": 2, "ocr": 3}
    events.sort(key=lambda e: (e.start_ms, modality_rank.get(e.modality, 9)))

    counts = {"speech": 0, "ocr": 0, "scene_change": 0, "visual": 0}
    for event in events:
        if event.modality in counts:
            counts[event.modality] += 1
    logger.info(
        "Built timeline with %d events: %d speech, %d ocr, %d scene_change, %d visual",
        len(events),
        counts["speech"],
        counts["ocr"],
        counts["scene_change"],
        counts["visual"],
    )

    return events