vidclaude 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ """Intent classification for query-conditioned processing.
2
+
3
+ Classifies user questions into intent categories and returns
4
+ processing configuration that adjusts pipeline behavior.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+
11
+
12
# Recognized intent categories.
INTENT_DESCRIBE = "describe"
INTENT_MOMENT = "moment_retrieval"
INTENT_TEMPORAL = "temporal_ordering"
INTENT_COUNTING = "counting"
INTENT_OCR = "ocr_extraction"
INTENT_SPEECH = "speech_understanding"
INTENT_GENERAL = "general_qa"

# Ordered classification rules. They are scanned top to bottom and the
# first intent with any matching pattern wins, so more specific intents
# (counting, temporal) are listed before broader ones (describe).
_INTENT_PATTERNS: list[tuple[str, list[str]]] = [
    (INTENT_COUNTING, [
        r"\bhow many\b", r"\bcount\b", r"\bhow often\b",
        r"\bnumber of\b", r"\bfrequency\b",
    ]),
    (INTENT_TEMPORAL, [
        r"\bbefore\b.*\bafter\b", r"\bafter\b.*\bbefore\b",
        r"\bfirst\b.*\bthen\b", r"\border\b", r"\bsequence\b",
        r"\bbefore or after\b", r"\bwhich came first\b",
        r"\bchronolog", r"\btimeline\b",
    ]),
    (INTENT_MOMENT, [
        r"\bwhen\b", r"\bat what point\b", r"\bat what time\b",
        r"\bwhat time\b", r"\btimestamp\b", r"\bmoment\b",
        r"\bfind the part\b", r"\bshow me where\b",
    ]),
    (INTENT_OCR, [
        r"\btext\b", r"\bread\b", r"\bsign\b", r"\bslide\b",
        r"\bwritten\b", r"\bscreen\b", r"\bdisplay\b",
        r"\btitle\b", r"\bcaption\b", r"\bsubtitle\b",
        r"\bwhat does it say\b", r"\bwhat is written\b",
    ]),
    (INTENT_SPEECH, [
        r"\bsay\b", r"\bsaid\b", r"\bmention\b", r"\bspeak\b",
        r"\bspoke\b", r"\btalk\b", r"\bword\b", r"\bdiscuss\b",
        r"\bconversat", r"\bdialog",
    ]),
    (INTENT_DESCRIBE, [
        r"\bdescribe\b", r"\bsummar", r"\bwhat happens\b",
        r"\bwhat is happening\b", r"\boverview\b", r"\bwhat.s going on\b",
    ]),
]


def classify_intent(question: str | None) -> str:
    """Classify a user question into an intent category.

    Args:
        question: The raw user question, or None/empty when the caller
            has no question (pure "describe the video" usage).

    Returns:
        One of the INTENT_* constants. A missing question defaults to
        INTENT_DESCRIBE; a question matching no rule falls through to
        INTENT_GENERAL.
    """
    if not question:
        return INTENT_DESCRIBE

    normalized = question.strip().lower()

    # First matching rule wins (rules are ordered most-specific first).
    for intent, rules in _INTENT_PATTERNS:
        if any(re.search(rule, normalized) for rule in rules):
            return intent

    return INTENT_GENERAL


def _make_config(
    *,
    prioritize_ocr: bool = False,
    prioritize_transcript: bool = False,
    dense_sampling: bool = False,
) -> dict:
    """Build one processing-config dict; flags default to off."""
    return {
        "prioritize_ocr": prioritize_ocr,
        "prioritize_transcript": prioritize_transcript,
        "dense_sampling": dense_sampling,
    }


# Processing configuration per intent — tunes downstream pipeline behavior.
_INTENT_CONFIGS: dict[str, dict] = {
    INTENT_DESCRIBE: _make_config(),
    INTENT_MOMENT: _make_config(prioritize_transcript=True),
    INTENT_TEMPORAL: _make_config(prioritize_transcript=True),
    INTENT_COUNTING: _make_config(dense_sampling=True),
    INTENT_OCR: _make_config(prioritize_ocr=True),
    INTENT_SPEECH: _make_config(prioritize_transcript=True),
    INTENT_GENERAL: _make_config(),
}


def get_processing_config(intent: str) -> dict:
    """Return the processing configuration for an intent class.

    Unknown intents fall back to the INTENT_GENERAL config. A shallow
    copy is returned so callers may mutate it without affecting the
    shared table.
    """
    config = _INTENT_CONFIGS.get(intent)
    if config is None:
        config = _INTENT_CONFIGS[INTENT_GENERAL]
    return dict(config)
@@ -0,0 +1,174 @@
1
+ """Layer I: Hierarchical memory and summarization.
2
+
3
+ For longer videos, builds multi-level summaries:
4
+ - Atomic: individual timeline events (already exist)
5
+ - Scene: 60-second window summaries
6
+ - Chapter: 5-minute window summaries (for videos >30min)
7
+ - Global: overall video summary
8
+
9
+ In skill mode, summaries are text-based (no API calls).
10
+ In API mode, Claude generates them.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+
17
+ from .models import TimelineEvent
18
+ from .util import ms_to_hhmmss
19
+
20
+ logger = logging.getLogger("vidclaude")
21
+
22
+
23
def build_summaries(
    timeline: list[TimelineEvent],
    duration_sec: float,
    mode: str = "standard",
) -> dict:
    """Build hierarchical summaries from timeline events.

    Args:
        timeline: Unified timeline events from all modalities.
        duration_sec: Total video duration in seconds.
        mode: "quick" (skip summaries), "standard", or "deep".

    Returns:
        Dict with keys scene_summaries, chapter_summaries, global_summary.
        In skill mode these are text-based aggregations (no API calls).
    """
    scenes: list[str] = []
    chapters: list[str] = []
    global_summary = ""

    if mode == "quick":
        # Quick mode produces no summaries at all.
        pass
    elif mode == "standard" and duration_sec < 300:
        # Short videos (<5 min in standard mode): a flat global summary
        # straight from the events is enough; skip the hierarchy.
        global_summary = _build_global_from_events(timeline)
    else:
        # Scene level: 60-second windows.
        scenes = _build_window_summaries(timeline, window_ms=60000)

        # Chapter level (5-minute windows) for long videos (>30 min) or deep mode.
        if duration_sec > 1800 or mode == "deep":
            chapters = _build_window_summaries(timeline, window_ms=300000)

        # Global summary is derived from the scene summaries.
        global_summary = _build_global_from_scenes(scenes, duration_sec)

        logger.info(
            "Built summaries: %d scene, %d chapter",
            len(scenes),
            len(chapters),
        )

    return {
        "scene_summaries": scenes,
        "chapter_summaries": chapters,
        "global_summary": global_summary,
    }
67
+
68
+
69
+ def _build_window_summaries(
70
+ events: list[TimelineEvent],
71
+ window_ms: int,
72
+ ) -> list[str]:
73
+ """Group events into time windows and summarize each."""
74
+ if not events:
75
+ return []
76
+
77
+ max_ms = max(e.start_ms for e in events)
78
+ summaries = []
79
+ window_start = 0
80
+
81
+ while window_start <= max_ms:
82
+ window_end = window_start + window_ms
83
+ window_events = [
84
+ e for e in events
85
+ if window_start <= e.start_ms < window_end
86
+ ]
87
+
88
+ if window_events:
89
+ summary = _summarize_window(window_start, window_end, window_events)
90
+ summaries.append(summary)
91
+
92
+ window_start = window_end
93
+
94
+ return summaries
95
+
96
+
97
def _summarize_window(
    start_ms: int,
    end_ms: int,
    events: list[TimelineEvent],
) -> str:
    """Render a text summary of the events in one time window.

    Produces one header line with the window's time range, then one line
    per present modality group (scene changes, speech, OCR text), each
    truncated to keep the summary compact.
    """
    lines = [f"[{ms_to_hhmmss(start_ms)} - {ms_to_hhmmss(end_ms)}]"]

    # Group events by modality once instead of filtering repeatedly.
    by_modality: dict[str, list[TimelineEvent]] = {}
    for event in events:
        by_modality.setdefault(event.modality, []).append(event)

    cuts = by_modality.get("scene_change", [])
    if cuts:
        lines.append(f" {len(cuts)} scene change(s)")

    spoken = by_modality.get("speech", [])
    if spoken:
        # Concatenate speech text, capped at 300 characters.
        speech_text = " ".join(event.summary for event in spoken)
        if len(speech_text) > 300:
            speech_text = speech_text[:297] + "..."
        lines.append(f" Speech: {speech_text}")

    on_screen = by_modality.get("ocr", [])
    if on_screen:
        # Join OCR snippets, capped at 200 characters.
        ocr_texts = "; ".join(event.summary for event in on_screen)
        if len(ocr_texts) > 200:
            ocr_texts = ocr_texts[:197] + "..."
        lines.append(f" {ocr_texts}")

    return "\n".join(lines)
130
+
131
+
132
+ def _build_global_from_events(events: list[TimelineEvent]) -> str:
133
+ """Build a simple global summary directly from events."""
134
+ if not events:
135
+ return "No events detected in video."
136
+
137
+ modality_counts = {}
138
+ for e in events:
139
+ modality_counts[e.modality] = modality_counts.get(e.modality, 0) + 1
140
+
141
+ parts = ["Video contains:"]
142
+ for mod, count in sorted(modality_counts.items()):
143
+ parts.append(f" - {count} {mod} event(s)")
144
+
145
+ # Include first few speech segments as overview
146
+ speech = [e for e in events if e.modality == "speech"]
147
+ if speech:
148
+ preview = " ".join(e.summary for e in speech[:5])
149
+ if len(preview) > 500:
150
+ preview = preview[:497] + "..."
151
+ parts.append(f"\nSpeech preview: {preview}")
152
+
153
+ return "\n".join(parts)
154
+
155
+
156
+ def _build_global_from_scenes(scene_summaries: list[str], duration_sec: float) -> str:
157
+ """Build global summary from scene-level summaries."""
158
+ if not scene_summaries:
159
+ return "No content detected."
160
+
161
+ duration_str = ms_to_hhmmss(int(duration_sec * 1000))
162
+ parts = [
163
+ f"Video duration: {duration_str}",
164
+ f"Total scenes: {len(scene_summaries)}",
165
+ "",
166
+ "Scene overview:",
167
+ ]
168
+
169
+ for i, summary in enumerate(scene_summaries):
170
+ # Just include the first line of each scene summary
171
+ first_line = summary.split("\n")[0]
172
+ parts.append(f" {i+1}. {first_line}")
173
+
174
+ return "\n".join(parts)
@@ -0,0 +1,162 @@
1
+ """Data models for video understanding pipeline.
2
+
3
+ All types are time-grounded with timestamp_ms or start_ms/end_ms fields.
4
+ Includes JSON serialization for caching.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass, field, asdict
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+
15
@dataclass
class VideoMeta:
    """Layer A output: validated video metadata."""
    path: str
    duration_sec: float
    fps: float
    resolution: tuple[int, int]
    audio_tracks: int
    format: str
    file_size_bytes: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict (the resolution tuple becomes a list)."""
        payload = asdict(self)
        payload["resolution"] = list(payload["resolution"])
        return payload

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> VideoMeta:
        """Rebuild from a dict produced by to_dict()."""
        # Restore the tuple form of resolution without mutating the input.
        return cls(**{**d, "resolution": tuple(d["resolution"])})
36
+
37
+
38
@dataclass
class Shot:
    """Layer B output: a detected shot boundary segment."""
    shot_id: str
    start_ms: int
    end_ms: int
    # Scene-change confidence at the boundary.
    score: float

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Shot:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
52
+
53
+
54
@dataclass
class Frame:
    """Layer C output: an extracted frame with metadata."""
    frame_id: str
    timestamp_ms: int
    shot_id: str
    # Why the frame was sampled, e.g. ["base", "shot_boundary", "high_motion"].
    sampling_reason: list[str]
    image_path: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> Frame:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
69
+
70
+
71
@dataclass
class TranscriptChunk:
    """Layer D output: a timestamped speech segment."""
    chunk_id: str
    start_ms: int
    end_ms: int
    text: str
    # Speaker label when diarization is available; None otherwise.
    speaker: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TranscriptChunk:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
86
+
87
+
88
@dataclass
class OCRResult:
    """Layer E output: text extracted from a frame."""
    frame_id: str
    timestamp_ms: int
    text: str
    confidence: float

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> OCRResult:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
102
+
103
+
104
@dataclass
class TimelineEvent:
    """Layer G output: a unified event from any modality."""
    event_id: str
    start_ms: int
    # None for instantaneous events.
    end_ms: int | None
    # One of: "visual" | "speech" | "ocr" | "scene_change".
    modality: str
    summary: str
    # IDs of the upstream records this event was derived from.
    source_ids: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> TimelineEvent:
        """Rebuild from a dict produced by to_dict()."""
        return cls(**d)
120
+
121
+
122
@dataclass
class Evidence:
    """Layer J input: the assembled context pack for Claude reasoning."""
    video_meta: VideoMeta
    question: str
    intent: str
    frames: list[Frame]
    transcript_chunks: list[TranscriptChunk]
    ocr_results: list[OCRResult]
    timeline_events: list[TimelineEvent]
    scene_summaries: list[str] = field(default_factory=list)
    global_summary: str = ""
    cache_dir: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict, recursing into nested models."""
        payload: dict[str, Any] = {
            "video_meta": self.video_meta.to_dict(),
            "question": self.question,
            "intent": self.intent,
        }
        payload["frames"] = [item.to_dict() for item in self.frames]
        payload["transcript_chunks"] = [item.to_dict() for item in self.transcript_chunks]
        payload["ocr_results"] = [item.to_dict() for item in self.ocr_results]
        payload["timeline_events"] = [item.to_dict() for item in self.timeline_events]
        payload["scene_summaries"] = self.scene_summaries
        payload["global_summary"] = self.global_summary
        payload["cache_dir"] = self.cache_dir
        return payload
149
+
150
+
151
+ # --- JSON file helpers ---
152
+
153
def save_json(data: list[dict] | dict, path: str | Path) -> None:
    """Write *data* to *path* as pretty-printed UTF-8 JSON."""
    text = json.dumps(data, indent=2, ensure_ascii=False)
    Path(path).write_text(text, encoding="utf-8")
157
+
158
+
159
def load_json(path: str | Path) -> Any:
    """Read and parse a UTF-8 JSON file at *path*."""
    return json.loads(Path(path).read_text(encoding="utf-8"))
@@ -0,0 +1,110 @@
1
+ """Layer E: OCR extraction from video frames using pytesseract."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ from .models import Frame, OCRResult
9
+
10
+ logger = logging.getLogger("vidclaude")
11
+
12
+
13
def extract_ocr(
    frames: list[Frame],
    no_ocr: bool = False,
    mode: str = "standard",
) -> list[OCRResult]:
    """Extract text from frames using pytesseract.

    Args:
        frames: List of extracted frames.
        no_ocr: If True, skip OCR entirely.
        mode: Processing mode — "quick" skips OCR, "standard" does keyframes,
            "deep" does all frames.

    Returns:
        List of OCRResult for frames where confident text was detected.
        Confidence is the mean Tesseract word confidence, normalized to
        0.0-1.0, computed over exactly the words that were kept.
    """
    if no_ocr:
        logger.info("OCR skipped (--no-ocr)")
        return []

    if mode == "quick":
        logger.info("OCR skipped in quick mode")
        return []

    # pytesseract is an optional dependency; degrade gracefully without it.
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        logger.warning(
            "pytesseract not installed. Skipping OCR. "
            "Install with: pip install pytesseract "
            "(also requires Tesseract system binary)"
        )
        return []

    # In standard mode, only process keyframes (shot boundaries + every Nth frame)
    selected = _select_keyframes(frames) if mode == "standard" else frames

    logger.info("Running OCR on %d frame(s)...", len(selected))

    results: list[OCRResult] = []
    for frame in selected:
        if not Path(frame.image_path).exists():
            continue

        try:
            img = Image.open(frame.image_path)
            data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            # Keep word blocks with confidence > 60 and non-empty text.
            # FIX: the confidence average previously summed confidences of
            # ALL blocks with conf > 60 (including empty-text blocks) but
            # divided by the count of kept words, which could inflate the
            # reported confidence above 1.0. Collect (word, conf) pairs in
            # one pass and average over exactly the kept words.
            texts: list[str] = []
            confs: list[float] = []
            for word, conf in zip(data["text"], data["conf"]):
                try:
                    # FIX: newer pytesseract/Tesseract may report
                    # confidences as float strings (e.g. "96.0"), which
                    # int() rejects; parse via float() to accept both.
                    conf_val = float(conf)
                except (ValueError, TypeError):
                    continue
                word = word.strip()
                if conf_val > 60 and word:
                    texts.append(word)
                    confs.append(conf_val)

            if texts:
                results.append(OCRResult(
                    frame_id=frame.frame_id,
                    timestamp_ms=frame.timestamp_ms,
                    text=" ".join(texts),
                    confidence=(sum(confs) / len(confs)) / 100.0,
                ))
        except Exception as e:
            # OCR failure on one frame is non-fatal; keep going.
            logger.debug("OCR failed on %s: %s", frame.frame_id, e)
            continue

    logger.info("OCR found text in %d frame(s)", len(results))
    return results
94
+
95
+
96
+ def _select_keyframes(frames: list[Frame]) -> list[Frame]:
97
+ """Select keyframes for OCR in standard mode.
98
+
99
+ Picks: shot boundary frames + every 5th base frame.
100
+ """
101
+ selected = []
102
+ base_count = 0
103
+ for frame in frames:
104
+ if "shot_boundary" in frame.sampling_reason:
105
+ selected.append(frame)
106
+ elif "base" in frame.sampling_reason:
107
+ if base_count % 5 == 0:
108
+ selected.append(frame)
109
+ base_count += 1
110
+ return selected if selected else frames[:5]