slidecap 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
slidecap/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """slidecap package."""
2
+
3
+ from .core import PipelineResult, run_pipeline
4
+
5
+ __all__ = ["run_pipeline", "PipelineResult"]
slidecap/cli.py ADDED
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import logging
6
+ import sys
7
+
8
+ try:
9
+ from .core import run_pipeline
10
+ except ImportError: # pragma: no cover - enables direct script invocation
11
+ from core import run_pipeline # type: ignore[no-redef]
12
+
13
+
14
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the slidecap CLI.

    Returns:
        argparse.ArgumentParser: Parser exposing every slidecap option.
    """
    p = argparse.ArgumentParser(
        prog="slidecap",
        description="Generate slide-aligned markdown notes from a YouTube URL.",
    )

    # Required input.
    p.add_argument("--url", required=True, help="YouTube URL to process.")

    # Output locations (both default to paths under ./slidecap/).
    p.add_argument(
        "--out-md",
        default=None,
        help="Output markdown file path. Defaults to ./slidecap/<video title>.md",
    )
    p.add_argument(
        "--images-dir",
        default=None,
        help="Directory where slide images are written. Defaults to ./slidecap/slides/",
    )

    # Slide-detection tuning.
    p.add_argument(
        "--similarity-threshold",
        type=float,
        default=0.85,
        help="Slide detection sensitivity threshold (default: 0.85).",
    )
    p.add_argument(
        "--sample-rate",
        type=float,
        default=1.0,
        help="Frame sampling rate in seconds (default: 1.0).",
    )

    # Transcription options.
    p.add_argument(
        "--whisper-model",
        default="base",
        help=(
            "Whisper model name. Default: base. "
            "Multilingual: tiny, base, small, medium, large, large-v2, large-v3, turbo. "
            "English-only (faster): tiny.en, base.en, small.en, medium.en."
        ),
    )
    p.add_argument("--language", default=None, help="Optional transcription language code (e.g. en).")

    # Image output options.
    p.add_argument(
        "--image-format",
        default="jpg",
        choices=["jpg", "png"],
        help="Image format for saved slides (default: jpg).",
    )
    p.add_argument(
        "--image-quality",
        type=int,
        default=90,
        help="JPEG quality 1-100 (used only for jpg output). Default: 90.",
    )

    # Behavior flags.
    p.add_argument("--overwrite", action="store_true", help="Overwrite output markdown if it exists.")
    p.add_argument("--keep-temp", action="store_true", help="Keep temporary downloaded files for debugging.")
    p.add_argument(
        "--allow-lower-quality",
        action="store_true",
        help="Fallback to sub-1080p quality if exact 1080p is not available.",
    )
    p.add_argument(
        "--log-level",
        default="info",
        choices=["debug", "info", "warn", "error"],
        help="Logging verbosity (default: info).",
    )
    p.add_argument(
        "--json",
        action="store_true",
        help="Print structured JSON output for agent workflows.",
    )
    return p
83
+
84
+
85
+ def _validate_args(args: argparse.Namespace) -> None:
86
+ if not (0.5 <= args.similarity_threshold <= 0.95):
87
+ raise ValueError("--similarity-threshold must be between 0.5 and 0.95.")
88
+ if args.sample_rate <= 0:
89
+ raise ValueError("--sample-rate must be > 0.")
90
+ if not (1 <= args.image_quality <= 100):
91
+ raise ValueError("--image-quality must be between 1 and 100.")
92
+
93
+
94
def main() -> int:
    """CLI entry point.

    Parses arguments, configures logging, runs the pipeline, and prints
    either the markdown path or a structured JSON payload.

    Returns:
        int: Process exit code — 0 on success, 1 on any failure.
    """
    args = build_parser().parse_args()

    verbosity = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warn": logging.WARNING,
        "error": logging.ERROR,
    }[args.log_level]
    logging.basicConfig(level=verbosity, format="%(levelname)s: %(message)s")

    try:
        _validate_args(args)
        result = run_pipeline(args)
    except Exception as exc:  # noqa: BLE001
        # In JSON mode errors go to stdout as a machine-readable payload;
        # otherwise they are logged for humans.
        if not args.json:
            logging.error(str(exc))
        else:
            payload = {
                "status": "error",
                "error": str(exc),
                "error_type": type(exc).__name__,
            }
            print(json.dumps(payload, ensure_ascii=True))
        return 1

    print(json.dumps(result.to_dict(), ensure_ascii=True) if args.json else result.output_markdown)
    return 0
126
+
127
+
128
# Support direct execution (python slidecap/cli.py) in addition to the
# console-script entry point declared in the package metadata.
if __name__ == "__main__":
    sys.exit(main())
slidecap/core.py ADDED
@@ -0,0 +1,530 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import logging
6
+ import os
7
+ import re
8
+ import shutil
9
+ import subprocess
10
+ import tempfile
11
+ import time
12
+ from dataclasses import dataclass
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from typing import Iterable
16
+ from urllib.parse import parse_qs, urlparse
17
+
18
+ import cv2
19
+ import whisper
20
+ from PIL import Image
21
+ from skimage.metrics import structural_similarity as ssim
22
+
23
+ LOGGER = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class Slide:
    """A single detected slide with its aligned transcript and links."""

    # 1-based slide index within the video.
    slide_num: int
    # Video offset of the detected slide change, in seconds.
    timestamp: float
    # Image path as written into the markdown (relative to the markdown file).
    image_path: str
    # Transcript text aligned to this slide's time span.
    transcript: str
    # Timestamped YouTube deep link for this slide.
    youtube_link: str
33
+
34
+
35
@dataclass
class PipelineResult:
    """Structured summary of one completed pipeline run.

    Instances are JSON-serialized (via :meth:`to_dict`) when the CLI is
    invoked with ``--json``.
    """

    status: str
    url: str
    video_id: str
    output_markdown: str
    images_dir: str
    slide_count: int
    image_files: list[str]
    downloaded_resolution: str
    downloaded_fps: str
    format_note: str
    download_format: str
    similarity_threshold: float
    sample_rate: float
    whisper_model: str
    language: str | None
    started_at: str
    completed_at: str
    duration_seconds: float
    warnings: list[str]

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict with one key per field.

        Field declaration order is preserved, matching the dataclass's
        ``__init__`` assignment order.
        """
        return dict(vars(self))
79
+
80
+
81
+ def _check_runtime_dependencies() -> None:
82
+ missing = [cmd for cmd in ("yt-dlp", "ffmpeg") if shutil.which(cmd) is None]
83
+ if missing:
84
+ missing_list = ", ".join(missing)
85
+ raise RuntimeError(f"Missing required system binaries: {missing_list}")
86
+
87
+
88
+ def _extract_video_id(youtube_url: str) -> str:
89
+ parsed = urlparse(youtube_url)
90
+ host = parsed.netloc.lower()
91
+
92
+ if "youtu.be" in host:
93
+ candidate = parsed.path.strip("/").split("/")[0]
94
+ if candidate:
95
+ return candidate
96
+
97
+ if "youtube.com" in host:
98
+ params = parse_qs(parsed.query)
99
+ video_ids = params.get("v", [])
100
+ if video_ids:
101
+ return video_ids[0]
102
+
103
+ parts = parsed.path.strip("/").split("/")
104
+ if len(parts) >= 2 and parts[0] in {"shorts", "embed", "live"}:
105
+ return parts[1]
106
+
107
+ return "unknown_video"
108
+
109
+
110
+ def _canonical_watch_url(youtube_url: str, video_id: str) -> str:
111
+ if video_id != "unknown_video":
112
+ return f"https://www.youtube.com/watch?v={video_id}"
113
+ parsed = urlparse(youtube_url)
114
+ path = parsed.path
115
+ if not path:
116
+ return youtube_url
117
+ return f"{parsed.scheme}://{parsed.netloc}{path}"
118
+
119
+
120
def generate_youtube_timestamp_url(youtube_url: str, timestamp_seconds: float) -> str:
    """Build a YouTube deep link that starts playback at *timestamp_seconds*.

    The timestamp is truncated to whole seconds and appended with the
    separator appropriate to whether the base URL already carries a query.
    """
    vid = _extract_video_id(youtube_url)
    base = _canonical_watch_url(youtube_url, vid)
    separator = "&" if "?" in base else "?"
    return f"{base}{separator}t={int(timestamp_seconds)}s"
124
+
125
+
126
def build_1080p_format_string(allow_lower_quality: bool) -> str:
    """Return the yt-dlp format selector used for the download step.

    Strict mode demands exactly 1080p; with *allow_lower_quality* the
    selector chains progressively smaller resolutions and finally "best".
    """
    exact_1080 = "bestvideo[height=1080]+bestaudio/best[height=1080]"
    if not allow_lower_quality:
        return exact_1080

    lower_tiers = "/".join(
        [
            "bestvideo[height<=720]+bestaudio/best[height<=720]",
            "bestvideo[height<=480]+bestaudio/best[height<=480]",
            "bestvideo[height<=360]+bestaudio/best[height<=360]/best",
        ]
    )
    return f"{exact_1080}/{lower_tiers}"
137
+
138
+
139
def _run_command(cmd: Iterable[str]) -> subprocess.CompletedProcess[str]:
    """Run *cmd* with captured text output and return the completed process.

    Args:
        cmd: Command and arguments. May be any iterable, including a
            one-shot generator.

    Returns:
        subprocess.CompletedProcess: callers inspect ``.returncode``,
        ``.stdout`` and ``.stderr``; no exception is raised on failure.
    """
    # Materialize the iterable first: joining a one-shot iterator for the
    # log line would exhaust it, leaving subprocess.run an empty argv.
    cmd_list = list(cmd)
    LOGGER.debug("Running command: %s", " ".join(cmd_list))
    return subprocess.run(cmd_list, capture_output=True, text=True)
142
+
143
+
144
def _download_video(youtube_url: str, video_path: Path, allow_lower_quality: bool) -> tuple[Path, str]:
    """Download *youtube_url* to *video_path* via yt-dlp.

    Returns:
        tuple[Path, str]: The video path and the format selector requested.

    Raises:
        RuntimeError: When yt-dlp exits non-zero.
    """
    format_string = build_1080p_format_string(allow_lower_quality)
    command = [
        "yt-dlp",
        "-f", format_string,
        "--merge-output-format", "mp4",
        "--write-info-json",  # sidecar .info.json consumed by _read_download_info
        "--no-playlist",
        "-o", str(video_path),
        youtube_url,
    ]
    completed = _run_command(command)
    if completed.returncode != 0:
        raise RuntimeError(f"yt-dlp download failed:\n{completed.stderr.strip()}")
    LOGGER.info("Video downloaded: %s", video_path)
    return video_path, format_string
163
+
164
+
165
def _extract_audio(video_path: Path, audio_path: Path) -> None:
    """Extract the audio track from *video_path* into *audio_path* via ffmpeg.

    Raises:
        RuntimeError: When ffmpeg exits non-zero.
    """
    command = [
        "ffmpeg",
        "-i", str(video_path),
        "-q:a", "0",  # best VBR audio quality
        "-map", "a",  # audio streams only
        str(audio_path),
        "-y",  # overwrite output without prompting
    ]
    completed = _run_command(command)
    if completed.returncode != 0:
        raise RuntimeError(f"ffmpeg audio extraction failed:\n{completed.stderr.strip()}")
    LOGGER.info("Audio extracted: %s", audio_path)
171
+
172
+
173
def detect_slides(video_path: Path, similarity_threshold: float = 0.85, sample_rate: float = 1.0) -> list[tuple[float, Image.Image]]:
    """Detect slide transitions in a video via SSIM frame comparison.

    Frames are sampled roughly every *sample_rate* seconds, converted to
    grayscale, and compared against the previous sampled frame with
    structural similarity; a similarity drop below the (height-adjusted)
    threshold marks a new slide. Returns a list of
    (timestamp_seconds, PIL image) pairs. If no transition is found at all,
    the first frame of the video is returned as a single slide.

    Raises:
        RuntimeError: When the video FPS cannot be read.
    """
    cap = cv2.VideoCapture(str(video_path))
    cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*"mp4v"))
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0)
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0)

    if fps <= 0:
        cap.release()
        raise RuntimeError("Failed to read video FPS; cannot perform slide detection.")

    LOGGER.info("Video info: %sx%s, %.1f FPS, %s frames", width, height, fps, total_frames)

    # Raise the threshold slightly for higher resolutions: sharper frames
    # produce higher baseline SSIM, so the cut-off must follow.
    if height >= 1080:
        adjusted_threshold = similarity_threshold * 1.05
        LOGGER.info("High quality video detected. Adjusted threshold: %.3f", adjusted_threshold)
    elif height >= 720:
        adjusted_threshold = similarity_threshold * 1.02
        LOGGER.info("Medium quality video detected. Adjusted threshold: %.3f", adjusted_threshold)
    else:
        adjusted_threshold = similarity_threshold
        LOGGER.info("Standard quality video detected. Threshold: %.3f", adjusted_threshold)

    slides: list[tuple[float, Image.Image]] = []
    prev_frame = None  # previous sampled grayscale frame, or None before the first sample
    frame_count = 0

    # Compare one frame roughly every `sample_rate` seconds (at least 1 frame).
    sample_interval = int(fps * sample_rate)
    sample_interval = max(sample_interval, 1)

    # For 1080p+ input, never sample more often than twice per second.
    if height >= 1080:
        sample_interval = max(sample_interval, int(fps * 0.5))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        if frame_count % sample_interval != 0:
            continue

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if height >= 1080:
            # Mild blur suppresses per-pixel noise that would otherwise
            # depress SSIM on high-resolution frames.
            gray = cv2.GaussianBlur(gray, (3, 3), 0)

        if prev_frame is not None:
            similarity = ssim(prev_frame, gray)
            if similarity < adjusted_threshold:
                timestamp = frame_count / fps
                # OpenCV frames are BGR; convert before handing to PIL.
                img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(img_rgb)
                slides.append((timestamp, pil_img))
                LOGGER.info(
                    "Slide detected at %.1fs (similarity %.3f < threshold %.3f)",
                    timestamp,
                    similarity,
                    adjusted_threshold,
                )

        prev_frame = gray

    cap.release()

    # Fallback: guarantee at least one slide by using the very first frame.
    if not slides:
        fallback_cap = cv2.VideoCapture(str(video_path))
        fallback_cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*"mp4v"))
        fallback_cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
        ret, frame = fallback_cap.read()
        if ret:
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_img = Image.fromarray(img_rgb)
            slides.insert(0, (0.0, pil_img))
        fallback_cap.release()

    return slides
258
+
259
+
260
def _transcribe_audio(audio_path: Path, whisper_model: str, language: str | None) -> tuple[str, list[dict]]:
    """Transcribe *audio_path* with Whisper.

    Returns:
        tuple[str, list[dict]]: The full transcript text (stripped) and the
        raw segment dicts used later for slide alignment.
    """
    LOGGER.info("Loading Whisper model: %s", whisper_model)
    model = whisper.load_model(whisper_model)
    # Only forward the language when the caller supplied one; otherwise
    # Whisper auto-detects.
    transcribe_kwargs = {"language": language} if language else {}
    LOGGER.info("Transcribing audio...")
    output = model.transcribe(str(audio_path), **transcribe_kwargs)
    full_text = output.get("text", "").strip()
    return full_text, output.get("segments", [])
267
+
268
+
269
+ def _align_transcript_to_slides(segments: list[dict], slide_timestamps: list[float]) -> list[str]:
270
+ """Assign Whisper segments to slides based on actual segment start times."""
271
+ if not slide_timestamps:
272
+ return []
273
+
274
+ chunks: list[str] = []
275
+ for i, timestamp in enumerate(slide_timestamps):
276
+ next_timestamp = slide_timestamps[i + 1] if i + 1 < len(slide_timestamps) else float("inf")
277
+
278
+ if i == 0:
279
+ # First slide gets all segments that start before the second slide
280
+ slide_segs = [seg for seg in segments if seg["start"] < next_timestamp]
281
+ else:
282
+ slide_segs = [seg for seg in segments if timestamp <= seg["start"] < next_timestamp]
283
+
284
+ chunks.append(" ".join(seg["text"].strip() for seg in slide_segs))
285
+
286
+ return chunks
287
+
288
+
289
def _write_slide_images(
    slides: list[tuple[float, Image.Image]],
    images_dir: Path,
    video_id: str,
    image_format: str,
    image_quality: int,
) -> list[Path]:
    """Write slide images to *images_dir* and return the created paths.

    File names encode video id, 1-based slide number, and whole-second
    timestamp so output from different videos never collides.
    """
    images_dir.mkdir(parents=True, exist_ok=True)

    use_jpeg = image_format == "jpg"
    pil_format = "JPEG" if use_jpeg else "PNG"
    extension = "jpg" if use_jpeg else "png"

    written: list[Path] = []
    for slide_num, (timestamp, image) in enumerate(slides, start=1):
        name = f"yt_{video_id}_slide_{slide_num:03d}_t{int(timestamp):06d}.{extension}"
        destination = images_dir / name
        if use_jpeg:
            if image.mode in ("RGBA", "LA", "P"):
                # JPEG has no alpha channel: flatten onto a white background.
                flattened = Image.new("RGB", image.size, (255, 255, 255))
                alpha = image.split()[-1] if image.mode == "RGBA" else None
                flattened.paste(image, mask=alpha)
                image = flattened
            image.save(destination, format=pil_format, quality=image_quality, optimize=True)
        else:
            image.save(destination, format=pil_format, optimize=True)
        written.append(destination)

    return written
316
+
317
+
318
+ def _anchor_for_slide(slide_num: int, timestamp: float) -> str:
319
+ ts = f"{timestamp:.1f}".replace(".", "")
320
+ return f"slide-{slide_num}-{ts}s"
321
+
322
+
323
def _build_markdown(
    youtube_url: str,
    video_id: str,
    slide_records: list[Slide],
    format_string: str,
) -> str:
    """Render the final markdown document: header, TOC, then slide sections.

    Each slide section carries an explicit HTML anchor so the TOC links
    work regardless of the markdown renderer's heading-slug rules.
    """
    generated = datetime.now(timezone.utc).isoformat()

    parts: list[str] = [
        "# YouTube Slide Transcript",
        "",
        f"- Source: {youtube_url}",
        f"- Video ID: {video_id}",
        f"- Generated: {generated}",
        f"- Slide Count: {len(slide_records)}",
        f"- Download Format: `{format_string}`",
        "",
        "## Table of Contents",
    ]

    # TOC entries link to the per-slide anchors emitted below.
    parts.extend(
        f"- [Slide {s.slide_num} ({s.timestamp:.1f}s)](#{_anchor_for_slide(s.slide_num, s.timestamp)})"
        for s in slide_records
    )
    parts += ["", "---", ""]

    for s in slide_records:
        anchor = _anchor_for_slide(s.slide_num, s.timestamp)
        parts += [
            f"## Slide {s.slide_num} ({s.timestamp:.1f}s)",
            f'<a id="{anchor}"></a>',
            "",
            f"YouTube Link: {s.youtube_link}",
            "",
            f"![Slide {s.slide_num}]({s.image_path})",
            "",
            "Transcript:",
            s.transcript or "_No transcript text for this segment._",
            "",
            "---",
            "",
        ]

    return "\n".join(parts).rstrip() + "\n"
368
+
369
+
370
+ def _read_download_info(video_path: Path) -> dict:
371
+ info_path = video_path.with_suffix(".info.json")
372
+ if not info_path.exists():
373
+ return {}
374
+ try:
375
+ with info_path.open("r", encoding="utf-8") as f:
376
+ return json.load(f)
377
+ except (json.JSONDecodeError, OSError):
378
+ return {}
379
+
380
+
381
def _fetch_video_title(url: str) -> str | None:
    """Ask yt-dlp for the video title without downloading any media.

    Returns None when yt-dlp fails or prints an empty title, letting the
    caller fall back to the video id.
    """
    completed = _run_command(["yt-dlp", "--print", "title", "--no-playlist", url])
    if completed.returncode != 0:
        return None
    title = completed.stdout.strip()
    return title or None
389
+
390
+
391
+ def _sanitize_filename(name: str, max_length: int = 100) -> str:
392
+ """Remove filesystem-invalid characters and truncate."""
393
+ sanitized = re.sub(r'[\\/:*?"<>|]', "", name)
394
+ sanitized = re.sub(r"\s+", " ", sanitized).strip().strip(".")
395
+ return (sanitized[:max_length].rstrip() if len(sanitized) > max_length else sanitized) or "untitled"
396
+
397
+
398
def resolve_output_paths(url: str, out_md: str | None, images_dir: str | None) -> tuple[Path, Path]:
    """Resolve (markdown path, images dir), filling defaults under ./slidecap/.

    The default markdown name prefers the fetched video title, falling back
    to the extracted video id when the title lookup fails. Explicit paths
    are expanded and resolved.
    """
    default_root = Path.cwd() / "slidecap"

    if out_md is not None:
        markdown_path = Path(out_md).expanduser().resolve()
    else:
        title = _fetch_video_title(url)
        stem = _sanitize_filename(title) if title else _extract_video_id(url)
        markdown_path = default_root / f"{stem}.md"

    if images_dir is not None:
        slides_dir = Path(images_dir).expanduser().resolve()
    else:
        slides_dir = default_root / "slides"

    return markdown_path, slides_dir
415
+
416
+
417
def run_pipeline(args: argparse.Namespace) -> PipelineResult:
    """Run the full download → detect → transcribe → render pipeline.

    Stages: verify external binaries, resolve output paths, download the
    video (yt-dlp), extract audio (ffmpeg), detect slides (SSIM), transcribe
    (Whisper), align transcript chunks to slides, write slide images, and
    render the markdown document.

    Args:
        args: Parsed CLI namespace (see cli.build_parser for the fields read).

    Returns:
        PipelineResult: Structured summary of the run.

    Raises:
        RuntimeError: Missing binaries, failed download/extraction, or no
            slides detected.
        FileExistsError: Output markdown exists and --overwrite was not given.
    """
    _check_runtime_dependencies()
    started_at_dt = datetime.now(timezone.utc)
    started_at = started_at_dt.isoformat()
    start_time = time.time()

    out_md, images_dir = resolve_output_paths(args.url, args.out_md, args.images_dir)

    if out_md.exists() and not args.overwrite:
        raise FileExistsError(f"Output markdown already exists: {out_md}. Use --overwrite.")

    out_md.parent.mkdir(parents=True, exist_ok=True)
    images_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): both branches create the temp dir identically; only the
    # cleanup flag (and a log line) differs.
    if args.keep_temp:
        temp_dir = Path(tempfile.mkdtemp(prefix="slidecap_"))
        LOGGER.info("Keeping temp directory: %s", temp_dir)
        should_cleanup = False
    else:
        temp_dir = Path(tempfile.mkdtemp(prefix="slidecap_"))
        should_cleanup = True

    try:
        video_path = temp_dir / "video.mp4"
        audio_path = temp_dir / "audio.mp3"

        downloaded_video, format_string = _download_video(args.url, video_path, args.allow_lower_quality)
        # Best-effort metadata from the yt-dlp .info.json sidecar ({} on failure).
        info = _read_download_info(downloaded_video)
        if info:
            LOGGER.info(
                "Downloaded resolution: %sx%s, fps: %s, format: %s",
                info.get("width", "unknown"),
                info.get("height", "unknown"),
                info.get("fps", "unknown"),
                info.get("format_note", "unknown"),
            )

        _extract_audio(video_path, audio_path)
        slides = detect_slides(video_path, similarity_threshold=args.similarity_threshold, sample_rate=args.sample_rate)
        if not slides:
            raise RuntimeError("No slides detected.")

        _transcript, segments = _transcribe_audio(audio_path, args.whisper_model, args.language)
        slide_timestamps = [s[0] for s in slides]
        transcript_chunks = _align_transcript_to_slides(segments, slide_timestamps)
        video_id = _extract_video_id(args.url)
        image_paths = _write_slide_images(
            slides,
            images_dir,
            video_id=video_id,
            image_format=args.image_format,
            image_quality=args.image_quality,
        )

        # Combine per-slide timestamp, transcript chunk, and written image
        # into the records rendered by _build_markdown.
        slide_records: list[Slide] = []
        for idx, ((timestamp, _image), transcript_chunk, image_path) in enumerate(
            zip(slides, transcript_chunks, image_paths), start=1
        ):
            # Image links are stored relative to the markdown file so the
            # output folder can be moved as a unit.
            relative_img_path = os.path.relpath(image_path, start=out_md.parent).replace(os.sep, "/")
            slide_records.append(
                Slide(
                    slide_num=idx,
                    timestamp=timestamp,
                    image_path=relative_img_path,
                    transcript=transcript_chunk,
                    youtube_link=generate_youtube_timestamp_url(args.url, timestamp),
                )
            )

        markdown = _build_markdown(
            youtube_url=args.url,
            video_id=video_id,
            slide_records=slide_records,
            format_string=format_string,
        )
        out_md.write_text(markdown, encoding="utf-8")
        LOGGER.info("Markdown written: %s", out_md)
        LOGGER.info("Slides written to: %s", images_dir)

        # Report resolution only when both dimensions are present as ints.
        width = info.get("width")
        height = info.get("height")
        downloaded_resolution = (
            f"{width}x{height}" if isinstance(width, int) and isinstance(height, int) else "unknown"
        )
        downloaded_fps = str(info.get("fps", "unknown"))
        format_note = str(info.get("format_note", "unknown"))

        completed_at = datetime.now(timezone.utc).isoformat()
        duration_seconds = round(time.time() - start_time, 3)

        return PipelineResult(
            status="ok",
            url=args.url,
            video_id=video_id,
            output_markdown=str(out_md),
            images_dir=str(images_dir),
            slide_count=len(slide_records),
            image_files=[str(path) for path in image_paths],
            downloaded_resolution=downloaded_resolution,
            downloaded_fps=downloaded_fps,
            format_note=format_note,
            download_format=format_string,
            similarity_threshold=args.similarity_threshold,
            sample_rate=args.sample_rate,
            whisper_model=args.whisper_model,
            language=args.language,
            started_at=started_at,
            completed_at=completed_at,
            duration_seconds=duration_seconds,
            warnings=[],
        )
    finally:
        # Temp downloads are removed unless --keep-temp was requested.
        if should_cleanup:
            shutil.rmtree(temp_dir, ignore_errors=True)
@@ -0,0 +1,154 @@
1
+ Metadata-Version: 2.4
2
+ Name: slidecap
3
+ Version: 0.1.0
4
+ Summary: CLI tool to generate slide-aligned markdown notes from YouTube videos.
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: numpy
10
+ Requires-Dist: opencv-python
11
+ Requires-Dist: openai-whisper
12
+ Requires-Dist: Pillow
13
+ Requires-Dist: scikit-image
14
+ Requires-Dist: yt-dlp
15
+ Dynamic: license-file
16
+
17
+ # slidecap
18
+
19
+ CLI tool that turns a YouTube video into slide-aligned markdown notes for agent/LLM workflows.
20
+
21
+ ## What It Does
22
+
23
+ 1. Downloads a YouTube video at 1080p (strict by default).
24
+ 2. Detects slide changes using SSIM-based frame comparison of sampled grayscale frames.
25
+ 3. Transcribes audio with Whisper.
26
+ 4. Aligns transcript chunks to detected slide timestamps.
27
+ 5. Writes:
28
+ - A markdown file with slide sections + transcript text.
29
+ - Slide images to one shared images folder.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install slidecap
35
+ ```
36
+
37
+ System dependencies:
38
+ - `ffmpeg` in PATH
39
+ - `yt-dlp` in PATH
40
+
41
+ ## Usage
42
+
43
+ Minimal — just pass a URL:
44
+
45
+ ```bash
46
+ slidecap --url "https://www.youtube.com/watch?v=abc123"
47
+ ```
48
+
49
+ This creates:
50
+ ```
51
+ ./slidecap/
52
+ My Video Title.md
53
+ slides/
54
+ yt_abc123_slide_001_t000005.jpg
55
+ ...
56
+ ```
57
+
58
+ Custom output paths:
59
+
60
+ ```bash
61
+ slidecap \
62
+ --url "https://www.youtube.com/watch?v=abc123" \
63
+ --out-md "/path/to/vault/notes/abc123.md" \
64
+ --images-dir "/path/to/vault/assets/youtube-slides"
65
+ ```
66
+
67
+ JSON output (for agents):
68
+
69
+ ```bash
70
+ slidecap --url "https://www.youtube.com/watch?v=abc123" --json
71
+ ```
72
+
73
+ ## Flags
74
+
75
+ - `--url` (required): YouTube URL.
76
+ - `--out-md` (optional): Output markdown path. Defaults to `./slidecap/<video title>.md`.
77
+ - `--images-dir` (optional): Slide images folder. Defaults to `./slidecap/slides/`.
78
+ - `--similarity-threshold` (default `0.85`): Slide detection threshold.
79
+ - `--sample-rate` (default `1.0`): Frame sampling interval in seconds.
80
+ - `--whisper-model` (default `base`): Whisper model name. See [Whisper Models](#whisper-models) below.
81
+ - `--language`: Optional transcription language code (e.g. `en`, `de`, `ja`).
82
+ - `--image-format` (default `jpg`): `jpg` or `png`.
83
+ - `--image-quality` (default `90`): JPEG quality.
84
+ - `--allow-lower-quality`: Fallback below 1080p if exact 1080p is unavailable.
85
+ - `--overwrite`: Overwrite existing markdown output.
86
+ - `--keep-temp`: Keep temp downloads for debugging.
87
+ - `--log-level` (default `info`): `debug|info|warn|error`.
88
+ - `--json`: Print structured JSON result (success or error) for agent workflows.
89
+
90
+ ## Whisper Models
91
+
92
+ | Model | Speed | Accuracy | Notes |
93
+ |---|---|---|---|
94
+ | `tiny` | Fastest | Lowest | Quick drafts |
95
+ | `base` | Fast | Good | **Default** |
96
+ | `small` | Moderate | Better | Good general choice |
97
+ | `medium` | Slow | Strong | |
98
+ | `large` | Slowest | Best | |
99
+ | `large-v2` | Slowest | Best | Improved large |
100
+ | `large-v3` | Slowest | Best | Latest multilingual |
101
+ | `turbo` | Fast | Very good | Efficient alternative to large |
102
+
103
+ English-only variants (`tiny.en`, `base.en`, `small.en`, `medium.en`) are faster than their multilingual counterparts when transcribing English content.
104
+
105
+ ## Output Format
106
+
107
+ The markdown file includes:
108
+ - source metadata
109
+ - table of contents
110
+ - per-slide sections with:
111
+ - timestamped YouTube link
112
+ - relative markdown image link
113
+ - transcript chunk aligned to that slide
114
+
115
+ Images are named to avoid collisions:
116
+ - `yt_<video-id>_slide_<nnn>_t<seconds>.<ext>`
117
+
118
+ ## JSON Response Shape
119
+
120
+ Success:
121
+
122
+ ```json
123
+ {
124
+ "status": "ok",
125
+ "url": "https://www.youtube.com/watch?v=abc123",
126
+ "video_id": "abc123",
127
+ "output_markdown": "/path/to/vault/notes/abc123.md",
128
+ "images_dir": "/path/to/vault/assets/youtube-slides",
129
+ "slide_count": 12,
130
+ "image_files": [],
131
+ "downloaded_resolution": "1920x1080",
132
+ "downloaded_fps": "30",
133
+ "format_note": "1080p",
134
+ "download_format": "bestvideo[height=1080]+bestaudio/best[height=1080]",
135
+ "similarity_threshold": 0.85,
136
+ "sample_rate": 1.0,
137
+ "whisper_model": "base",
138
+ "language": "en",
139
+ "started_at": "2026-02-23T00:00:00+00:00",
140
+ "completed_at": "2026-02-23T00:03:00+00:00",
141
+ "duration_seconds": 180.123,
142
+ "warnings": []
143
+ }
144
+ ```
145
+
146
+ Error:
147
+
148
+ ```json
149
+ {
150
+ "status": "error",
151
+ "error": "message",
152
+ "error_type": "RuntimeError"
153
+ }
154
+ ```
@@ -0,0 +1,9 @@
1
+ slidecap/__init__.py,sha256=lTJYDmoHO7U-ssMprVoySTwM7DZTl-Mo_OUzemBgnnw,118
2
+ slidecap/cli.py,sha256=6kz0fCH5NvLTtO9uqNfeaife6MT8ul7OeRYW_ve3TWM,4070
3
+ slidecap/core.py,sha256=k-cADRg6iKdN5QqZzCdjuq-hqZBcF2E_Zw5nzaTHZLw,18421
4
+ slidecap-0.1.0.dist-info/licenses/LICENSE,sha256=i3dxjfqY5jUHFWG0wcttwDSBMGM6Amty7c4JheB1GUA,1073
5
+ slidecap-0.1.0.dist-info/METADATA,sha256=UQmhxmttu76wE-As_v0mETTMGuSii1oXDZg3jqkIkmQ,4205
6
+ slidecap-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
7
+ slidecap-0.1.0.dist-info/entry_points.txt,sha256=bEUTtoW1EF6UpJBy4rMGptMx9grKWZ2FVy2wVuuSLBI,47
8
+ slidecap-0.1.0.dist-info/top_level.txt,sha256=-BSzqHMmzqatHCzbKR8SRP3mLDFKAhk3UmlBaADdGTg,9
9
+ slidecap-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ slidecap = slidecap.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aadhil A. Majeed
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ slidecap