reelrecon 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1150 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ import os
6
+ import re
7
+ import shutil
8
+ import tempfile
9
+ import time
10
+ import warnings
11
+ from collections import Counter
12
+ from dataclasses import dataclass
13
+ from datetime import datetime, timezone
14
+ from functools import lru_cache
15
+ from hashlib import sha1
16
+ from pathlib import Path
17
+ from typing import Any, Callable, Dict, Iterable, Optional
18
+ from urllib.error import HTTPError, URLError
19
+ from urllib.parse import urlparse
20
+ from urllib.request import Request, urlopen
21
+
22
# Suppress two specific warnings, matched by regex against the warning text.
for _ignored_message in (
    "urllib3 v2 only supports OpenSSL 1.1.1+.*",
    "Support for Python version 3.9 has been deprecated.*",
):
    warnings.filterwarnings("ignore", message=_ignored_message)
del _ignored_message
24
+
25
+
26
def _env_int(name: str, default: int, *, minimum: int = 0) -> int:
    """Read an integer setting from the environment, clamped to ``minimum``.

    ``REELRECON_<name>`` is consulted first; the legacy ``IG_TRANSCRIBER_<name>``
    prefix is the fallback so setups from before the rename keep working, and
    ``default`` is used when neither variable is set.  A value that cannot be
    parsed as an integer yields ``default`` as-is (without clamping).
    """
    legacy_value = os.environ.get(f"IG_TRANSCRIBER_{name}", default)
    chosen = os.environ.get(f"REELRECON_{name}", legacy_value)
    try:
        parsed = int(chosen)
    except (TypeError, ValueError):
        return default
    return max(parsed, minimum)
34
+
35
+
36
# Value sent as the ``x-ig-app-id`` header on Instagram profile lookups.
INSTAGRAM_APP_ID = "936619743392459"

# Environment-tunable network settings; both are clamped to at least 1.
DEFAULT_TIMEOUT_SECONDS = _env_int("HTTP_TIMEOUT_SECONDS", default=30, minimum=1)
FETCH_RETRY_ATTEMPTS = _env_int("FETCH_RETRIES", default=3, minimum=1)

# Maximum number of newest profile videos kept for a single profile run.
INSTAGRAM_VIDEO_LIMIT = 10

# Endpoint and default model used for the optional Groq chat-completions call.
GROQ_BASE_URL = "https://api.groq.com/openai/v1/chat/completions"
DEFAULT_GROQ_MODEL = "openai/gpt-oss-20b"

# Progress hook signature: (stage, percent, message) -> None.
ProgressCallback = Callable[[str, int, str], None]
43
+
44
# Common English function words excluded from keyword counting.
STOPWORDS = set(
    """
    a about all an and are as at be but by for from has have how i if in into
    is it its just more my not of on or our out so that the their them there
    these they this to up use was we what when with you your
    """.split()
)
97
+
98
# Words counted as positive signals by the heuristic sentiment scorer.
POSITIVE_WORDS = set(
    "amazing best better boost easy fast great improve love powerful simple smart strong win".split()
)
114
+
115
# Words counted as negative signals by the heuristic sentiment scorer.
NEGATIVE_WORDS = set(
    "bad broken difficult fail hard issue mistake problem risk slow stuck worse".split()
)
129
+
130
# Call-to-action phrases, checked in this order; the first one found wins.
CTA_PATTERNS = (
    "follow", "subscribe", "comment", "share", "like", "buy", "dm",
    "message me", "link in bio", "sign up", "download", "book a call", "join",
)
145
+
146
+
147
class PipelineError(RuntimeError):
    """Failure raised by the pipeline helpers with a human-readable message."""
149
+
150
+
151
def _import_whisper() -> Any:
    """Import the ``whisper`` module on demand and return it.

    The import is deferred because pulling in whisper/torch takes seconds and
    should not delay (or crash) callers that never transcribe, such as MCP
    server startup.

    Raises:
        PipelineError: if the import fails for any reason.
    """
    try:
        import whisper as whisper_module
    except Exception as exc:
        message = (
            f"The 'openai-whisper' package is not usable: {exc}. "
            "Install dependencies with: pip install -r requirements.txt"
        )
        raise PipelineError(message) from exc
    else:
        return whisper_module
162
+
163
+
164
def _import_yt_dlp() -> Any:
    """Import ``yt_dlp`` on demand and return its ``YoutubeDL`` class.

    Raises:
        PipelineError: if the import fails for any reason.
    """
    try:
        from yt_dlp import YoutubeDL as downloader_cls
    except Exception as exc:
        message = (
            f"The 'yt-dlp' package is not usable: {exc}. "
            "Install dependencies with: pip install -r requirements.txt"
        )
        raise PipelineError(message) from exc
    else:
        return downloader_cls
173
+
174
+
175
def require_ffmpeg() -> str:
    """Return the path of the ``ffmpeg`` executable found on PATH.

    Raises:
        PipelineError: if ``shutil.which`` cannot locate ``ffmpeg``.
    """
    located = shutil.which("ffmpeg")
    if located:
        return located
    raise PipelineError(
        "ffmpeg was not found on PATH. It is required for audio extraction and transcription. "
        "Install it (e.g. `apt install ffmpeg` or `brew install ffmpeg`) and retry."
    )
183
+
184
+
185
def _atomic_write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8 without exposing a partial file.

    The text is staged in a temp file created in the destination directory
    (created if missing) and then moved over ``path`` with ``os.replace``, so
    a crash mid-write cannot leave a truncated file at ``path``.  On any
    failure the staging file is removed and the exception is re-raised.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
    descriptor, staging_name = tempfile.mkstemp(
        dir=directory,
        prefix=f".{path.name}.",
        suffix=".tmp",
    )
    staging = Path(staging_name)
    try:
        with os.fdopen(descriptor, "w", encoding="utf-8") as sink:
            sink.write(text)
        os.replace(staging, path)
    except BaseException:
        # BaseException on purpose: clean up even on KeyboardInterrupt.
        staging.unlink(missing_ok=True)
        raise
198
+
199
+
200
@dataclass(frozen=True)
class VideoCandidate:
    """Immutable description of one video selected for processing."""

    # How the candidate was found: "instagram_profile" or "video_url".
    source_kind: str
    # URL the candidate originated from, and its canonical form.
    input_url: str
    canonical_url: str
    # Label and group; both become directory segments of the run directory.
    source_label: str
    source_group: str
    # Identifier of the video (Instagram shortcode or extractor-provided id).
    video_id: str
    # Unix timestamp of the video; 0 when unknown.
    timestamp: int
    title: str
    caption: str
    # URL handed to the downloader.
    video_url: str
    uploader: str
    platform: str
    # 1-based position within the batch and the batch size.
    position: int = 1
    total_videos: int = 1
216
+
217
+
218
def _emit(progress_callback: Optional[ProgressCallback], stage: str, percent: int, message: str) -> None:
    """Forward a progress update to ``progress_callback`` if one was given."""
    if progress_callback is None:
        return
    progress_callback(stage, percent, message)
221
+
222
+
223
def _safe_slug(value: str, fallback: str = "item") -> str:
    """Turn ``value`` into a lowercase slug usable as one path component.

    Runs of characters outside ``[a-zA-Z0-9._-]`` collapse to a single ``-``.
    Leading/trailing dashes and dots are removed, so the result can never be a
    path-traversal component like "." or "..".  ``fallback`` is returned when
    nothing usable remains.
    """
    collapsed = re.sub(r"[^a-zA-Z0-9._-]+", "-", value.strip())
    candidate = collapsed.strip("-.").lower()
    only_separators = set(candidate) <= {".", "-"}
    return fallback if (not candidate or only_separators) else candidate
230
+
231
+
232
def _timestamp_to_iso(timestamp: int) -> Optional[str]:
    """Format a Unix timestamp as an ISO-8601 UTC string; falsy input gives ``None``."""
    if timestamp:
        moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
        return moment.isoformat()
    return None
236
+
237
+
238
def _file_sha1(path: Path) -> str:
    """Return the hex SHA-1 digest of the file at ``path``, read in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    hasher = sha1()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(chunk_size)
            if not piece:
                break
            hasher.update(piece)
    return hasher.hexdigest()
244
+
245
+
246
def _is_instagram_host(netloc: str) -> bool:
    """Return True when ``netloc`` names instagram.com or one of its subdomains.

    ``netloc`` is the network-location part of a parsed URL and may carry
    ``user:pass@`` credentials and a ``:port`` suffix; both are ignored.  The
    hostname must equal ``instagram.com`` or end with ``.instagram.com``.  A
    plain substring test would also accept unrelated hosts such as
    ``notinstagram.com`` or ``instagram.com.evil.example``.
    """
    host = netloc.rsplit("@", 1)[-1].lower()
    # Drop the port.  Bracketed IPv6 literals collapse to "[" here, which is
    # fine because they can never be an Instagram hostname.
    host = host.split(":", 1)[0].rstrip(".")
    return host == "instagram.com" or host.endswith(".instagram.com")
248
+
249
+
250
def normalize_input_url(input_url: str) -> tuple[str, str]:
    """Validate the URL scheme and return ``(trimmed input, re-assembled URL)``.

    Raises:
        PipelineError: if the scheme is neither ``http`` nor ``https``.
    """
    trimmed = input_url.strip()
    pieces = urlparse(trimmed)
    if pieces.scheme in ("http", "https"):
        return trimmed, pieces.geturl()
    raise PipelineError("The input URL must start with http:// or https://")
255
+
256
+
257
def detect_input_kind(input_url: str) -> tuple[str, str]:
    """Classify ``input_url`` as a single video or an Instagram profile.

    Returns:
        ``("video", input_url)`` for non-Instagram URLs and for Instagram
        media links, or ``("instagram_profile", canonical_profile_url)`` for
        Instagram profile links.

    Raises:
        PipelineError: if an Instagram URL has no path, or its first path
            segment is not a valid username.
    """
    parsed = urlparse(input_url)
    if not _is_instagram_host(parsed.netloc):
        return "video", input_url

    parts = [part for part in parsed.path.split("/") if part]
    if not parts:
        raise PipelineError("Could not determine the Instagram target from the URL")

    media_markers = {"reel", "reels", "p", "tv"}
    # Direct media links: /reel/<code>/, /reels/<code>/, /p/<code>/, /tv/<code>/.
    if parts[0] in media_markers:
        return "video", input_url
    # Username-prefixed media links (/<username>/reel/<code>/) are single
    # videos too.  A bare /<username>/reels/ tab has no shortcode segment and
    # so is still handled as a profile below.
    if len(parts) >= 3 and parts[1] in media_markers:
        return "video", input_url

    username = parts[0].lstrip("@")
    if not re.fullmatch(r"[A-Za-z0-9._]+", username):
        raise PipelineError(f"Invalid Instagram username parsed from URL: {username}")
    return "instagram_profile", f"https://www.instagram.com/{username}/"
270
+
271
+
272
def fetch_profile(username: str, canonical_url: str) -> Dict[str, Any]:
    """Fetch the public ``web_profile_info`` record for an Instagram user.

    Up to ``FETCH_RETRY_ATTEMPTS`` requests are made.  HTTP 429, HTTP 5xx,
    network errors and unreadable response bodies are retried with a capped
    exponential backoff; HTTP 404, 401/403 and other 4xx codes fail at once.

    Args:
        username: Instagram username placed in the query string.
        canonical_url: Profile URL, sent as the ``Referer`` header and quoted
            in error messages.

    Returns:
        The ``data.user`` object of the response.

    Raises:
        PipelineError: on a missing, blocked or private profile, when retries
            are exhausted, or when the response lacks the expected shape.
    """
    # Function-scope import: only needed to recognise truncated HTTP bodies.
    from http.client import HTTPException

    api_url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"

    payload: Any = None
    last_error: Optional[PipelineError] = None
    for attempt in range(1, FETCH_RETRY_ATTEMPTS + 1):
        request = Request(
            api_url,
            headers={
                "User-Agent": "Mozilla/5.0",
                "x-ig-app-id": INSTAGRAM_APP_ID,
                "Referer": canonical_url,
                "Accept": "application/json",
            },
        )
        try:
            with urlopen(request, timeout=DEFAULT_TIMEOUT_SECONDS) as response:
                payload = json.load(response)
            last_error = None
            break
        except HTTPError as exc:
            if exc.code == 404:
                raise PipelineError(f"Instagram profile not found: {canonical_url}") from exc
            if exc.code in {401, 403}:
                raise PipelineError(
                    "Instagram blocked the profile lookup. This pipeline currently supports public profiles only."
                ) from exc
            if exc.code == 429:
                last_error = PipelineError(
                    "Instagram rate-limited the request. Wait a few minutes and try again."
                )
            elif exc.code >= 500:
                last_error = PipelineError(f"Instagram profile lookup failed with HTTP {exc.code}")
            else:
                raise PipelineError(f"Instagram profile lookup failed with HTTP {exc.code}") from exc
            # Keep the original exception attached for when retries run out.
            last_error.__cause__ = exc
        except URLError as exc:
            last_error = PipelineError(f"Network error while fetching Instagram profile: {exc.reason}")
            last_error.__cause__ = exc
        except (ValueError, OSError, HTTPException) as exc:
            # ValueError covers JSON and text-decoding failures.  OSError covers
            # read timeouts (socket.timeout is not TimeoutError before Python
            # 3.10) and connection resets while the body is being read.
            # HTTPException covers truncated bodies (IncompleteRead).
            last_error = PipelineError(f"Instagram returned an unreadable profile response: {exc}")
            last_error.__cause__ = exc

        if attempt < FETCH_RETRY_ATTEMPTS:
            # Backoff of 2, 4, 8, then capped at 10 seconds.
            time.sleep(min(2 ** attempt, 10))

    if last_error is not None:
        raise last_error
    if not isinstance(payload, dict):
        raise PipelineError("Instagram returned an unexpected profile response")

    # "data" may be missing or null; both count as an unexpected response.
    data = payload.get("data")
    user = data.get("user") if isinstance(data, dict) else None
    if not user or not isinstance(user, dict):
        raise PipelineError("Instagram returned an unexpected profile response")
    if user.get("is_private"):
        raise PipelineError("This Instagram profile is private. Only public profiles are supported.")
    return user
326
+
327
+
328
def iter_candidate_nodes(user: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    """Yield every non-empty media node from the user's two timeline sections.

    ``edge_owner_to_timeline_media`` is walked first, then
    ``edge_felix_video_timeline``; missing or null sections, edge lists and
    nodes are skipped.
    """
    timeline_keys = ("edge_owner_to_timeline_media", "edge_felix_video_timeline")
    for timeline_key in timeline_keys:
        timeline = user.get(timeline_key) or {}
        edges = timeline.get("edges") or []
        yield from (edge["node"] for edge in edges if edge.get("node"))
335
+
336
+
337
def extract_caption(node: Dict[str, Any]) -> str:
    """Return the stripped text of the node's first caption edge, or ``""``."""
    caption_section = node.get("edge_media_to_caption") or {}
    caption_edges = caption_section.get("edges") or []
    if caption_edges:
        leading = caption_edges[0].get("node") or {}
        return (leading.get("text") or "").strip()
    return ""
343
+
344
+
345
def _instagram_video_url(shortcode: str, product_type: Optional[str]) -> str:
    """Build the Instagram URL for ``shortcode``.

    ``"clips"`` maps to ``/reel/``, ``"igtv"`` to ``/tv/``, and anything else
    to ``/p/``.
    """
    segment = "reel" if product_type == "clips" else "tv" if product_type == "igtv" else "p"
    return f"https://www.instagram.com/{segment}/{shortcode}/"
351
+
352
+
353
def collect_instagram_profile_videos(canonical_url: str) -> list[VideoCandidate]:
    """Return the newest videos of an Instagram profile as candidates.

    The username is the first path segment of ``canonical_url``.  Nodes
    without a shortcode, duplicates, non-video posts and posts without a
    timestamp are skipped.  The remainder is sorted newest first, cut to
    ``INSTAGRAM_VIDEO_LIMIT`` and numbered from 1.

    Raises:
        PipelineError: if the profile exposes no usable videos (errors from
            the profile lookup propagate unchanged).
    """
    path_segments = urlparse(canonical_url).path.strip("/").split("/")
    username = path_segments[0]
    user = fetch_profile(username, canonical_url)
    uploader = user.get("username") or username

    visited: set[str] = set()
    found: list[VideoCandidate] = []

    for node in iter_candidate_nodes(user):
        shortcode = node.get("shortcode")
        if not shortcode or shortcode in visited:
            continue
        visited.add(shortcode)

        product_type = node.get("product_type")
        if not (node.get("is_video") or product_type in {"clips", "igtv"}):
            continue

        taken_at = int(node.get("taken_at_timestamp") or 0)
        if taken_at == 0:
            continue

        caption = extract_caption(node)
        if caption.strip():
            title = caption.splitlines()[0].strip()
        else:
            title = shortcode

        found.append(
            VideoCandidate(
                source_kind="instagram_profile",
                input_url=canonical_url,
                canonical_url=canonical_url,
                source_label=username,
                source_group="instagram_profiles",
                video_id=shortcode,
                timestamp=taken_at,
                title=title,
                caption=caption,
                video_url=_instagram_video_url(shortcode, product_type),
                uploader=uploader,
                platform="instagram",
            )
        )

    if not found:
        raise PipelineError("No videos were found in the public Instagram profile data.")

    newest_first = sorted(found, key=lambda item: item.timestamp, reverse=True)[:INSTAGRAM_VIDEO_LIMIT]
    batch_size = len(newest_first)

    # The dataclass is frozen, so numbering means building fresh instances.
    ranked: list[VideoCandidate] = []
    for rank, entry in enumerate(newest_first, start=1):
        fields = dict(entry.__dict__)
        fields["position"] = rank
        fields["total_videos"] = batch_size
        ranked.append(VideoCandidate(**fields))
    return ranked
406
+
407
+
408
def _yt_dlp_base_options() -> Dict[str, Any]:
    """Return a fresh dict of the yt-dlp options shared by every call: silent output, timeout, retries."""
    return dict(
        quiet=True,
        no_warnings=True,
        noprogress=True,
        socket_timeout=DEFAULT_TIMEOUT_SECONDS,
        retries=3,
        fragment_retries=3,
        extractor_retries=2,
    )
418
+
419
+
420
def _yt_dlp_extract_info(target_url: str) -> Dict[str, Any]:
    """Ask yt-dlp for the metadata of ``target_url`` without downloading.

    Raises:
        PipelineError: if extraction raises or does not return a dict.
    """
    downloader_cls = _import_yt_dlp()
    try:
        with downloader_cls(_yt_dlp_base_options()) as session:
            info = session.extract_info(target_url, download=False)
    except Exception as exc:
        raise PipelineError(f"Failed to inspect video URL: {exc}") from exc
    if isinstance(info, dict):
        return info
    raise PipelineError(f"Could not extract video information from URL: {target_url}")
430
+
431
+
432
def collect_direct_video(target_url: str) -> list[VideoCandidate]:
    """Describe a single video URL as a one-element candidate list.

    Metadata comes from yt-dlp.  Missing fields fall back to neutral values:
    ``"video"`` or ``"Video"`` for names, ``0`` for the timestamp, and a slug
    of the page URL for the id.
    """
    info = _yt_dlp_extract_info(target_url)

    page_url = info.get("webpage_url") or target_url
    uploader = info.get("uploader") or info.get("channel") or info.get("extractor_key") or "video"
    source_label = info.get("uploader_id") or _safe_slug(uploader, "video")
    title = (info.get("title") or info.get("fulltitle") or "Video").strip()
    description = (info.get("description") or "").strip()
    timestamp = int(info.get("timestamp") or 0)
    video_id = str(info.get("id") or _safe_slug(page_url))
    platform = (info.get("extractor_key") or urlparse(target_url).netloc or "video").lower()

    only_candidate = VideoCandidate(
        source_kind="video_url",
        input_url=target_url,
        canonical_url=page_url,
        source_label=source_label,
        source_group="video_urls",
        video_id=video_id,
        timestamp=timestamp,
        title=title,
        caption=description,
        video_url=target_url,
        uploader=uploader,
        platform=platform,
        position=1,
        total_videos=1,
    )
    return [only_candidate]
460
+
461
+
462
def resolve_candidates(input_url: str) -> tuple[str, str, list[VideoCandidate]]:
    """Resolve ``input_url`` to ``(input kind, canonical target, candidates)``.

    Instagram profile URLs expand to the profile's videos; anything else is
    treated as a single video, looked up by the trimmed input URL.
    """
    cleaned_input, canonical = normalize_input_url(input_url)
    kind, target = detect_input_kind(canonical)
    if kind == "instagram_profile":
        videos = collect_instagram_profile_videos(target)
    else:
        videos = collect_direct_video(cleaned_input)
    return kind, target, videos
468
+
469
+
470
def ensure_run_dir(base_output_dir: Path, candidate: VideoCandidate) -> Path:
    """Create (if needed) and return ``<base>/<group>/<label slug>/<video id slug>``."""
    label_segment = _safe_slug(candidate.source_label)
    video_segment = _safe_slug(candidate.video_id)
    target = base_output_dir / candidate.source_group / label_segment / video_segment
    target.mkdir(parents=True, exist_ok=True)
    return target
474
+
475
+
476
def _paths_for_run(run_dir: Path) -> tuple[Path, Path, Path]:
    """Return the audio, transcript and metadata file paths inside ``run_dir``."""
    audio_file = run_dir / "audio.mp3"
    transcript_file = run_dir / "transcript.txt"
    metadata_file = run_dir / "metadata.json"
    return audio_file, transcript_file, metadata_file
478
+
479
+
480
def _paths_for_batch(base_output_dir: Path, source_group: str, source_label: str) -> tuple[Path, Path]:
    """Create the batch directory and return it with its ``manifest.json`` path."""
    batch_dir = base_output_dir / source_group / _safe_slug(source_label)
    batch_dir.mkdir(parents=True, exist_ok=True)
    manifest_file = batch_dir / "manifest.json"
    return batch_dir, manifest_file
484
+
485
+
486
def _copy_uploaded_audio(source_audio_path: Path, run_dir: Path) -> Path:
    """Copy an uploaded audio file into ``run_dir`` as ``audio<suffix>``.

    The suffix is lowercased (``.bin`` when the source has none).  An existing
    destination file is left untouched and returned as-is.
    """
    extension = source_audio_path.suffix.lower() or ".bin"
    target = run_dir / f"audio{extension}"
    if not target.exists():
        shutil.copy2(source_audio_path, target)
    return target
493
+
494
+
495
def _sentence_split(text: str) -> list[str]:
    """Split ``text`` into sentences at whitespace following ``.``, ``!`` or ``?``.

    Whitespace runs are collapsed to single spaces first; blank input yields
    an empty list.
    """
    flattened = re.sub(r"\s+", " ", text).strip()
    if flattened == "":
        return []
    pieces = re.split(r"(?<=[.!?])\s+", flattened)
    return [piece for piece in map(str.strip, pieces) if piece]
501
+
502
+
503
def _top_keywords(*texts: str, limit: int = 8) -> list[str]:
    """Return up to ``limit`` most frequent non-stopword tokens across ``texts``.

    Tokens are lowercased, start with a letter and are at least three
    characters long.  Ties keep first-seen order.
    """
    token_pattern = re.compile(r"[A-Za-z][A-Za-z0-9'-]{2,}")
    tally: Counter = Counter()
    for text in texts:
        tally.update(
            token for token in token_pattern.findall(text.lower()) if token not in STOPWORDS
        )
    return [token for token, _count in tally.most_common(limit)]
509
+
510
+
511
def _sentiment(text: str) -> str:
    """Label ``text`` as "positive", "negative" or "neutral".

    The label comes from comparing how many tokens appear in
    ``POSITIVE_WORDS`` against how many appear in ``NEGATIVE_WORDS``.
    """
    tokens = re.findall(r"[A-Za-z][A-Za-z0-9'-]{2,}", text.lower())
    # Each token adds +1, -1 or 0; the sign of the total decides the label.
    balance = sum((token in POSITIVE_WORDS) - (token in NEGATIVE_WORDS) for token in tokens)
    if balance > 0:
        return "positive"
    if balance < 0:
        return "negative"
    return "neutral"
520
+
521
+
522
def _cta_detected(text: str) -> Optional[str]:
    """Return the first ``CTA_PATTERNS`` entry that occurs in ``text``, else ``None``.

    Matching is case-insensitive and anchored at the start of a word.  This
    keeps inflected forms ("followers", "subscribed") matching, while a
    pattern buried inside another word no longer counts: "dm" in "roadmap" or
    "grandma", "like" in "dislike".
    """
    lowered = text.lower()
    for pattern in CTA_PATTERNS:
        # The lookbehind requires the pattern not to continue a preceding word.
        if re.search(rf"(?<![a-z0-9]){re.escape(pattern)}", lowered):
            return pattern
    return None
528
+
529
+
530
def _strip_json_fence(content: str) -> str:
    """Remove a surrounding Markdown code fence (optionally tagged ``json``).

    Text that does not start with a fence is only stripped of surrounding
    whitespace.
    """
    body = content.strip()
    if not body.startswith("```"):
        return body
    without_opening = re.sub(r"^```(?:json)?\s*", "", body)
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
536
+
537
+
538
def _groq_chat(messages: list[Dict[str, str]]) -> Optional[str]:
    """Send ``messages`` to the Groq chat-completions endpoint, best effort.

    The call is optional: without ``GROQ_API_KEY`` in the environment, or on
    any request or parsing failure, ``None`` is returned so callers can fall
    back to their heuristics.  The model name comes from ``GROQ_MODEL``
    (default ``DEFAULT_GROQ_MODEL``).

    Returns:
        The text content of the first choice, or ``None`` when it is not
        available as a string.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        return None

    request_body = json.dumps(
        {
            "model": os.environ.get("GROQ_MODEL", DEFAULT_GROQ_MODEL),
            "messages": messages,
            "temperature": 0.2,
        }
    ).encode("utf-8")
    request = Request(
        GROQ_BASE_URL,
        data=request_body,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urlopen(request, timeout=DEFAULT_TIMEOUT_SECONDS) as response:
            payload = json.load(response)
    except Exception:
        # Deliberate best-effort: any failure means "no AI answer".
        return None

    # Validate every level of the response.  An unexpected shape must mean
    # "no answer" rather than an AttributeError in the caller.
    if not isinstance(payload, dict):
        return None
    choices = payload.get("choices")
    if not isinstance(choices, list) or not choices:
        return None
    first_choice = choices[0]
    message = first_choice.get("message") if isinstance(first_choice, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return content if isinstance(content, str) else None
569
+
570
+
571
def _heuristic_video_ai_insights(transcript_text: str, caption: str, title: str) -> Dict[str, Any]:
    """Build per-video insights from simple text heuristics, with no network call.

    The summary is the first two transcript sentences, else the first two
    caption sentences, else the title.  The hook is the first transcript
    sentence, else the first caption line, else the title.  Keywords,
    sentiment and CTA come from the sibling helpers.  Title suggestions and
    content angles are templated around the top keywords.
    """
    sentences = _sentence_split(transcript_text)
    if sentences:
        summary_parts = sentences[:2]
        hook = sentences[0]
    else:
        summary_parts = _sentence_split(caption)[:2] or ([title] if title else [])
        hook = caption.splitlines()[0].strip() if caption.strip() else title

    keywords = _top_keywords(title, caption, transcript_text)
    combined_text = f"{caption}\n{transcript_text}"
    cta = _cta_detected(combined_text)

    # Always non-empty: the top three keywords, or generic placeholders.
    focus = keywords[:3] if keywords else ["content", "message", "audience"]
    lead_topic = focus[0]
    follow_up_topic = focus[1] if len(focus) > 1 else lead_topic
    hook_preview = hook[:120] if hook else title

    title_suggestions = [
        f"{title or hook}: what matters most",
        f"{' / '.join(focus)} breakdown",
        f"The core idea behind {lead_topic}",
    ]
    content_angles = [
        f"Turn {lead_topic} into a carousel or thread.",
        f"Clip the opening hook: {hook_preview}.",
        f"Use {follow_up_topic} as the CTA angle for a follow-up post.",
    ]

    return {
        "summary": " ".join(summary_parts).strip(),
        "hook": hook,
        "keywords": keywords,
        "sentiment": _sentiment(combined_text),
        "cta": cta,
        "title_suggestions": title_suggestions,
        "content_angles": content_angles,
    }
597
+
598
+
599
def generate_video_ai_insights(transcript_text: str, caption: str, title: str) -> Dict[str, Any]:
    """Produce insights for one video, preferring Groq over the heuristics.

    The heuristic result is always computed.  It is returned, tagged
    ``provider="heuristic"``, when the Groq call gives no answer or the
    answer is not a JSON object.  Otherwise each field is taken from the
    model's JSON, with the heuristic value filling in anything missing, empty
    or of the wrong type, and the result is tagged ``provider="groq"``.

    Args:
        transcript_text: Full transcript; only the first 12000 characters are sent.
        caption: Caption or description text of the video.
        title: Title of the video.
    """
    fallback = _heuristic_video_ai_insights(transcript_text, caption, title)
    heuristic_result = {**fallback, "provider": "heuristic"}

    content = _groq_chat(
        [
            {
                "role": "system",
                "content": (
                    "You generate structured creator insights for short-form video transcripts. "
                    "Return only valid JSON with keys: summary, hook, keywords, sentiment, cta, "
                    "title_suggestions, content_angles. keywords/title_suggestions/content_angles must be arrays."
                ),
            },
            {
                "role": "user",
                "content": json.dumps(
                    {
                        "title": title,
                        "caption": caption,
                        "transcript": transcript_text[:12000],
                    },
                    ensure_ascii=False,
                ),
            },
        ]
    )
    if not content or not isinstance(content, str):
        return heuristic_result

    try:
        parsed = json.loads(_strip_json_fence(content))
    except ValueError:  # includes json.JSONDecodeError
        return heuristic_result
    # Valid JSON that is not an object (a list, a bare string, ...) cannot
    # supply named fields, so it is treated like an unusable answer.
    if not isinstance(parsed, dict):
        return heuristic_result

    def _text(key: str) -> str:
        return str(parsed.get(key) or fallback[key]).strip()

    def _text_list(key: str, limit: int) -> list:
        raw = parsed.get(key)
        if isinstance(raw, str):
            # A lone string is one entry, not a sequence of characters.
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            raw = []
        cleaned = [str(item).strip() for item in raw if item is not None and str(item).strip()]
        return (cleaned or list(fallback[key]))[:limit]

    # Only a non-empty string other than "null" counts as a model-provided
    # CTA.  Lists and dicts would be unhashable in a set-membership test.
    raw_cta = parsed.get("cta")
    cta_text = raw_cta.strip() if isinstance(raw_cta, str) else ""
    cta = cta_text if cta_text and cta_text.lower() != "null" else fallback["cta"]

    return {
        "summary": _text("summary"),
        "hook": _text("hook"),
        "keywords": _text_list("keywords", 8),
        "sentiment": _text("sentiment").lower(),
        "cta": cta,
        "title_suggestions": _text_list("title_suggestions", 3),
        "content_angles": _text_list("content_angles", 3),
        "provider": "groq",
    }
642
+
643
+
644
def _heuristic_batch_ai_overview(videos: list[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarise a batch of video results using only local heuristics.

    Entries with ``ai_insights``, ``caption`` or ``transcript_text`` set to
    ``None`` (for example results built by ``_failed_video_result``) are
    tolerated: they contribute no hooks, CTAs or keywords.

    Returns:
        A dict with ``summary``, ``recurring_keywords``, ``top_hooks``,
        ``cta_patterns`` (``(cta, count)`` pairs) and ``video_titles``.
    """
    transcripts = [video.get("transcript_text") or "" for video in videos]
    captions = [video.get("caption") or "" for video in videos]
    keywords = _top_keywords(*transcripts, *captions, limit=12)

    # `or {}` matters: .get("ai_insights", {}) would hand back an explicit None.
    insights = [video.get("ai_insights") or {} for video in videos]
    hooks = [entry.get("hook") for entry in insights if entry.get("hook")]
    ctas = [entry.get("cta") for entry in insights if entry.get("cta")]

    return {
        "summary": f"Processed {len(videos)} videos. The strongest recurring topics were {', '.join(keywords[:5]) or 'the uploaded themes'}.",
        "recurring_keywords": keywords,
        "top_hooks": hooks[:5],
        "cta_patterns": Counter(ctas).most_common(5),
        "video_titles": [video.get("title") for video in videos],
    }
658
+
659
+
660
def generate_batch_ai_overview(videos: list[Dict[str, Any]]) -> Dict[str, Any]:
    """Produce a cross-video overview, preferring Groq over the heuristics.

    The heuristic overview is always computed.  It is returned, tagged
    ``provider="heuristic"``, when the Groq call gives no answer or the
    answer is not a JSON object.  Otherwise each field is taken from the
    model's JSON, with the heuristic value filling in anything missing, empty
    or malformed, and the result is tagged ``provider="groq"``.

    Args:
        videos: Per-video result dicts; only the first 4000 transcript
            characters of each are sent to the model.
    """
    fallback = _heuristic_batch_ai_overview(videos)
    heuristic_result = {**fallback, "provider": "heuristic"}

    content = _groq_chat(
        [
            {
                "role": "system",
                "content": (
                    "You generate concise batch insights across multiple short-form video transcripts. "
                    "Return only valid JSON with keys: summary, recurring_keywords, top_hooks, cta_patterns, video_titles. "
                    "recurring_keywords and top_hooks must be arrays of strings. cta_patterns must be an array of [string, number] pairs."
                ),
            },
            {
                "role": "user",
                "content": json.dumps(
                    {
                        "videos": [
                            {
                                "title": video.get("title"),
                                "caption": video.get("caption"),
                                # `or` guards against explicit None values.
                                "transcript": (video.get("transcript_text") or "")[:4000],
                                "insights": video.get("ai_insights") or {},
                            }
                            for video in videos
                        ]
                    },
                    ensure_ascii=False,
                ),
            },
        ]
    )
    if not content or not isinstance(content, str):
        return heuristic_result

    try:
        parsed = json.loads(_strip_json_fence(content))
    except ValueError:  # includes json.JSONDecodeError
        return heuristic_result
    if not isinstance(parsed, dict):
        return heuristic_result

    def _text_list(key: str, limit: Optional[int] = None) -> list:
        raw = parsed.get(key)
        if isinstance(raw, str):
            # A lone string is one entry, not a sequence of characters.
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            raw = []
        cleaned = [str(item).strip() for item in raw if item is not None and str(item).strip()]
        chosen = cleaned or [str(item).strip() for item in fallback[key] if str(item).strip()]
        return chosen if limit is None else chosen[:limit]

    # Keep only well-formed [label, count] pairs.  A malformed pair is
    # skipped instead of aborting the whole overview.
    cta_patterns: list[tuple[str, int]] = []
    raw_patterns = parsed.get("cta_patterns")
    if isinstance(raw_patterns, (list, tuple)):
        for item in raw_patterns:
            if not (isinstance(item, (list, tuple)) and len(item) >= 2):
                continue
            label = str(item[0]).strip()
            try:
                count = int(item[1])
            except (TypeError, ValueError, OverflowError):
                continue
            if label:
                cta_patterns.append((label, count))

    return {
        "summary": str(parsed.get("summary") or fallback["summary"]).strip(),
        "recurring_keywords": _text_list("recurring_keywords", 12),
        "top_hooks": _text_list("top_hooks", 5),
        "cta_patterns": cta_patterns or fallback["cta_patterns"],
        "video_titles": _text_list("video_titles"),
        "provider": "groq",
    }
713
+
714
+
715
def build_video_result(
    candidate: VideoCandidate,
    audio_path: Path,
    transcript_path: Path,
    metadata_path: Path,
    whisper_result: Dict[str, Any],
    *,
    model_name: str,
    cached: bool,
) -> Dict[str, Any]:
    """Assemble the success ("ok") result dict for one processed video.

    The transcript is read from ``transcript_path`` (empty when the file does
    not exist) and AI insights are generated from it.  ``whisper_result`` is
    consulted only for its ``language`` entry.
    """
    if transcript_path.exists():
        transcript_text = transcript_path.read_text(encoding="utf-8")
    else:
        transcript_text = ""
    ai_insights = generate_video_ai_insights(transcript_text, candidate.caption, candidate.title)

    # Keyword order is the key order of the resulting dict.
    return dict(
        status="ok",
        source_kind=candidate.source_kind,
        platform=candidate.platform,
        source_label=candidate.source_label,
        position=candidate.position,
        total_videos=candidate.total_videos,
        video_id=candidate.video_id,
        title=candidate.title,
        uploader=candidate.uploader,
        input_url=candidate.input_url,
        canonical_url=candidate.canonical_url,
        video_url=candidate.video_url,
        caption=candidate.caption,
        taken_at_timestamp=candidate.timestamp,
        taken_at_iso=_timestamp_to_iso(candidate.timestamp),
        audio_file=str(audio_path),
        transcript_file=str(transcript_path),
        metadata_file=str(metadata_path),
        detected_language=whisper_result.get("language"),
        model=model_name,
        cached=cached,
        transcript_text=transcript_text,
        ai_insights=ai_insights,
    )
753
+
754
+
755
def _failed_video_result(candidate: VideoCandidate, error: str) -> Dict[str, Any]:
    """Build the "error" result dict for a video that could not be processed.

    Identity fields are copied from ``candidate``.  File paths, detected
    language and AI insights are ``None``, and the transcript is empty.
    """
    # Keyword order is the key order of the resulting dict.
    return dict(
        status="error",
        error=error,
        source_kind=candidate.source_kind,
        platform=candidate.platform,
        source_label=candidate.source_label,
        position=candidate.position,
        total_videos=candidate.total_videos,
        video_id=candidate.video_id,
        title=candidate.title,
        uploader=candidate.uploader,
        input_url=candidate.input_url,
        canonical_url=candidate.canonical_url,
        video_url=candidate.video_url,
        caption=candidate.caption,
        taken_at_timestamp=candidate.timestamp,
        taken_at_iso=_timestamp_to_iso(candidate.timestamp),
        audio_file=None,
        transcript_file=None,
        metadata_file=None,
        detected_language=None,
        cached=False,
        transcript_text="",
        ai_insights=None,
    )
781
+
782
+
783
def _load_cached_video_result(candidate: VideoCandidate, run_dir: Path, model_name: str) -> Optional[Dict[str, Any]]:
    """Rebuild a video result from files left in ``run_dir`` by an earlier run.

    Returns:
        The result dict (with ``cached=True``), or ``None`` on a cache miss.
        A miss means the audio, transcript or metadata file is absent, or the
        metadata cannot be read as a JSON object.
    """
    audio_path, transcript_path, metadata_path = _paths_for_run(run_dir)
    if not (audio_path.exists() and transcript_path.exists() and metadata_path.exists()):
        return None

    try:
        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: the file vanished or is unreadable.  ValueError: invalid
        # JSON or invalid UTF-8.  Either way the cache entry is unusable.
        return None
    if not isinstance(metadata, dict):
        return None

    return build_video_result(
        candidate,
        audio_path,
        transcript_path,
        metadata_path,
        {"language": metadata.get("detected_language")},
        model_name=model_name,
        cached=True,
    )
802
+
803
+
804
def download_audio(candidate: VideoCandidate, run_dir: Path) -> Path:
    """Download the candidate's audio into ``run_dir/audio.mp3`` and return that path.

    A non-empty ``audio.mp3`` already present is reused without downloading.
    Otherwise yt-dlp fetches the best audio stream with an MP3 extraction
    post-processor, and the produced file is renamed to ``audio.mp3``.

    Raises:
        PipelineError: if ffmpeg or yt-dlp is unavailable, the download
            fails, no file is produced, or the resulting file is empty.
    """
    audio_path = _paths_for_run(run_dir)[0]
    if audio_path.exists() and audio_path.stat().st_size > 0:
        return audio_path

    require_ffmpeg()
    YoutubeDL = _import_yt_dlp()

    extract_mp3 = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    options = dict(_yt_dlp_base_options())
    options.update(
        {
            "format": "bestaudio/best",
            "outtmpl": str(run_dir / "%(id)s.%(ext)s"),
            "noplaylist": True,
            "postprocessors": [extract_mp3],
        }
    )

    source_url = candidate.video_url
    try:
        with YoutubeDL(options) as ydl:
            info = ydl.extract_info(source_url, download=True)
    except Exception as exc:
        raise PipelineError(f"Failed to download audio for {source_url}: {exc}") from exc
    if not isinstance(info, dict) or not info.get("id"):
        raise PipelineError(f"yt-dlp did not return download metadata for {source_url}")

    media_id = info["id"]
    produced = run_dir / f"{media_id}.mp3"
    if not produced.exists():
        # Fall back to the alphabetically first file named after the id.
        matches = sorted(run_dir.glob(f"{media_id}.*"))
        if not matches:
            raise PipelineError("yt-dlp finished without producing an audio file")
        produced = matches[0]

    if produced != audio_path:
        audio_path.unlink(missing_ok=True)
        produced.replace(audio_path)

    if audio_path.exists() and audio_path.stat().st_size != 0:
        return audio_path

    audio_path.unlink(missing_ok=True)
    raise PipelineError(f"Downloaded audio for {source_url} is empty")
851
+
852
+
853
def available_whisper_models() -> list[str]:
    """Return the model names reported by ``whisper.available_models()``."""
    return list(_import_whisper().available_models())
856
+
857
+
858
@lru_cache(maxsize=4)
def load_whisper_model(model_name: str) -> Any:
    """Load the named Whisper model; up to four distinct models stay cached."""
    return _import_whisper().load_model(model_name)
862
+
863
+
864
def transcribe_audio(
    audio_path: Path,
    model_name: str,
    language: Optional[str],
    progress_callback: Optional[ProgressCallback] = None,
    percent_range: tuple[int, int] = (55, 85),
) -> Dict[str, Any]:
    """Transcribe ``audio_path`` with Whisper and return its result dict.

    Two progress events are emitted: "loading_model" at a quarter of the way
    through ``percent_range``, and "transcribing" at its upper bound.

    Raises:
        PipelineError: if ffmpeg is missing, the model cannot be loaded,
            transcription fails, or the result is not a dict.
    """
    require_ffmpeg()
    low, high = percent_range
    loading_percent = low + math.floor((high - low) * 0.25)
    _emit(progress_callback, "loading_model", loading_percent, f"Loading Whisper model '{model_name}'")
    try:
        model = load_whisper_model(model_name)
    except PipelineError:
        # Already a pipeline-level message (e.g. whisper not installed).
        raise
    except Exception as exc:
        raise PipelineError(f"Failed to load Whisper model '{model_name}': {exc}") from exc

    _emit(progress_callback, "transcribing", high, "Transcribing audio with Whisper")
    try:
        outcome = model.transcribe(str(audio_path), fp16=False, language=language, verbose=None)
    except Exception as exc:
        raise PipelineError(f"Whisper transcription failed: {exc}") from exc
    if isinstance(outcome, dict):
        return outcome
    raise PipelineError("Whisper returned an unexpected transcription result")
890
+
891
+
892
def write_video_outputs(
    candidate: VideoCandidate,
    run_dir: Path,
    audio_path: Path,
    whisper_result: Dict[str, Any],
    model_name: str,
) -> Dict[str, Any]:
    """Persist the transcript and metadata for one video and return its result.

    The stripped Whisper text goes to ``transcript.txt`` (with a trailing
    newline unless empty).  The result dict plus a ``generated_at`` UTC
    timestamp goes to ``metadata.json``.  Both are written atomically.
    """
    _, transcript_path, metadata_path = _paths_for_run(run_dir)

    transcript_text = (whisper_result.get("text") or "").strip()
    trailing_newline = "\n" if transcript_text else ""
    _atomic_write_text(transcript_path, transcript_text + trailing_newline)

    video_result = build_video_result(
        candidate,
        audio_path,
        transcript_path,
        metadata_path,
        whisper_result,
        model_name=model_name,
        cached=False,
    )

    metadata = dict(video_result)
    metadata["generated_at"] = datetime.now(tz=timezone.utc).isoformat()
    serialized = json.dumps(metadata, indent=2, ensure_ascii=False) + "\n"
    _atomic_write_text(metadata_path, serialized)
    return video_result
919
+
920
+
921
def process_video(
    candidate: VideoCandidate,
    *,
    output_dir: Path,
    model_name: str,
    language: Optional[str],
    progress_callback: Optional[ProgressCallback],
    reuse_existing: bool,
) -> Dict[str, Any]:
    """Download, transcribe and write outputs for one candidate.

    With ``reuse_existing`` a complete cached result in the run directory is
    returned straight away.  Without it the run directory is wiped first so
    nothing from an earlier run can be reused.
    """
    run_dir = ensure_run_dir(output_dir, candidate)

    if not reuse_existing:
        # Force-fresh: wipe the candidate's run_dir so download_audio re-pulls
        # the source instead of returning the stale MP3 from a prior run.
        # Without this, two IG URLs that yt-dlp normalises to the same
        # `info["id"]` would both serve the first call's transcript.
        if run_dir.exists():
            shutil.rmtree(run_dir, ignore_errors=True)
        run_dir = ensure_run_dir(output_dir, candidate)
    else:
        previous = _load_cached_video_result(candidate, run_dir, model_name)
        if previous is not None:
            return previous

    audio_path = download_audio(candidate, run_dir)
    whisper_result = transcribe_audio(audio_path, model_name, language, progress_callback=progress_callback)

    # Percent grows with the candidate's position in the batch, capped at 95.
    batch_fraction = candidate.position / max(candidate.total_videos, 1)
    insights_percent = min(95, 15 + math.floor(batch_fraction * 75))
    _emit(
        progress_callback,
        "generating_insights",
        insights_percent,
        f"Generating AI insights for video {candidate.position}/{candidate.total_videos}",
    )
    return write_video_outputs(candidate, run_dir, audio_path, whisper_result, model_name)
954
+
955
+
956
def write_batch_manifest(
    batch_result: Dict[str, Any],
    base_output_dir: Path,
    source_group: str,
    source_label: str,
) -> Path:
    """Serialize a batch result to its manifest file and return the file path.

    The manifest location comes from ``_paths_for_batch``. The content is
    ``batch_result`` as 2-space-indented JSON (non-ASCII characters kept
    as-is) followed by a single trailing newline.
    """
    _unused, manifest_path = _paths_for_batch(base_output_dir, source_group, source_label)
    payload = json.dumps(batch_result, indent=2, ensure_ascii=False)
    _atomic_write_text(manifest_path, f"{payload}\n")
    return manifest_path
965
+
966
+
967
def run_audio_file_transcription(
    audio_path: str | Path,
    *,
    original_filename: Optional[str] = None,
    output_dir: str | Path = "outputs",
    model_name: str = "base",
    language: Optional[str] = None,
    progress_callback: Optional[ProgressCallback] = None,
) -> Dict[str, Any]:
    """Transcribe a local audio file and write its transcript, metadata and manifest.

    Args:
        audio_path: Path to the audio file; ``~`` is expanded and the path is
            resolved before use.
        original_filename: Display name for the upload. Falls back to the
            file's own name when missing or blank.
        output_dir: Root directory for generated files.
        model_name: Whisper model name.
        language: Optional language hint for transcription.
        progress_callback: Optional progress sink, invoked through ``_emit``.

    Returns:
        A batch-result mapping describing a one-item batch (``total_videos``
        and ``completed_videos`` are 1, ``failed_videos`` is 0), including the
        per-item result under ``"videos"``, an ``"ai_overview"`` and the
        manifest location under ``"manifest_file"``.

    Raises:
        PipelineError: If the file is missing, is not a regular file, is
            empty, is not readable, or cannot be stat'ed or hashed. The
            transcription step can also raise ``PipelineError``.
    """
    # Announce validation before performing it (as run_transcription does), so
    # a listener sees the stage even when validation fails.
    _emit(progress_callback, "validating", 4, "Validating uploaded audio file")

    source_audio_path = Path(audio_path).expanduser().resolve()
    # is_file() is already False for a missing path, so no separate exists().
    if not source_audio_path.is_file():
        raise PipelineError(f"Audio file not found: {source_audio_path}")
    try:
        # One stat() serves both the size check and the timestamp below, and
        # a failure here is reported as a PipelineError like the hash failure.
        file_stat = source_audio_path.stat()
    except OSError as exc:
        raise PipelineError(f"Could not read audio file {source_audio_path}: {exc}") from exc
    if file_stat.st_size <= 0:
        raise PipelineError("The uploaded audio file is empty.")
    if not os.access(source_audio_path, os.R_OK):
        raise PipelineError(f"Audio file is not readable: {source_audio_path}")

    filename = (original_filename or source_audio_path.name).strip() or source_audio_path.name
    title = Path(filename).stem or "Uploaded audio"
    try:
        file_hash = _file_sha1(source_audio_path)[:10]
    except OSError as exc:
        raise PipelineError(f"Could not read audio file {source_audio_path}: {exc}") from exc
    # The content-hash prefix keeps uploads that share a name from colliding.
    # "audio-upload" is presumably the slug fallback - confirm in _safe_slug.
    source_label = _safe_slug(f"{title}-{file_hash}", "audio-upload")
    timestamp = int(file_stat.st_mtime)

    # Model the upload as a single-item batch so the shared output writers
    # can be used unchanged.
    candidate = VideoCandidate(
        source_kind="audio_upload",
        input_url=filename,
        canonical_url=f"upload://{source_label}",
        source_label=source_label,
        source_group="audio_uploads",
        video_id=source_label,
        timestamp=timestamp,
        title=title,
        caption="",
        video_url="",
        uploader="uploaded audio",
        platform="local_audio",
        position=1,
        total_videos=1,
    )

    output_root = Path(output_dir).expanduser().resolve()
    run_dir = ensure_run_dir(output_root, candidate)

    _emit(progress_callback, "preparing_audio", 18, "Staging uploaded audio for transcription")
    staged_audio_path = _copy_uploaded_audio(source_audio_path, run_dir)

    whisper_result = transcribe_audio(
        staged_audio_path,
        model_name,
        language,
        progress_callback=progress_callback,
        percent_range=(22, 82),
    )
    _emit(progress_callback, "generating_insights", 92, "Generating AI insights from uploaded audio")
    video_result = write_video_outputs(
        candidate,
        run_dir,
        staged_audio_path,
        whisper_result,
        model_name,
    )

    batch_result = {
        "status": "ok",
        "input_kind": "audio_upload",
        "input_url": filename,
        "canonical_url": candidate.canonical_url,
        "model": model_name,
        "language_hint": language,
        "total_videos": 1,
        "completed_videos": 1,
        "failed_videos": 0,
        "videos": [video_result],
        "ai_overview": generate_batch_ai_overview([video_result]),
    }

    manifest_path = write_batch_manifest(
        batch_result,
        output_root,
        candidate.source_group,
        candidate.source_label,
    )
    # Added after writing, so the manifest file itself does not contain it.
    batch_result["manifest_file"] = str(manifest_path)
    _emit(progress_callback, "completed", 100, "Completed transcription for uploaded audio")
    return batch_result
1056
+
1057
+
1058
def run_transcription(
    input_url: str,
    *,
    output_dir: str | Path = "outputs",
    model_name: str = "base",
    language: Optional[str] = None,
    progress_callback: Optional[ProgressCallback] = None,
    reuse_existing: bool = True,
) -> Dict[str, Any]:
    """Transcribe every video resolved from ``input_url`` and write a batch manifest.

    Args:
        input_url: URL to resolve into one or more video candidates.
        output_dir: Root directory for generated files; ``~`` is expanded and
            the path is resolved.
        model_name: Whisper model name.
        language: Optional language hint for transcription.
        progress_callback: Optional progress sink, invoked through ``_emit``.
        reuse_existing: Forwarded to ``process_video``; when true, previously
            cached per-video results may be reused.

    Returns:
        A batch-result mapping with per-video results under ``"videos"``
        (failed videos included), success/failure counts, an
        ``"ai_overview"`` built from the successful videos only, and the
        manifest location under ``"manifest_file"``.

    Raises:
        PipelineError: If no candidates were resolved, if the only video in a
            single-video run fails, or if every video in a batch fails.
    """
    _emit(progress_callback, "validating", 4, "Validating input URL")
    input_kind, canonical_input, candidates = resolve_candidates(input_url)
    total = len(candidates)
    if total == 0:
        # Without this guard an empty resolution falls through to the
        # misleading "All 0 videos failed to transcribe" error below.
        raise PipelineError(f"No videos were found for the provided URL: {input_url}")
    output_root = Path(output_dir).expanduser().resolve()

    if input_kind == "instagram_profile":
        _emit(progress_callback, "collecting_videos", 12, f"Collected the latest {total} videos from the Instagram profile")
    else:
        _emit(progress_callback, "collecting_videos", 12, "Resolved the video URL")

    video_results: list[Dict[str, Any]] = []

    for index, candidate in enumerate(candidates, start=1):
        # Each video owns an equal slice of the 15..90 progress band.
        base_percent = 15 + math.floor(((index - 1) / total) * 75)
        _emit(
            progress_callback,
            "downloading_audio",
            base_percent,
            f"Processing video {index}/{total}: {candidate.title[:80]}",
        )
        try:
            video_result = process_video(
                candidate,
                output_dir=output_root,
                model_name=model_name,
                language=language,
                progress_callback=progress_callback,
                reuse_existing=reuse_existing,
            )
        except PipelineError as exc:
            # In a multi-video batch, one broken video should not abort the
            # remaining downloads. Record the failure and keep going.
            if total == 1:
                raise
            video_result = _failed_video_result(candidate, str(exc))
            _emit(
                progress_callback,
                "video_failed",
                base_percent,
                f"Skipping video {index}/{total} after error: {exc}",
            )
        video_results.append(video_result)
        completed_percent = 15 + math.floor((index / total) * 75)
        status_message = (
            f"Finished video {index}/{total}"
            if total > 1
            else "Transcript ready"
        )
        _emit(progress_callback, "writing_files", completed_percent, status_message)

    successful_videos = [video for video in video_results if video.get("status") == "ok"]
    if not successful_videos:
        errors = "; ".join(
            str(video.get("error")) for video in video_results if video.get("error")
        )
        raise PipelineError(f"All {total} videos failed to transcribe. Errors: {errors or 'unknown'}")

    failed_count = total - len(successful_videos)
    batch_result = {
        "status": "ok",
        "input_kind": input_kind,
        "input_url": input_url,
        "canonical_url": canonical_input,
        "model": model_name,
        "language_hint": language,
        "total_videos": total,
        "completed_videos": len(successful_videos),
        "failed_videos": failed_count,
        "videos": video_results,
        "ai_overview": generate_batch_ai_overview(successful_videos),
    }

    # The manifest location is keyed on the first candidate's group and label.
    manifest_path = write_batch_manifest(
        batch_result,
        output_root,
        candidates[0].source_group,
        candidates[0].source_label,
    )
    # Added after writing, so the manifest file itself does not contain it.
    batch_result["manifest_file"] = str(manifest_path)
    completed_message = f"Completed transcription for {len(successful_videos)} video(s)"
    if failed_count:
        completed_message += f" ({failed_count} failed)"
    _emit(progress_callback, "completed", 100, completed_message)
    return batch_result