reelrecon 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/mcp_server.py ADDED
@@ -0,0 +1,987 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ import shutil
10
+ import sys
11
+ import threading
12
+ import time
13
+ from functools import lru_cache
14
+ from pathlib import Path
15
+ from typing import Any, Callable
16
+
17
+ from mcp.server.fastmcp import Context, FastMCP
18
+ from mcp.types import ToolAnnotations
19
+
20
# Name and version this server reports about itself.
SERVER_NAME = "ReelRecon"
SERVER_VERSION = "1.2.0"

# Logger shared by every helper in this module.
logger = logging.getLogger("reelrecon.mcp")
24
+
25
+
26
def _env(name: str, default: str | None = None) -> str | None:
    """Look up a setting in the environment, honouring both supported prefixes.

    ``REELRECON_<name>`` wins over the legacy ``IG_TRANSCRIBER_<name>``; the
    legacy prefix is kept so setups that predate the rename keep working.
    Returns ``default`` when neither variable is set.
    """
    candidates = (f"REELRECON_{name}", f"IG_TRANSCRIBER_{name}")
    found = next((os.environ[key] for key in candidates if key in os.environ), None)
    return default if found is None else found
34
+
35
+
36
def _env_int(name: str, default: int, *, minimum: int = 0) -> int:
    """Read an integer setting via ``_env``, clamped from below to ``minimum``.

    A value that cannot be parsed as an integer yields ``default`` unchanged.
    """
    raw = _env(name, str(default))
    try:
        parsed = int(raw)
    except (TypeError, ValueError):
        return default
    return parsed if parsed >= minimum else minimum
41
+
42
+
43
# Directory that contains this module; the default output folder sits inside it.
REPO_ROOT = Path(__file__).resolve().parent
# Root under which outputs are looked up; overridable through OUTPUT_DIR.
DEFAULT_OUTPUT_DIR = (
    Path(_env("OUTPUT_DIR", str(REPO_ROOT / "outputs")))
    .expanduser()
    .resolve()
)
# Upper bound applied to every "limit" argument of the listing helpers.
MAX_LIST_LIMIT = 50
# Tunable limits; each is read from the environment and clamped to its minimum.
JOB_TIMEOUT_SECONDS = _env_int("JOB_TIMEOUT_SECONDS", 60 * 60, minimum=30)
QUEUE_TIMEOUT_SECONDS = _env_int("QUEUE_TIMEOUT_SECONDS", 15 * 60, minimum=5)
MAX_CONCURRENT_JOBS = _env_int("MAX_CONCURRENT_JOBS", 1, minimum=1)
MAX_UPLOAD_BYTES = _env_int("MAX_UPLOAD_BYTES", 2 * 1024**3, minimum=1)
50
+
51
# Model names that are accepted without importing whisper (the import is heavy).
# Additional names can be permitted with REELRECON_EXTRA_MODELS="name1,name2".
KNOWN_WHISPER_MODELS = {
    "tiny", "tiny.en",
    "base", "base.en",
    "small", "small.en",
    "medium", "medium.en",
    "large", "large-v1", "large-v2", "large-v3", "large-v3-turbo",
    "turbo",
}
69
+
70
# Matches every run of characters that is not a letter, digit, ".", "_" or "-".
_SLUG_PATTERN = re.compile(r"[^a-zA-Z0-9._-]+")

# Transcription jobs are serialized/limited so that parallel tool calls cannot
# trample each other's output directories or exhaust memory loading Whisper
# models. The semaphore is created lazily; the lock guards that creation.
_job_semaphore: asyncio.Semaphore | None = None
_job_semaphore_guard = threading.Lock()
# Counters maintained by the job runner.
_active_jobs = 0
_abandoned_jobs = 0
78
+
79
+
80
def _get_job_semaphore() -> asyncio.Semaphore:
    """Return the module-wide job semaphore, creating it on first use.

    Creation happens under ``_job_semaphore_guard`` so only one semaphore is
    ever built, sized by ``MAX_CONCURRENT_JOBS``.
    """
    global _job_semaphore
    with _job_semaphore_guard:
        semaphore = _job_semaphore
        if semaphore is None:
            semaphore = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
            _job_semaphore = semaphore
    return semaphore
86
+
87
+
88
@lru_cache(maxsize=1)
def _pipeline() -> Any:
    """Return the transcription pipeline module, importing it on first call.

    The import is deferred because whisper/torch take seconds to load: MCP
    `initialize` stays fast, and a broken installation surfaces as a
    structured tool error rather than a server that cannot start. The result
    is cached, so later calls reuse the already imported module.
    """
    from ig_transcriber import pipeline as loaded_pipeline

    return loaded_pipeline
99
+
100
+
101
def _error(error_type: str, message: str, *, hint: str | None = None, **extra: Any) -> dict[str, Any]:
    """Build the structured error payload returned by the tools.

    The payload always carries ``status``, ``error_type`` and ``error``; a
    non-empty ``hint`` is added next, followed by any ``extra`` keyword values.
    """
    base: dict[str, Any] = {"status": "error", "error_type": error_type, "error": message}
    hint_part = {"hint": hint} if hint else {}
    return {**base, **hint_part, **extra}
107
+
108
+
109
def _safe_slug(value: str, fallback: str = "item") -> str:
    """Reduce ``value`` to a lowercase slug that is safe as a path component.

    Disallowed character runs become "-", and leading/trailing dots and dashes
    are removed so the slug can never be "." or "..". ``fallback`` is returned
    when nothing usable remains.
    """
    collapsed = _SLUG_PATTERN.sub("-", value.strip())
    slug = collapsed.strip("-.").lower()
    if slug and set(slug) - {".", "-"}:
        return slug
    return fallback
116
+
117
+
118
def _json(data: Any) -> str:
    """Serialize ``data`` as indented JSON (non-ASCII kept as-is) ending in a newline."""
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    return f"{rendered}\n"
120
+
121
+
122
def _load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 JSON and return it, requiring a top-level object.

    Undecodable bytes are replaced rather than rejected. Raises
    ``json.JSONDecodeError`` for invalid JSON and also when the document is
    valid JSON but not an object.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    parsed = json.loads(raw)
    if isinstance(parsed, dict):
        return parsed
    raise json.JSONDecodeError("Expected a JSON object", str(path), 0)
127
+
128
+
129
def _within_output_root(path: Path) -> Path:
    """Resolve ``path`` and return it only if it lies at or below the output root.

    Raises ``FileNotFoundError`` when the resolved path is outside
    ``DEFAULT_OUTPUT_DIR``.
    """
    resolved = path.expanduser().resolve()
    inside = resolved == DEFAULT_OUTPUT_DIR or DEFAULT_OUTPUT_DIR in resolved.parents
    if inside:
        return resolved
    raise FileNotFoundError(f"Path is outside the output directory: {resolved}")
134
+
135
+
136
def _manifest_path(source_group: str, source_label: str) -> Path:
    """Return the validated ``manifest.json`` path for a group/label pair.

    Both names are slugged first; the result is checked against the output root.
    """
    group_dir = DEFAULT_OUTPUT_DIR / _safe_slug(source_group, "group")
    label_dir = group_dir / _safe_slug(source_label, "source")
    return _within_output_root(label_dir / "manifest.json")
140
+
141
+
142
def _video_dir(source_group: str, source_label: str, video_id: str) -> Path:
    """Return the validated output directory for one video of a batch.

    Each of the three names is slugged before being joined under the output root.
    """
    components = (
        _safe_slug(source_group, "group"),
        _safe_slug(source_label, "source"),
        _safe_slug(video_id, "video"),
    )
    return _within_output_root(DEFAULT_OUTPUT_DIR.joinpath(*components))
149
+
150
+
151
def _manifest_resource_uri(source_group: str, source_label: str) -> str:
    """Return the ``reelrecon://manifest/...`` URI for a batch, using slugged names."""
    group_slug = _safe_slug(source_group, "group")
    label_slug = _safe_slug(source_label, "source")
    return "reelrecon://manifest/" + group_slug + "/" + label_slug
153
+
154
+
155
def _transcript_resource_uri(source_group: str, source_label: str, video_id: str) -> str:
    """Return the ``reelrecon://transcript/...`` URI for one video, using slugged names."""
    segments = [
        _safe_slug(source_group, "group"),
        _safe_slug(source_label, "source"),
        _safe_slug(video_id, "video"),
    ]
    return "reelrecon://transcript/" + "/".join(segments)
160
+
161
+
162
def _mtime_or_zero(path: Path) -> float:
    """Return the modification time of ``path``, or ``0.0`` if it cannot be stat'ed."""
    try:
        info = os.stat(path)
    except OSError:
        return 0.0
    return info.st_mtime
167
+
168
+
169
def _recent_manifest_paths(limit: int) -> list[Path]:
    """Return up to ``limit`` manifest paths, most recently modified first.

    ``limit`` is clamped to 1..MAX_LIST_LIMIT. Manifests are searched two
    directory levels below the output root. A scan failure is logged and
    produces an empty list.
    """
    capped_limit = min(max(limit, 1), MAX_LIST_LIMIT)
    try:
        found = list(DEFAULT_OUTPUT_DIR.glob("*/*/manifest.json"))
        found.sort(key=_mtime_or_zero, reverse=True)
    except OSError as exc:
        logger.warning("Could not scan output directory %s: %s", DEFAULT_OUTPUT_DIR, exc)
        return []
    return found[:capped_limit]
181
+
182
+
183
def _manifest_summary(path: Path) -> dict[str, Any]:
    """Summarize one manifest file for batch listings.

    The group and label come from the manifest's location under the output
    root. An unreadable or invalid manifest is reported with status
    "unreadable" and an error message instead of raising.
    """
    location = path.relative_to(DEFAULT_OUTPUT_DIR)
    source_group = location.parts[0]
    source_label = location.parts[1]
    summary: dict[str, Any] = {
        "source_group": source_group,
        "source_label": source_label,
        "manifest_file": str(path),
        "manifest_resource": _manifest_resource_uri(source_group, source_label),
        "updated_at": _mtime_or_zero(path),
    }

    try:
        payload = _load_json(path)
    except (OSError, json.JSONDecodeError) as exc:
        summary["status"] = "unreadable"
        summary["error"] = f"Manifest could not be read: {exc}"
        return summary

    summary["status"] = "ok"
    # Copy the headline fields verbatim; absent ones come through as None.
    copied_fields = (
        "input_kind",
        "input_url",
        "canonical_url",
        "model",
        "total_videos",
        "completed_videos",
        "failed_videos",
    )
    for field_name in copied_fields:
        summary[field_name] = payload.get(field_name)
    return summary
213
+
214
+
215
def _known_batches(limit: int = 10) -> list[dict[str, str]]:
    """Return the group/label pairs of the most recent batches (for error hints)."""
    locations = (path.relative_to(DEFAULT_OUTPUT_DIR) for path in _recent_manifest_paths(limit))
    return [
        {"source_group": location.parts[0], "source_label": location.parts[1]}
        for location in locations
    ]
221
+
222
+
223
def _attach_resource_links(batch_result: dict[str, Any], manifest_path: Path | None = None) -> dict[str, Any]:
    """Add MCP resource URIs to a batch result, in place, and return it.

    The manifest location is taken from ``batch_result["manifest_file"]`` when
    present, otherwise from ``manifest_path`` (which is then also recorded in
    the result). When a location is known, ``manifest_resource`` is set on the
    batch and ``transcript_resource`` on every video entry that is a dict.

    Resource links are a convenience: any problem deriving them is logged and
    the result is returned without (some of) the links rather than raising.
    """
    try:
        manifest_file = batch_result.get("manifest_file")
        if manifest_file:
            manifest_path = _within_output_root(Path(manifest_file))
        elif manifest_path is not None:
            manifest_path = _within_output_root(manifest_path)
            batch_result["manifest_file"] = str(manifest_path)

        if manifest_path is not None:
            relative = manifest_path.relative_to(DEFAULT_OUTPUT_DIR)
            source_group, source_label = relative.parts[0], relative.parts[1]
            batch_result["manifest_resource"] = _manifest_resource_uri(source_group, source_label)
            videos = batch_result.get("videos")
            # A manifest loaded from disk may carry "videos": null or another
            # non-sequence value; treat that as "no videos" instead of failing.
            if not isinstance(videos, (list, tuple)):
                videos = []
            for video in videos:
                if isinstance(video, dict):
                    video["transcript_resource"] = _transcript_resource_uri(
                        source_group,
                        source_label,
                        str(video.get("video_id") or "video"),
                    )
    except (OSError, TypeError, ValueError, IndexError) as exc:
        # Never let link generation break a result that the pipeline already
        # produced successfully. TypeError covers a non-string manifest_file.
        logger.warning("Could not attach resource links: %s", exc)
    return batch_result
248
+
249
+
250
def _shape_batch_result(
    batch_result: dict[str, Any],
    *,
    include_transcript_text: bool,
    max_transcript_chars: int,
) -> dict[str, Any]:
    """Attach resource links and optionally drop or truncate transcript text.

    When the full text is wanted (``include_transcript_text`` with
    ``max_transcript_chars <= 0``) the enriched input is returned as-is.
    Otherwise a JSON round-trip copy is shaped: every video dict gets
    ``transcript_chars``; its text is removed when not wanted, or cut to
    ``max_transcript_chars`` and flagged with ``transcript_text_truncated``.
    """
    enriched = _attach_resource_links(batch_result)
    if include_transcript_text and max_transcript_chars <= 0:
        return enriched

    # The round trip gives an independent copy, so the caller's text survives.
    shaped = json.loads(json.dumps(enriched))
    videos = shaped.get("videos")
    if not isinstance(videos, list):
        # Manifests read from disk can hold "videos": null (or be missing the
        # key); there is nothing to shape, and iterating would raise.
        return shaped
    for video in videos:
        if not isinstance(video, dict):
            continue
        text = video.get("transcript_text") or ""
        video["transcript_chars"] = len(text)
        if not include_transcript_text:
            video.pop("transcript_text", None)
        elif len(text) > max_transcript_chars:
            video["transcript_text"] = text[:max_transcript_chars]
            video["transcript_text_truncated"] = True
    return shaped
272
+
273
+
274
def _clean_url(raw_url: str) -> str:
    """Strip the wrappers LLM clients routinely put around a URL.

    Handles surrounding whitespace, single/double quotes, backticks, angle
    brackets, and a markdown link of the form ``[label](url)``. Wrappers are
    peeled repeatedly, so nested forms such as ``'<https://x>'`` or
    ``<"https://x">`` unwrap completely. A falsy input yields ``""``.
    """
    url = (raw_url or "").strip()
    previous = None
    # Each pass can only shorten the string, so the loop ends as soon as a
    # pass changes nothing.
    while url and url != previous:
        previous = url
        url = url.strip().strip("'\"`").strip()
        if len(url) >= 2 and url.startswith("<") and url.endswith(">"):
            url = url[1:-1].strip()
        # Markdown link: keep only the target, dropping the label.
        link = re.fullmatch(r"\[[^\]]*\]\(\s*<?(\S+?)>?\s*\)", url)
        if link:
            url = link.group(1)
    return url
280
+
281
+
282
def _validate_url(raw_url: str) -> tuple[str | None, dict[str, Any] | None]:
    """Clean and validate ``raw_url``.

    Returns ``(url, None)`` for a non-empty http(s) URL, otherwise
    ``(None, error_payload)``.
    """
    url = _clean_url(raw_url)
    if url and re.match(r"^https?://", url, flags=re.IGNORECASE):
        return url, None
    if not url:
        failure = _error(
            "invalid_input",
            "input_url is empty.",
            hint="Pass a public Instagram profile URL (https://www.instagram.com/<username>/) or a direct video URL.",
        )
    else:
        failure = _error(
            "invalid_input",
            f"input_url must start with http:// or https://, got: {url[:200]}",
            hint="Example: https://www.instagram.com/instagram/ or https://www.instagram.com/reel/<id>/",
        )
    return None, failure
297
+
298
+
299
def _allowed_models() -> set[str]:
    """Return the built-in model names plus any listed in the EXTRA_MODELS setting.

    The setting is a comma-separated list; blank entries are ignored.
    """
    configured = _env("EXTRA_MODELS") or ""
    trimmed = (part.strip() for part in configured.split(","))
    extra = {name for name in trimmed if name}
    return KNOWN_WHISPER_MODELS | extra
306
+
307
+
308
def _validate_model(model_name: str) -> tuple[str | None, dict[str, Any] | None]:
    """Validate a Whisper model name.

    A blank name selects "base". Returns ``(name, None)`` when the name is
    allowed, otherwise ``(None, error_payload)`` listing the valid names.
    """
    name = (model_name or "").strip()
    if not name:
        return "base", None

    allowed = _allowed_models()
    if name in allowed:
        return name, None

    valid_names = ", ".join(sorted(allowed))
    failure = _error(
        "invalid_input",
        f"Unknown Whisper model: {name!r}",
        hint=f"Valid models: {valid_names}. "
        "Set REELRECON_EXTRA_MODELS to allow additional names.",
    )
    return None, failure
321
+
322
+
323
def _normalize_language(language: str | None) -> tuple[str | None, dict[str, Any] | None]:
    """Normalize an optional language hint.

    ``None`` and the "auto-detect" spellings map to ``(None, None)``. A hint
    shaped like an ISO code (optionally with a subtag) or a language name is
    returned lower-cased; anything else yields ``(None, error_payload)``.
    """
    if language is None:
        return None, None

    lang = language.strip().lower()
    auto_detect_words = {"", "auto", "none", "null", "detect", "default"}
    if lang in auto_detect_words:
        return None, None

    if re.fullmatch(r"[a-z]{2,3}(-[a-z0-9]{2,8})?|[a-z ]{3,30}", lang):
        return lang, None

    failure = _error(
        "invalid_input",
        f"Invalid language hint: {language!r}",
        hint="Use an ISO code like 'en', 'es', 'hi', a language name like 'english', or omit it for auto-detection.",
    )
    return None, failure
336
+
337
+
338
def _validate_limit(limit: int) -> int:
    """Coerce ``limit`` to an int within 1..MAX_LIST_LIMIT; unparsable input gives 10."""
    try:
        requested = int(limit)
    except (TypeError, ValueError):
        return 10
    at_least_one = 1 if requested < 1 else requested
    return min(at_least_one, MAX_LIST_LIMIT)
343
+
344
+
345
def _validate_slug_input(value: str, field: str) -> tuple[str | None, dict[str, Any] | None]:
    """Trim a required identifier argument.

    Returns ``(trimmed, None)``, or ``(None, error_payload)`` naming ``field``
    when the value is missing or blank.
    """
    cleaned = (value or "").strip()
    if cleaned:
        return cleaned, None
    failure = _error(
        "invalid_input",
        f"{field} is empty.",
        hint="Call list_recent_batches to see the available source_group/source_label values.",
    )
    return None, failure
354
+
355
+
356
class _ThreadSafeProgress:
    """Progress bridge from the worker thread to the MCP client.

    Instances are callables with the signature ``(stage, percent, message)``.
    Calls are made from the worker thread; the notification itself is sent on
    the event loop passed to the constructor.

    Every failure mode is swallowed on purpose: a disconnected client or a
    closed event loop must never crash the transcription thread. Updates are
    throttled so long batches do not flood the MCP session with notifications.
    """

    def __init__(self, ctx: Context, loop: asyncio.AbstractEventLoop, *, min_interval: float = 0.5) -> None:
        """Remember the MCP context, the target loop and the throttle interval (seconds)."""
        self._ctx = ctx
        self._loop = loop
        self._min_interval = min_interval
        # Guards the throttle state below.
        self._lock = threading.Lock()
        self._last_stage: str | None = None
        self._last_percent = -1
        self._last_sent = 0.0

    def __call__(self, stage: str, percent: int, message: str) -> None:
        """Forward one progress update unless the throttle drops it.

        A change of ``stage`` always goes through. Within a stage, an update
        is dropped when the percentage is unchanged or when less than
        ``min_interval`` seconds passed since the last forwarded update.
        """
        now = time.monotonic()
        with self._lock:
            stage_changed = stage != self._last_stage
            if not stage_changed and (percent == self._last_percent or now - self._last_sent < self._min_interval):
                return
            self._last_stage = stage
            self._last_percent = percent
            self._last_sent = now
        report = self._report(stage, float(percent), message, stage_changed)
        try:
            future = asyncio.run_coroutine_threadsafe(report, self._loop)
        except RuntimeError:
            # The loop is closed, so the coroutine was never scheduled. Close
            # it explicitly; otherwise it is garbage-collected un-awaited and
            # emits a "coroutine ... was never awaited" RuntimeWarning.
            report.close()
            return
        future.add_done_callback(self._consume_result)

    @staticmethod
    def _consume_result(future: Any) -> None:
        """Retrieve the future's outcome so a failure is not reported as unhandled."""
        try:
            future.exception()
        except Exception:
            # Includes cancellation of the future; nothing useful to do here.
            pass

    async def _report(self, stage: str, percent: float, message: str, stage_changed: bool) -> None:
        """Send the progress notification, plus an info log line on a stage change."""
        progress_message = f"[{stage}] {message}"
        try:
            await self._ctx.report_progress(progress=percent, total=100.0, message=progress_message)
            if stage_changed:
                await self._ctx.info(progress_message)
        except Exception:
            logger.debug("Progress notification dropped (client likely disconnected)", exc_info=True)
406
+
407
+
408
async def _notify(ctx: Context, level: str, message: str) -> None:
    """Send a log message to the MCP client, ignoring delivery failures.

    ``level == "error"`` uses ``ctx.error``; every other level uses
    ``ctx.info``. A failure to deliver is only recorded at debug level.
    """
    try:
        send = ctx.error if level == "error" else ctx.info
        await send(message)
    except Exception:
        logger.debug("MCP %s notification dropped", level, exc_info=True)
416
+
417
+
418
def _preflight_transcription() -> dict[str, Any] | None:
    """Check that a transcription can start; return an error payload or ``None``.

    Checks, in order: the pipeline module loads, the pipeline's ffmpeg
    requirement is met, and the output directory exists (it is created if
    needed) and is writable.
    """
    try:
        pipeline = _pipeline()
    except Exception as exc:
        return _error(
            "dependency_error",
            f"The transcription pipeline could not be loaded: {exc}",
            hint="Install dependencies with: pip install -r requirements.txt (Python 3.11 recommended).",
        )

    try:
        pipeline.require_ffmpeg()
    except pipeline.PipelineError as exc:
        return _error("dependency_error", str(exc))

    try:
        DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        writable = os.access(DEFAULT_OUTPUT_DIR, os.W_OK)
        if not writable:
            # PermissionError is an OSError, so it is reported by the handler below.
            raise PermissionError(f"Not writable: {DEFAULT_OUTPUT_DIR}")
    except OSError as exc:
        return _error(
            "output_dir_error",
            f"The output directory is not usable: {exc}",
            hint="Set REELRECON_OUTPUT_DIR to a writable directory.",
        )

    return None
442
+
443
+
444
async def _run_pipeline_job(
    ctx: Context,
    job_description: str,
    func: Callable[..., dict[str, Any]],
    *args: Any,
    **kwargs: Any,
) -> dict[str, Any]:
    """Run a blocking pipeline call in a thread and turn failures into payloads.

    The call first waits (up to QUEUE_TIMEOUT_SECONDS) for a slot on the job
    semaphore, then runs ``func(*args, **kwargs)`` in a worker thread with a
    JOB_TIMEOUT_SECONDS limit. Queue timeout, job timeout, pipeline errors and
    unexpected exceptions are each returned as a structured error payload;
    all but the queue timeout are also sent to the client as an error
    notification.
    """
    global _active_jobs, _abandoned_jobs

    pipeline = _pipeline()
    slot = _get_job_semaphore()
    try:
        await asyncio.wait_for(slot.acquire(), timeout=QUEUE_TIMEOUT_SECONDS)
    except asyncio.TimeoutError:
        busy_message = (
            f"The server is already running {MAX_CONCURRENT_JOBS} transcription job(s) and the queue wait "
            f"exceeded {QUEUE_TIMEOUT_SECONDS}s."
        )
        return _error(
            "server_busy",
            busy_message,
            hint="Retry later, or raise REELRECON_MAX_CONCURRENT_JOBS / REELRECON_QUEUE_TIMEOUT_SECONDS.",
        )

    _active_jobs += 1
    try:
        worker = asyncio.to_thread(func, *args, **kwargs)
        return await asyncio.wait_for(worker, timeout=JOB_TIMEOUT_SECONDS)
    except asyncio.TimeoutError:
        # NOTE(review): the slot is released in ``finally`` even though the
        # worker thread may still be running after a timeout.
        _abandoned_jobs += 1
        timeout_message = (
            f"{job_description} exceeded the {JOB_TIMEOUT_SECONDS}s job timeout. "
            "The worker may still be finishing in the background; completed output will appear in list_recent_batches."
        )
        await _notify(ctx, "error", timeout_message)
        return _error(
            "timeout",
            timeout_message,
            hint="Raise REELRECON_JOB_TIMEOUT_SECONDS for long batches, or use a smaller Whisper model.",
        )
    except pipeline.PipelineError as exc:
        pipeline_message = str(exc)
        await _notify(ctx, "error", pipeline_message)
        return _error("pipeline_error", pipeline_message)
    except Exception as exc:
        logger.exception("Unexpected failure in %s", job_description)
        unexpected_message = f"Unexpected failure: {exc}"
        await _notify(ctx, "error", unexpected_message)
        return _error("internal_error", unexpected_message)
    finally:
        _active_jobs -= 1
        slot.release()
492
+
493
+
494
def _dependency_status() -> dict[str, Any]:
    """Report, per dependency, "ok" or "error: <reason>".

    Covers the lazily loaded pipeline first, then the whisper, yt_dlp and mcp
    modules, each probed by importing it.
    """

    def probe(load: Callable[[], Any]) -> str:
        # Any exception while loading counts as a failed dependency.
        try:
            load()
        except Exception as exc:
            return f"error: {exc}"
        return "ok"

    status: dict[str, Any] = {"pipeline": probe(_pipeline)}
    for module_name in ("whisper", "yt_dlp", "mcp"):
        status[module_name] = probe(lambda target=module_name: __import__(target))
    return status
508
+
509
+
510
+ def build_server(*, host: str, port: int, debug: bool) -> FastMCP:
511
+ mcp = FastMCP(
512
+ name=SERVER_NAME,
513
+ instructions=(
514
+ "Use this server to transcribe a direct video URL, the latest 10 videos from a public Instagram "
515
+ "profile, or a local audio file. The main tool is transcribe_input. Use list_recent_batches, "
516
+ "read_batch_manifest, and read_video_output to inspect prior results, and check_health to diagnose "
517
+ "setup problems. All tools return JSON objects with a 'status' field: 'ok' on success, or 'error' "
518
+ "with 'error_type', 'error', and usually a 'hint' on failure — tool calls never raise for expected "
519
+ "failures. Long transcriptions report progress notifications; set include_transcript_text=false or "
520
+ "max_transcript_chars to keep responses small."
521
+ ),
522
+ host=host,
523
+ port=port,
524
+ debug=debug,
525
+ log_level="DEBUG" if debug else "ERROR",
526
+ json_response=True,
527
+ )
528
+
529
+ read_only = ToolAnnotations(readOnlyHint=True, destructiveHint=False, idempotentHint=True, openWorldHint=False)
530
+
531
+ @mcp.resource("reelrecon://server")
532
+ def server_resource() -> str:
533
+ return _json(
534
+ {
535
+ "name": SERVER_NAME,
536
+ "version": SERVER_VERSION,
537
+ "output_root": str(DEFAULT_OUTPUT_DIR),
538
+ "tools": [
539
+ "transcribe_input",
540
+ "transcribe_local_audio",
541
+ "list_recent_batches",
542
+ "read_batch_manifest",
543
+ "read_video_output",
544
+ "check_health",
545
+ ],
546
+ "input_support": {
547
+ "instagram_profile": "Fetches and transcribes the latest 10 videos from a public Instagram profile.",
548
+ "video_url": "Transcribes a single direct video URL.",
549
+ "local_audio": "Transcribes a local audio file path.",
550
+ },
551
+ "error_contract": {
552
+ "status": "'ok' or 'error'",
553
+ "error_type": "invalid_input | dependency_error | output_dir_error | pipeline_error | not_found | server_busy | timeout | internal_error",
554
+ },
555
+ "limits": {
556
+ "job_timeout_seconds": JOB_TIMEOUT_SECONDS,
557
+ "queue_timeout_seconds": QUEUE_TIMEOUT_SECONDS,
558
+ "max_concurrent_jobs": MAX_CONCURRENT_JOBS,
559
+ "max_upload_bytes": MAX_UPLOAD_BYTES,
560
+ "max_list_limit": MAX_LIST_LIMIT,
561
+ },
562
+ "resources": [
563
+ "reelrecon://server",
564
+ "reelrecon://recent-batches",
565
+ "reelrecon://manifest/{source_group}/{source_label}",
566
+ "reelrecon://transcript/{source_group}/{source_label}/{video_id}",
567
+ ],
568
+ }
569
+ )
570
+
571
+ @mcp.resource("reelrecon://recent-batches")
572
+ def recent_batches_resource() -> str:
573
+ return _json(
574
+ {
575
+ "output_root": str(DEFAULT_OUTPUT_DIR),
576
+ "batches": [_manifest_summary(path) for path in _recent_manifest_paths(limit=10)],
577
+ }
578
+ )
579
+
580
+ @mcp.resource("reelrecon://manifest/{source_group}/{source_label}")
581
+ def manifest_resource(source_group: str, source_label: str) -> str:
582
+ try:
583
+ path = _manifest_path(source_group, source_label)
584
+ except FileNotFoundError as exc:
585
+ raise FileNotFoundError(f"Invalid manifest location for {source_group}/{source_label}") from exc
586
+ if not path.exists():
587
+ raise FileNotFoundError(f"Manifest not found for {source_group}/{source_label}")
588
+ try:
589
+ payload = _attach_resource_links(_load_json(path), manifest_path=path)
590
+ except json.JSONDecodeError as exc:
591
+ raise ValueError(f"Manifest for {source_group}/{source_label} is corrupt: {exc}") from exc
592
+ return _json(payload)
593
+
594
+ @mcp.resource("reelrecon://transcript/{source_group}/{source_label}/{video_id}")
595
+ def transcript_resource(source_group: str, source_label: str, video_id: str) -> str:
596
+ try:
597
+ transcript_path = _video_dir(source_group, source_label, video_id) / "transcript.txt"
598
+ except FileNotFoundError as exc:
599
+ raise FileNotFoundError(f"Invalid transcript location for {source_group}/{source_label}/{video_id}") from exc
600
+ if not transcript_path.exists():
601
+ raise FileNotFoundError(f"Transcript not found for {source_group}/{source_label}/{video_id}")
602
+ return transcript_path.read_text(encoding="utf-8", errors="replace")
603
+
604
+ @mcp.tool(
605
+ description=(
606
+ "Transcribe a direct video URL or the latest 10 videos from a public Instagram profile URL. "
607
+ "input_url: an Instagram profile URL (https://www.instagram.com/<username>/), a single Instagram "
608
+ "reel/post/tv URL, or any direct video URL yt-dlp supports. model_name: Whisper model "
609
+ "(tiny/base/small/medium/large*, default 'base'). language: optional ISO hint like 'en' (omit for "
610
+ "auto-detect). reuse_existing: reuse cached transcripts for already-processed videos. Set "
611
+ "include_transcript_text=false or max_transcript_chars>0 to shrink the response; full text stays "
612
+ "on disk and via resources. Long profile batches can take many minutes — progress is streamed via "
613
+ "MCP progress notifications."
614
+ ),
615
+ annotations=ToolAnnotations(readOnlyHint=False, destructiveHint=False, idempotentHint=True, openWorldHint=True),
616
+ )
617
+ async def transcribe_input(
618
+ input_url: str,
619
+ ctx: Context,
620
+ model_name: str = "base",
621
+ language: str | None = None,
622
+ reuse_existing: bool = True,
623
+ include_transcript_text: bool = True,
624
+ max_transcript_chars: int = 0,
625
+ ) -> dict[str, Any]:
626
+ url, url_error = _validate_url(input_url)
627
+ if url_error:
628
+ return url_error
629
+ model, model_error = _validate_model(model_name)
630
+ if model_error:
631
+ return model_error
632
+ lang, lang_error = _normalize_language(language)
633
+ if lang_error:
634
+ return lang_error
635
+ preflight_error = _preflight_transcription()
636
+ if preflight_error:
637
+ return preflight_error
638
+
639
+ pipeline = _pipeline()
640
+ loop = asyncio.get_running_loop()
641
+ progress_callback = _ThreadSafeProgress(ctx, loop)
642
+ await _notify(ctx, "info", f"Starting transcription for {url}")
643
+
644
+ result = await _run_pipeline_job(
645
+ ctx,
646
+ f"Transcription of {url}",
647
+ pipeline.run_transcription,
648
+ url,
649
+ output_dir=DEFAULT_OUTPUT_DIR,
650
+ model_name=model,
651
+ language=lang,
652
+ progress_callback=progress_callback,
653
+ reuse_existing=reuse_existing,
654
+ )
655
+ if result.get("status") != "ok":
656
+ return result
657
+
658
+ shaped = _shape_batch_result(
659
+ result,
660
+ include_transcript_text=include_transcript_text,
661
+ max_transcript_chars=max(int(max_transcript_chars), 0),
662
+ )
663
+ try:
664
+ await ctx.report_progress(progress=100.0, total=100.0, message="Transcription completed")
665
+ except Exception:
666
+ pass
667
+ await _notify(ctx, "info", f"Completed transcription for {shaped.get('completed_videos', 0)} video(s)")
668
+ return shaped
669
+
670
+ @mcp.tool(
671
+ description=(
672
+ "Transcribe a local audio file path and generate AI insights from the transcript. audio_path must "
673
+ "be an existing readable file on the server host (mp3/wav/m4a/flac/ogg and most ffmpeg-decodable "
674
+ "formats). original_filename: optional display name used to label the output. model_name/language: "
675
+ "same as transcribe_input. Set include_transcript_text=false or max_transcript_chars>0 to shrink "
676
+ "the response."
677
+ ),
678
+ annotations=ToolAnnotations(readOnlyHint=False, destructiveHint=False, idempotentHint=True, openWorldHint=False),
679
+ )
680
+ async def transcribe_local_audio(
681
+ audio_path: str,
682
+ ctx: Context,
683
+ original_filename: str | None = None,
684
+ model_name: str = "base",
685
+ language: str | None = None,
686
+ include_transcript_text: bool = True,
687
+ max_transcript_chars: int = 0,
688
+ ) -> dict[str, Any]:
689
+ raw_path = (audio_path or "").strip().strip("'\"")
690
+ if not raw_path:
691
+ return _error("invalid_input", "audio_path is empty.", hint="Pass the absolute path of a local audio file.")
692
+ source_path = Path(raw_path).expanduser()
693
+ try:
694
+ source_path = source_path.resolve()
695
+ except OSError as exc:
696
+ return _error("invalid_input", f"audio_path could not be resolved: {exc}")
697
+ if not source_path.exists():
698
+ return _error("not_found", f"Audio file not found: {source_path}")
699
+ if not source_path.is_file():
700
+ return _error("invalid_input", f"audio_path is not a regular file: {source_path}")
701
+ if not os.access(source_path, os.R_OK):
702
+ return _error("invalid_input", f"Audio file is not readable: {source_path}")
703
+ try:
704
+ size = source_path.stat().st_size
705
+ except OSError as exc:
706
+ return _error("invalid_input", f"Audio file could not be inspected: {exc}")
707
+ if size == 0:
708
+ return _error("invalid_input", f"Audio file is empty: {source_path}")
709
+ if size > MAX_UPLOAD_BYTES:
710
+ return _error(
711
+ "invalid_input",
712
+ f"Audio file is {size} bytes, above the {MAX_UPLOAD_BYTES} byte limit.",
713
+ hint="Raise REELRECON_MAX_UPLOAD_BYTES to allow larger files.",
714
+ )
715
+
716
+ model, model_error = _validate_model(model_name)
717
+ if model_error:
718
+ return model_error
719
+ lang, lang_error = _normalize_language(language)
720
+ if lang_error:
721
+ return lang_error
722
+ preflight_error = _preflight_transcription()
723
+ if preflight_error:
724
+ return preflight_error
725
+
726
+ pipeline = _pipeline()
727
+ loop = asyncio.get_running_loop()
728
+ progress_callback = _ThreadSafeProgress(ctx, loop)
729
+ await _notify(ctx, "info", f"Starting local audio transcription for {source_path}")
730
+
731
+ result = await _run_pipeline_job(
732
+ ctx,
733
+ f"Local audio transcription of {source_path}",
734
+ pipeline.run_audio_file_transcription,
735
+ str(source_path),
736
+ original_filename=original_filename,
737
+ output_dir=DEFAULT_OUTPUT_DIR,
738
+ model_name=model,
739
+ language=lang,
740
+ progress_callback=progress_callback,
741
+ )
742
+ if result.get("status") != "ok":
743
+ return result
744
+
745
+ shaped = _shape_batch_result(
746
+ result,
747
+ include_transcript_text=include_transcript_text,
748
+ max_transcript_chars=max(int(max_transcript_chars), 0),
749
+ )
750
+ try:
751
+ await ctx.report_progress(progress=100.0, total=100.0, message="Local audio transcription completed")
752
+ except Exception:
753
+ pass
754
+ await _notify(ctx, "info", "Completed local audio transcription")
755
+ return shaped
756
+
757
+ @mcp.tool(
758
+ description=(
759
+ "List the most recent saved transcription batches from the local outputs directory. "
760
+ f"limit: 1-{MAX_LIST_LIMIT}, default 10. Corrupt manifests are reported with status 'unreadable' "
761
+ "instead of failing the whole listing."
762
+ ),
763
+ annotations=read_only,
764
+ )
765
+ def list_recent_batches(limit: int = 10) -> dict[str, Any]:
766
+ manifests = [_manifest_summary(path) for path in _recent_manifest_paths(_validate_limit(limit))]
767
+ return {
768
+ "status": "ok",
769
+ "output_root": str(DEFAULT_OUTPUT_DIR),
770
+ "count": len(manifests),
771
+ "batches": manifests,
772
+ }
773
+
774
+ @mcp.tool(
775
+ description=(
776
+ "Load a saved batch manifest by source_group and source_label (as returned by list_recent_batches "
777
+ "or a transcribe call). Set include_transcript_text=false to omit per-video transcript text."
778
+ ),
779
+ annotations=read_only,
780
+ )
781
+ def read_batch_manifest(
782
+ source_group: str,
783
+ source_label: str,
784
+ include_transcript_text: bool = True,
785
+ max_transcript_chars: int = 0,
786
+ ) -> dict[str, Any]:
787
+ group, group_error = _validate_slug_input(source_group, "source_group")
788
+ if group_error:
789
+ return group_error
790
+ label, label_error = _validate_slug_input(source_label, "source_label")
791
+ if label_error:
792
+ return label_error
793
+
794
+ try:
795
+ path = _manifest_path(group, label)
796
+ except FileNotFoundError as exc:
797
+ return _error("invalid_input", str(exc))
798
+ if not path.exists():
799
+ return _error(
800
+ "not_found",
801
+ f"Manifest not found for {group}/{label}",
802
+ hint="Call list_recent_batches to see what is available.",
803
+ available_batches=_known_batches(),
804
+ )
805
+ try:
806
+ payload = _load_json(path)
807
+ except (OSError, json.JSONDecodeError) as exc:
808
+ return _error(
809
+ "internal_error",
810
+ f"Manifest for {group}/{label} could not be read: {exc}",
811
+ hint="Re-run the transcription to regenerate this manifest.",
812
+ )
813
+
814
+ payload = _shape_batch_result(
815
+ _attach_resource_links(payload, manifest_path=path),
816
+ include_transcript_text=include_transcript_text,
817
+ max_transcript_chars=max(int(max_transcript_chars), 0),
818
+ )
819
+ return {
820
+ "status": "ok",
821
+ "manifest_file": str(path),
822
+ "manifest_resource": _manifest_resource_uri(group, label),
823
+ "batch": payload,
824
+ }
825
+
826
@mcp.tool(
    description=(
        "Load the saved transcript and metadata for a single processed video by source_group, source_label, "
        "and video_id (as returned by list_recent_batches / read_batch_manifest). "
        "max_transcript_chars>0 truncates the returned transcript text."
    ),
    annotations=read_only,
)
def read_video_output(
    source_group: str,
    source_label: str,
    video_id: str,
    max_transcript_chars: int = 0,
) -> dict[str, Any]:
    """Return the saved transcript and metadata for one processed video.

    Args:
        source_group: Group identifier; checked with ``_validate_slug_input``.
        source_label: Label identifier; checked with ``_validate_slug_input``.
        video_id: Video identifier; checked with ``_validate_slug_input``.
        max_transcript_chars: When greater than zero, the returned
            ``transcript_text`` is cut to this many characters and
            ``transcript_text_truncated`` is set. Zero or negative values
            disable truncation.

    Returns:
        A ``status: "ok"`` payload with file paths, the transcript resource
        URI, the transcript text and the parsed metadata, or the payload
        produced by ``_error`` when validation fails, the video output is
        missing, or a file cannot be read.
    """
    group, group_error = _validate_slug_input(source_group, "source_group")
    if group_error:
        return group_error
    label, label_error = _validate_slug_input(source_label, "source_label")
    if label_error:
        return label_error
    video, video_error = _validate_slug_input(video_id, "video_id")
    if video_error:
        return video_error

    try:
        run_dir = _video_dir(group, label, video)
    except FileNotFoundError as exc:
        return _error("invalid_input", str(exc))
    transcript_path = run_dir / "transcript.txt"
    metadata_path = run_dir / "metadata.json"
    audio_path = run_dir / "audio.mp3"

    # metadata.json is the marker for "this video was processed".
    if not metadata_path.exists():
        return _error(
            "not_found",
            f"Video output not found for {group}/{label}/{video}",
            hint="Call read_batch_manifest to see the video_id values in this batch.",
            available_batches=_known_batches(),
        )

    try:
        metadata = _load_json(metadata_path)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and, should the helper decode
        # strictly, UnicodeDecodeError from a corrupted file.
        return _error(
            "internal_error",
            f"Metadata for {group}/{label}/{video} could not be read: {exc}",
            hint="Re-run the transcription to regenerate this video's outputs.",
        )

    # Read once and derive presence from the outcome so that transcript_file,
    # transcript_text and transcript_chars always describe the same snapshot,
    # even if the file appears or disappears while this call is running.
    transcript_file: str | None
    try:
        transcript_text = transcript_path.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        # A missing transcript is not an error: report it as empty/absent.
        transcript_text = ""
        transcript_file = None
    except OSError as exc:
        return _error("internal_error", f"Transcript for {group}/{label}/{video} could not be read: {exc}")
    else:
        transcript_file = str(transcript_path)

    result: dict[str, Any] = {
        "status": "ok",
        "audio_file": str(audio_path) if audio_path.exists() else None,
        "transcript_file": transcript_file,
        "metadata_file": str(metadata_path),
        "transcript_resource": _transcript_resource_uri(group, label, video),
        # Always the full length, so callers can tell how much was cut off.
        "transcript_chars": len(transcript_text),
        "transcript_text": transcript_text,
        "metadata": metadata,
    }
    capped = max(int(max_transcript_chars), 0)
    if capped and len(transcript_text) > capped:
        result["transcript_text"] = transcript_text[:capped]
        result["transcript_text_truncated"] = True
    return result
896
+
897
@mcp.tool(
    description=(
        "Check server health: dependency status (whisper, yt-dlp, ffmpeg), output directory writability, "
        "GroqCloud configuration, job limits, and current job activity. Call this first when transcription "
        "tools fail unexpectedly."
    ),
    annotations=read_only,
)
async def check_health() -> dict[str, Any]:
    """Summarize dependency, output-directory and job state for this server.

    Returns:
        A payload whose ``status`` is ``"ok"`` when no problems were found
        and ``"degraded"`` otherwise; ``problems`` lists each finding.
    """
    # The dependency probe runs in a worker thread.
    dep_report = await asyncio.to_thread(_dependency_status)
    ffmpeg_location = shutil.which("ffmpeg")

    root = DEFAULT_OUTPUT_DIR
    root_present = root.is_dir()
    root_writable = False
    try:
        root.mkdir(parents=True, exist_ok=True)
        root_present = True
        root_writable = os.access(root, os.W_OK)
    except OSError:
        # Best effort: a failed mkdir leaves the directory reported as
        # not writable, with existence as observed before the attempt.
        pass

    # Order matters for readers of the report: ffmpeg, output dir, then deps.
    issues: list[str] = []
    if not ffmpeg_location:
        issues.append("ffmpeg is not on PATH")
    if not root_writable:
        issues.append(f"output directory is not writable: {root}")
    for dep_name, dep_state in dep_report.items():
        if str(dep_state).startswith("error"):
            issues.append(f"{dep_name}: {dep_state}")

    interpreter_version = sys.version.split()[0]
    batch_count = len(_recent_manifest_paths(MAX_LIST_LIMIT))
    has_groq_key = bool(os.environ.get("GROQ_API_KEY"))
    job_stats = {
        "active": _active_jobs,
        "abandoned_after_timeout": _abandoned_jobs,
        "max_concurrent": MAX_CONCURRENT_JOBS,
        "job_timeout_seconds": JOB_TIMEOUT_SECONDS,
        "queue_timeout_seconds": QUEUE_TIMEOUT_SECONDS,
    }
    model_names = sorted(_allowed_models())

    return {
        "status": "degraded" if issues else "ok",
        "problems": issues,
        "server": {"name": SERVER_NAME, "version": SERVER_VERSION},
        "python_version": interpreter_version,
        "dependencies": dep_report,
        "ffmpeg_path": ffmpeg_location,
        "output_root": str(root),
        "output_root_exists": root_present,
        "output_root_writable": root_writable,
        "saved_batches": batch_count,
        "groq_configured": has_groq_key,
        "jobs": job_stats,
        "allowed_models": model_names,
    }
948
+
949
+ return mcp
950
+
951
+
952
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the MCP server.

    Args:
        argv: Arguments to parse; ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        Namespace with ``transport``, ``host``, ``port`` and ``debug``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) for unknown options,
            an invalid transport, or a port outside 0-65535.
    """
    parser = argparse.ArgumentParser(
        description="Expose the IG Content Transcriber pipeline as an MCP server for other AI clients."
    )
    parser.add_argument(
        "--transport",
        choices=("stdio", "streamable-http", "sse"),
        default="stdio",
        help="MCP transport to run. stdio is the default and is the right choice for Claude/Cursor-style integrations.",
    )
    parser.add_argument("--host", default="127.0.0.1", help="Host for HTTP transports.")
    parser.add_argument("--port", type=int, default=8000, help="Port for HTTP transports.")
    parser.add_argument("--debug", action="store_true", help="Enable MCP server debug logging.")
    args = parser.parse_args(argv)
    # Reject impossible ports here with a usage message instead of letting the
    # HTTP transport fail later while binding. Port 0 stays valid (OS-assigned).
    if not 0 <= args.port <= 65535:
        parser.error(f"argument --port: must be between 0 and 65535 (got {args.port})")
    return args
966
+
967
+
968
def main(argv: list[str] | None = None) -> int:
    """Run the MCP server from the command line and return an exit status.

    Args:
        argv: Arguments forwarded to ``parse_args``. ``None`` (the default)
            parses ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list lets tests and embedding code drive
            the entry point without touching ``sys.argv``.

    Returns:
        ``0`` both after a normal shutdown and after Ctrl+C.
    """
    args = parse_args(argv)
    # Anything written to stdout would corrupt the stdio MCP framing; keep all
    # server-side logging on stderr.
    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.WARNING,
        stream=sys.stderr,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )
    server = build_server(host=args.host, port=args.port, debug=args.debug)
    try:
        server.run(transport=args.transport)
    except KeyboardInterrupt:
        # An interrupt is a deliberate stop, not a failure.
        logger.info("MCP server interrupted; shutting down.")
    return 0
984
+
985
+
986
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())