violin 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- .claude/skills/video-translator/SKILL.md +67 -0
- api/__init__.py +0 -0
- api/app.py +109 -0
- api/config.py +15 -0
- api/models.py +87 -0
- api/routes/__init__.py +0 -0
- api/routes/catalog.py +190 -0
- api/routes/chat.py +39 -0
- api/routes/files.py +133 -0
- api/routes/jobs.py +228 -0
- api/static/index.html +1644 -0
- api/stats.py +141 -0
- api/storage.py +241 -0
- api/usage.py +61 -0
- api/video_chat.py +185 -0
- api/worker.py +237 -0
- config/default.yaml +88 -0
- config/other_api.yaml +18 -0
- config/prod.yaml +21 -0
- main.py +219 -0
- pipeline/__init__.py +0 -0
- pipeline/config.py +49 -0
- pipeline/costs.py +123 -0
- pipeline/extractor.py +67 -0
- pipeline/ffmpeg_utils.py +47 -0
- pipeline/languages.py +48 -0
- pipeline/llm_client.py +212 -0
- pipeline/merger.py +642 -0
- pipeline/orchestrator.py +219 -0
- pipeline/pricing.py +47 -0
- pipeline/styles.py +60 -0
- pipeline/transcriber.py +451 -0
- pipeline/translator.py +281 -0
- pipeline/tts.py +122 -0
- pipeline/tts_elevenlabs.py +283 -0
- pipeline/tts_openai.py +141 -0
- pipeline/tts_together.py +150 -0
- prompts/__init__.py +34 -0
- prompts/styles.yaml +118 -0
- prompts/translate.yaml +106 -0
- prompts/video_chat.yaml +14 -0
- prompts/voice_match.yaml +16 -0
- run_api.py +65 -0
- violin-0.1.0.dist-info/METADATA +315 -0
- violin-0.1.0.dist-info/RECORD +48 -0
- violin-0.1.0.dist-info/WHEEL +4 -0
- violin-0.1.0.dist-info/entry_points.txt +3 -0
- violin-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: video-translator
|
|
3
|
+
description: Dub a video into another language and generate subtitles using the default Together + Cartesia stack. Trigger when the user wants to translate / dub / voice-over a video file, or generate subtitles for it. Handles `.mp4` / `.mkv` / `.webm`. Installs as the `violin` CLI (and `violin-api` for the FastAPI server) via `uv tool install`. For alternative models (OpenAI / ElevenLabs) or custom configs, point the user to the repo: https://github.com/shang-zhu/violin.
|
|
4
|
+
allowed-tools: Bash, Read
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Violin — operating skill
|
|
8
|
+
|
|
9
|
+
Always uses the default config (Together for translation, `cartesia/sonic-3` for TTS). If the user asks for OpenAI, ElevenLabs, or custom configs, **stop and point them to the Violin repo** — those flows aren't supported through the global CLI.
|
|
10
|
+
|
|
11
|
+
## Pre-flight
|
|
12
|
+
|
|
13
|
+
Run these silently first, discarding their output (e.g. `>/dev/null`) so the API key is never echoed. Abort if any fails:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
command -v violin # 1. CLI on PATH
|
|
17
|
+
test -f "<input>" # 2. Input exists
|
|
18
|
+
printenv TOGETHER_API_KEY # 3. Key available
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
If `violin` is missing: tell the user to `uv tool install violin`, then `violin --install-skill` to refresh this skill file. Do not auto-install.
|
|
22
|
+
|
|
23
|
+
If `TOGETHER_API_KEY` is missing:
|
|
24
|
+
- Inside the Violin repo → populate `.env` (auto-loaded)
|
|
25
|
+
- Elsewhere → `export TOGETHER_API_KEY=...` in `~/.zshrc` / `~/.bashrc`, then `source` it
|
|
26
|
+
|
|
27
|
+
## Decisions
|
|
28
|
+
|
|
29
|
+
- **CLI vs API**: one file, run once and wait for the result → CLI (`violin`). Multi-job / HTTP / web UI → API server (`violin-api`); print the command, don't auto-start it.
|
|
30
|
+
- **Style** (`--style`): default `standard`. Kids content → `kids`, formal/lecture → `academic`, casual → `casual`, dramatic → `storyteller`, news → `news`. Run `violin --style list` if unsure.
|
|
31
|
+
- **Voiceover**: keep default (mix dubbed audio over a quiet original). Use `--no-voiceover` only when the user explicitly says "replace audio entirely".
|
|
32
|
+
|
|
33
|
+
## Run
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
violin <input> <output> --language <Lang> [flags]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Flags
|
|
40
|
+
|
|
41
|
+
| Flag | Default | When to set |
|
|
42
|
+
|------|---------|-------------|
|
|
43
|
+
| `--language` / `-l` | *required* | Target language (e.g. `Chinese`, `Spanish`, `Japanese`). |
|
|
44
|
+
| `--voice` / `-v` | auto (native voice picked by `preferences.voice_gender`) | Only when the user names a specific voice from the catalog (e.g. `"warm female narrator"`). Otherwise omit and let the default kick in. |
|
|
45
|
+
| `--source-language` | `auto-detect` | Only if Whisper mis-detects the source language. |
|
|
46
|
+
| `--style` / `-s` | `standard` | See Decisions above. |
|
|
47
|
+
| `--no-subtitles` | off | User says "no SRT" / "video only". |
|
|
48
|
+
| `--no-voiceover` | off | User says "replace original audio entirely". |
|
|
49
|
+
| `--config` / `-c` | `config/default.yaml` | Don't use through this skill — repo-only flow. |
|
|
50
|
+
| `--timings-out` | off | Only when the user wants a per-step timing JSON for debugging / benchmarking. |
|
|
51
|
+
|
|
52
|
+
## Language coverage
|
|
53
|
+
|
|
54
|
+
33 target languages total. **16** ship with handpicked native-speaker voices: Chinese, Spanish, English, Hindi, Arabic, Portuguese, Russian, Japanese, Turkish, German, Korean, French, Italian, Polish, Dutch, Swedish. The other **17** fall back to the English voice catalog (multilingual under Cartesia Sonic 3) — quality is decent but the voice isn't a native speaker. Mention this caveat only if the user is translating to a fallback language and asks about voice quality.
|
|
55
|
+
|
|
56
|
+
## Report back
|
|
57
|
+
|
|
58
|
+
- Output video path + SRT path (printed by the run).
|
|
59
|
+
- Total cost (printed at end — surface, don't hide).
|
|
60
|
+
- If voiceover was on, mention the `_original.m4a` sidecar.
|
|
61
|
+
|
|
62
|
+
## Don'ts
|
|
63
|
+
|
|
64
|
+
- Don't run on multi-GB videos without first quoting the rough cost (audio length × per-provider rates in `pipeline/pricing.py`).
|
|
65
|
+
- Don't fabricate a "subtitles-only" mode — the CLI requires the full pipeline. If the user only wants SRT, run the full pipeline and hand them just the `.srt`, warning them of the cost first.
|
|
66
|
+
- Don't try to switch to OpenAI or ElevenLabs from this skill. Point the user to the repo + `--config config/other_api.yaml` (or their own override).
|
|
67
|
+
- Don't paraphrase the README. For supported languages (33), voice catalog, and full flag docs, point them at `README.md` or `violin --help`.
|
api/__init__.py
ADDED
|
File without changes
|
api/app.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""FastAPI application factory."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import pathlib
|
|
7
|
+
|
|
8
|
+
from fastapi import FastAPI
|
|
9
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
10
|
+
from fastapi.responses import FileResponse
|
|
11
|
+
from fastapi.staticfiles import StaticFiles
|
|
12
|
+
|
|
13
|
+
from .routes import catalog, chat, files, jobs
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
_STATIC = pathlib.Path(__file__).parent / "static"
|
|
18
|
+
|
|
19
|
+
# Allowed CORS origins, read from the comma-separated CORS_ORIGINS variable
# ("*" when unset). Each entry is stripped and empty entries are dropped so a
# value such as "https://a.example, https://b.example" does not yield
# " https://b.example", which would never equal a browser's Origin header.
_ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.environ.get("CORS_ORIGINS", "*").split(",")
    if origin.strip()
]
|
|
20
|
+
|
|
21
|
+
# The FastAPI application object; interactive docs are exposed at /docs and /redoc.
app = FastAPI(
    title="Violin API",
    description=(
        # NOTE(review): the bundled SKILL.md states 33 target languages —
        # confirm which count is current before changing this text.
        "Translate educational videos into 42 languages using Together AI. "
        "Upload a video, poll for status, then download the dubbed output."
    ),
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS: origins come from _ALLOWED_ORIGINS; all methods and headers are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=_ALLOWED_ORIGINS,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register the route groups defined under api/routes.
app.include_router(jobs.router)
app.include_router(files.router)
app.include_router(catalog.router)
app.include_router(chat.router)

# Serve the files in the package's "static" directory under /static.
app.mount("/static", StaticFiles(directory=str(_STATIC)), name="static")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@app.get("/", include_in_schema=False)
def root():
    """Serve ``static/index.html`` with a revalidate-on-every-load cache policy.

    The ``Cache-Control`` header forces the SPA shell to be revalidated so
    users do not keep seeing a stale UI after upgrading violin-api (notably on
    127.0.0.1 vs localhost, which Chrome caches as separate origins).
    """
    index_page = _STATIC / "index.html"
    cache_policy = {"Cache-Control": "no-cache, must-revalidate"}
    return FileResponse(str(index_page), headers=cache_policy)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Health probe registered for both GET and HEAD: some external monitors
# (e.g. UptimeRobot's free tier) only issue HEAD and would otherwise get 405.
@app.api_route("/health", methods=["GET", "HEAD"], include_in_schema=False)
def health():
    """Return a constant ``{"ok": True}`` payload."""
    payload = {"ok": True}
    return payload
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@app.get("/app-config", include_in_schema=False)
def app_config():
    """Expose selected values from ``api.config`` as a JSON object.

    The config module is imported inside the function rather than at module
    import time.
    """
    from . import config as _cfg

    settings = {}
    settings["url_upload"] = _cfg.URL_UPLOAD
    settings["max_duration_seconds"] = _cfg.MAX_DURATION_SECONDS
    settings["max_file_size_mb"] = _cfg.MAX_FILE_SIZE_MB
    # Used by the footer to hide privacy/auto-delete warnings on local deployments.
    settings["free_trial_jobs"] = _cfg.FREE_TRIAL_JOBS
    settings["job_ttl_hours"] = _cfg.JOB_TTL_HOURS
    return settings
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@app.on_event("startup")
async def _init_stats():
    """Startup hook: make sure the stats table exists.

    Idempotent — rows already stored are preserved across restarts.
    """
    from .stats import init_db

    init_db()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@app.on_event("startup")
async def _start_cleanup_loop():
    """Startup hook: launch the hourly expired-job cleanup task.

    Does nothing when ``JOB_TTL_HOURS`` is zero or negative. Otherwise a
    background task sleeps for an hour, calls ``cleanup_old_jobs`` with the
    configured TTL, logs how many jobs were removed, and repeats. Failures in
    a single pass are logged and do not stop the loop.
    """
    from .config import JOB_TTL_HOURS

    if JOB_TTL_HOURS <= 0:
        return

    async def _cleanup_loop():
        from .storage import cleanup_old_jobs

        while True:
            await asyncio.sleep(3600)
            try:
                deleted = cleanup_old_jobs(JOB_TTL_HOURS)
                if deleted:
                    logger.info("Cleaned up %d expired job(s)", deleted)
            except Exception:
                # Keep the loop alive; one failed pass must not end cleanup for good.
                logger.exception("Job cleanup failed")

    # The event loop only holds a weak reference to tasks, so an unreferenced
    # task may be garbage-collected mid-execution. Keep a strong reference on
    # the application state for the lifetime of the app.
    app.state.cleanup_task = asyncio.create_task(_cleanup_loop())
|
api/config.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""API configuration."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pipeline import config as _conf
|
|
6
|
+
|
|
7
|
+
# The "api" section of the pipeline configuration; a missing section raises KeyError.
_api = _conf.get()["api"]

# Required keys: a KeyError is raised at import time when either is absent.
JOBS_DIR = Path(_api["jobs_dir"])
MAX_WORKERS = _api["max_workers"]
# Optional keys, each with the fallback used when the key is absent.
JOB_TTL_HOURS = _api.get("job_ttl_hours", 24)
FREE_TRIAL_JOBS = _api.get("free_trial_jobs", 1)
URL_UPLOAD = _api.get("url_upload", True)
MAX_DURATION_SECONDS = _api.get("max_duration_seconds", 1800)
MAX_FILE_SIZE_MB = _api.get("max_file_size_mb", 500)
|
api/models.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Pydantic models for API requests and responses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JobStatus(str, Enum):
    """Status values a job can carry.

    Mixing in ``str`` makes each member compare equal to its plain string
    value (e.g. ``JobStatus.done == "done"``).
    """

    queued = "queued"
    running = "running"
    done = "done"
    failed = "failed"
    cancelled = "cancelled"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ProgressEvent(BaseModel):
    """One progress entry in a job's ``progress`` list."""

    # presumably the current step index out of `total` — confirm against the worker
    step: int
    total: int
    message: str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class JobResponse(BaseModel):
    """Serialized view of a job: its settings, status, progress and error."""

    id: str
    status: JobStatus
    language: str
    voice: str
    source_language: str
    subtitles: bool
    style: str = "standard"
    # Starts as an empty list when not supplied.
    progress: list[ProgressEvent] = Field(default_factory=list)
    # None unless an error message has been recorded.
    error: str | None = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class CreateJobRequest(BaseModel):
    """Used internally — the route uses Form() fields directly."""

    language: str
    # Empty string by default.
    voice: str = ""
    source_language: str = "auto-detect"
    subtitles: bool = True
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SubtitleSegment(BaseModel):
    """A single subtitle segment: id, start/end, text and speaker label."""

    id: int
    # presumably seconds from the start of the video — TODO confirm units
    start: float
    end: float
    text: str
    speaker: str = "SPEAKER_00"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ChatMessage(BaseModel):
    """One message of chat history (see ``VideoChatRequest.history``)."""

    # Free-form string; no set of allowed roles is enforced here.
    role: str
    content: str
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class VoiceMatchRequest(BaseModel):
    """Request body of the ``/voice-match`` endpoint."""

    # Natural-language description of the desired voice.
    description: str
    # Target language; empty string when not specified.
    language: str = ""
    # Optional per-request API keys; the endpoint passes non-blank values to
    # the translation client as key overrides.
    together_api_key: str = ""
    openai_api_key: str = ""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class VoiceCandidate(BaseModel):
    """A suggested voice name together with an explanation string."""

    voice: str
    explanation: str
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class VoiceMatchResponse(BaseModel):
    """Response body of the ``/voice-match`` endpoint."""

    candidates: list[VoiceCandidate]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class VideoChatRequest(BaseModel):
    """Request body of the per-job ``/chat`` endpoint."""

    question: str
    # Required and validated to be >= 0; presumably the playback position in
    # seconds — TODO confirm units.
    current_time: float = Field(ge=0)
    # Earlier messages; defaults to an empty list.
    history: list[ChatMessage] = Field(default_factory=list)
    language: str = ""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class VideoChatResponse(BaseModel):
    """Response body of the per-job ``/chat`` endpoint."""

    answer: str
    # presumably the time window the answer was based on — confirm in api/video_chat.py
    context_start: float
    context_end: float
    subtitle_context: list[SubtitleSegment] = Field(default_factory=list)
    sampled_timestamps: list[float] = Field(default_factory=list)
    style: str = "standard"
|
api/routes/__init__.py
ADDED
|
File without changes
|
api/routes/catalog.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Catalog endpoints: list supported languages, voices, and styles."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
from fastapi import APIRouter, HTTPException
|
|
11
|
+
|
|
12
|
+
from api.models import VoiceCandidate, VoiceMatchRequest, VoiceMatchResponse
|
|
13
|
+
from pipeline import config as _conf
|
|
14
|
+
from pipeline.languages import all_languages, language_code
|
|
15
|
+
from pipeline.llm_client import get_translation_model, make_translation_client
|
|
16
|
+
from pipeline.styles import list_styles
|
|
17
|
+
from pipeline.tts import all_voices, native_voices_for, voice_descriptions
|
|
18
|
+
import prompts as _prompts
|
|
19
|
+
|
|
20
|
+
# Load variables from a .env file at import time; override=True lets values
# from .env replace variables that are already set in the environment.
load_dotenv(override=True)

# Router for the catalog endpoints; routes are grouped under the "catalog" tag.
router = APIRouter(tags=["catalog"])
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@router.get("/languages")
def list_languages() -> dict[str, str]:
    """List every supported language as a name → BCP-47 code mapping."""
    languages = all_languages()
    return languages
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@router.get("/voices")
def list_voices() -> dict[str, list[str]]:
    """List all known native Cartesia Sonic 3 voices, keyed by BCP-47 language code."""
    voices_by_code = all_voices()
    return voices_by_code
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@router.get("/voices/{language}")
def voices_for_language(language: str) -> list[str]:
    """List the native voices for one language, given by name or BCP-47 code."""
    code = language_code(language)
    return native_voices_for(code)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@router.get("/styles")
def get_styles() -> list[dict]:
    """List every translation style profile as a plain dict.

    Each entry carries the profile's ``name``, ``description``, ``tts_speed``
    and ``tts_emotion``.
    """
    profiles: list[dict] = []
    for profile in list_styles():
        entry = {
            "name": profile.name,
            "description": profile.description,
            "tts_speed": profile.tts_speed,
            "tts_emotion": profile.tts_emotion,
        }
        profiles.append(entry)
    return profiles
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _build_voice_catalog(target_lang: str) -> str:
    """Render the voice catalog as text for the voice-matching LLM prompt.

    Voices are grouped under ``== <header> ==`` lines. When ``target_lang``
    resolves to a code present in the catalog, that group comes first and is
    followed by a blank line; the remaining groups follow in sorted code
    order. A voice whose description is missing or equal to its name is
    listed by name only; otherwise the description is appended after a dash.

    Args:
        target_lang: Target language, or an empty string for no preference.

    Returns:
        The catalog as a single newline-joined string.
    """
    catalog_by_code = all_voices()
    blurbs = voice_descriptions()
    preferred = language_code(target_lang) if target_lang else ""

    def _render(voice_name: str) -> str:
        blurb = blurbs.get(voice_name, "")
        if blurb and blurb != voice_name:
            return f" - {voice_name} — {blurb}"
        return f" - {voice_name}"

    sections: list[str] = []

    # Voices for the requested language go first.
    if preferred and preferred in catalog_by_code:
        sections.append(f"== Voices for target language ({preferred}) ==")
        sections.extend(_render(voice_name) for voice_name in catalog_by_code[preferred])
        sections.append("")

    for code, names in sorted(catalog_by_code.items()):
        if code != preferred:
            title = code if code != "multi" else "All voices (multilingual)"
            sections.append(f"== {title} ==")
            sections.extend(_render(voice_name) for voice_name in names)

    return "\n".join(sections)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
_MAX_VOICE_MATCH_RETRIES = 3


def _parse_voice_candidates(
    raw: str,
    all_ids: list[str],
    max_candidates: int = 3,
) -> list[VoiceCandidate]:
    """Extract up to ``max_candidates`` valid voice candidates from an LLM reply.

    The reply is cleaned (``<think>`` blocks and Markdown code fences are
    removed), the outermost ``[...]`` span is taken when present, and the
    result is parsed as JSON. Items may be plain strings (voice names) or
    objects with ``voice`` and optional ``explanation`` keys. Names are
    matched case-insensitively against ``all_ids`` and returned in their
    catalog spelling; unknown names and duplicates are skipped.

    Malformed items (wrong type, missing or null ``voice``) are skipped, so
    one bad entry does not discard the valid ones around it.

    Args:
        raw: Raw text returned by the LLM.
        all_ids: Every valid voice name.
        max_candidates: Maximum number of candidates to return.

    Returns:
        The accepted candidates in reply order; possibly empty.

    Raises:
        json.JSONDecodeError: If the cleaned reply is not valid JSON.
    """
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    raw = raw.removeprefix("```json").removeprefix("```").removesuffix("```").strip()

    # Greedy match: take everything from the first "[" to the last "]".
    bracket_match = re.search(r"\[.*\]", raw, flags=re.DOTALL)
    if bracket_match:
        raw = bracket_match.group(0)

    items = json.loads(raw)
    if isinstance(items, dict):
        items = [items]
    if not isinstance(items, list):
        # A bare number/string/null carries no candidate list.
        return []

    canonical = {voice_id.lower(): voice_id for voice_id in all_ids}

    candidates: list[VoiceCandidate] = []
    seen: set[str] = set()
    for item in items:
        if isinstance(item, str):
            name, explanation = item, ""
        elif isinstance(item, dict):
            name = item.get("voice")
            explanation = item.get("explanation")
        else:
            continue
        if not isinstance(name, str):
            continue

        voice = canonical.get(name.lower().strip())
        if not voice or voice in seen:
            continue
        seen.add(voice)

        # The model requires a string; fall back to "" for null or non-string values.
        if not isinstance(explanation, str):
            explanation = ""
        candidates.append(VoiceCandidate(voice=voice, explanation=explanation))
        if len(candidates) >= max_candidates:
            break
    return candidates
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@router.post("/voice-match", response_model=VoiceMatchResponse)
def match_voice(payload: VoiceMatchRequest):
    """Use an LLM to map a natural language voice description to the best voice in the catalog.

    Reuses the translation client + model (``models.translation``) — no separate
    configuration needed.

    The LLM is queried up to ``_MAX_VOICE_MATCH_RETRIES`` times; the first
    reply that yields at least one valid voice name is returned.

    Raises:
        HTTPException: 500 when the translation client cannot be created;
            502 when no attempt produced a valid candidate (the detail
            includes the last error seen).
    """
    cfg = _conf.get()
    try:
        client = make_translation_client(
            cfg,
            together_key_override=payload.together_api_key.strip() or None,
            openai_key_override=payload.openai_api_key.strip() or None,
        )
    except RuntimeError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    catalog = _build_voice_catalog(payload.language)

    # Flatten the per-language lists; dict.fromkeys de-duplicates while
    # keeping first-seen order.
    all_ids: list[str] = []
    for v_list in all_voices().values():
        all_ids.extend(v_list)
    all_ids = list(dict.fromkeys(all_ids))

    messages = [
        {
            "role": "system",
            "content": _prompts.load("voice_match", "system", catalog=catalog),
        },
        {
            "role": "user",
            "content": _prompts.load(
                "voice_match", "user",
                language=payload.language or "not specified",
                description=payload.description,
            ),
        },
    ]

    # voice_match reuses the translation client + model — same LLM, same provider.
    model = get_translation_model(cfg)

    last_error = ""
    for _ in range(_MAX_VOICE_MATCH_RETRIES):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.1,
                max_tokens=400,
            )
            raw = (response.choices[0].message.content or "").strip()

            if not raw:
                last_error = "LLM returned empty response"
                continue

            candidates = _parse_voice_candidates(raw, all_ids)
            if candidates:
                return VoiceMatchResponse(candidates=candidates)
            last_error = "parsed JSON but found no valid voice names"
        except Exception as exc:
            # Covers provider/network errors and json.JSONDecodeError from the
            # parser alike; remember the message and try again.
            last_error = str(exc)

    raise HTTPException(status_code=502, detail=f"Voice matching failed after {_MAX_VOICE_MATCH_RETRIES} attempts ({last_error})")
|
api/routes/chat.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Video chat endpoints for completed jobs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, HTTPException
|
|
6
|
+
|
|
7
|
+
from api.models import SubtitleSegment, VideoChatRequest, VideoChatResponse
|
|
8
|
+
from api.storage import get_job, load_segments
|
|
9
|
+
from api.video_chat import answer_video_question
|
|
10
|
+
|
|
11
|
+
# Router for the chat endpoints; every route is mounted under /jobs and tagged "chat".
router = APIRouter(prefix="/jobs", tags=["chat"])
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@router.get("/{job_id}/segments", response_model=list[SubtitleSegment])
def get_job_segments(job_id: str):
    """Return the stored subtitle segments of a finished job.

    Raises:
        HTTPException: 404 when the job does not exist; 409 when its status
            is not ``"done"``.
    """
    record = get_job(job_id)
    if record is None:
        raise HTTPException(status_code=404, detail=f"Job '{job_id}' not found.")
    if record.status != "done":
        raise HTTPException(status_code=409, detail=f"Job '{job_id}' is not complete.")

    segments = []
    for raw_segment in load_segments(job_id):
        segments.append(SubtitleSegment(**raw_segment))
    return segments
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@router.post("/{job_id}/chat", response_model=VideoChatResponse)
def chat_with_video(job_id: str, payload: VideoChatRequest):
    """Answer a question about a job's video via ``answer_video_question``.

    Exceptions are translated to HTTP errors:

    * ``FileNotFoundError`` → 404
    * ``RuntimeError`` → 409
    * any other exception → 500

    An ``HTTPException`` raised further down is passed through unchanged
    instead of being rewritten into a generic 500.
    """
    try:
        return answer_video_question(
            job_id=job_id,
            question=payload.question,
            current_time=payload.current_time,
            history=payload.history,
            language=payload.language,
        )
    except HTTPException:
        # Already carries the intended status code and detail.
        raise
    except FileNotFoundError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
api/routes/files.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""File download endpoints for completed jobs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
|
|
7
|
+
from fastapi import APIRouter, HTTPException, Query
|
|
8
|
+
from fastapi.responses import FileResponse
|
|
9
|
+
|
|
10
|
+
from api.models import JobStatus
|
|
11
|
+
from api.storage import get_job, original_audio_path, output_srt_path, output_video_path, voiceover_video_path
|
|
12
|
+
from pipeline.ffmpeg_utils import FFMPEG_EXE
|
|
13
|
+
|
|
14
|
+
# Router for the download endpoints; every route is mounted under /jobs and tagged "files".
router = APIRouter(prefix="/jobs", tags=["files"])
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _assert_done(job_id: str) -> None:
    """Guard used by the download routes: require an existing, finished job.

    Raises:
        HTTPException: 404 when the job is unknown; 409 (with the current
            status in the detail) when its status is not ``JobStatus.done``.
    """
    record = get_job(job_id)
    if record is None:
        raise HTTPException(status_code=404, detail=f"Job '{job_id}' not found.")
    if record.status == JobStatus.done:
        return
    raise HTTPException(
        status_code=409,
        detail=f"Job '{job_id}' is not complete (status: {record.status}).",
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@router.get("/{job_id}/video", response_class=FileResponse)
def download_video(job_id: str):
    """Send the dubbed output video of a finished job as ``<job_id>_dubbed.mp4``.

    Raises:
        HTTPException: 404/409 from the job-status guard; 404 when the output
            file is missing on disk.
    """
    _assert_done(job_id)
    video_file = output_video_path(job_id)
    if video_file.exists():
        return FileResponse(
            path=str(video_file),
            media_type="video/mp4",
            filename=f"{job_id}_dubbed.mp4",
        )
    raise HTTPException(status_code=404, detail="Output video not found.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@router.get("/{job_id}/original-audio")
def get_original_audio(job_id: str):
    """Send the original audio track (aligned to the dubbed timeline) for voice-over mixing.

    The file is delivered as ``<job_id>_original.m4a``.

    Raises:
        HTTPException: 404/409 from the job-status guard; 404 when no
            original-audio file exists for the job.
    """
    _assert_done(job_id)
    audio_file = original_audio_path(job_id)
    if audio_file.exists():
        return FileResponse(
            path=str(audio_file),
            media_type="audio/mp4",
            filename=f"{job_id}_original.m4a",
        )
    raise HTTPException(
        status_code=404,
        detail="Original audio track not available. The job may not have used voice-over mode.",
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@router.get("/{job_id}/video-voiceover", response_class=FileResponse)
def download_voiceover_video(
    job_id: str,
    volume: float = Query(0.1, ge=0.0, le=1.0, description="Original audio volume (0.0–1.0)"),
):
    """Send the dubbed video with the original audio mixed in at ``volume``.

    The mixed file is cached on disk together with the volume it was mixed
    at; it is re-mixed only when missing or when the requested volume differs
    from the recorded one. The file is delivered as
    ``<job_id>_voiceover.mp4``.

    Raises:
        HTTPException: 404/409 from the job-status guard; 404 when the dubbed
            video or the original audio file is missing.
    """
    _assert_done(job_id)

    dubbed_video = output_video_path(job_id)
    source_audio = original_audio_path(job_id)
    if not dubbed_video.exists():
        raise HTTPException(status_code=404, detail="Output video not found.")
    if not source_audio.exists():
        raise HTTPException(status_code=404, detail="Original audio not available. Job may not have used voice-over mode.")

    mixed_video = voiceover_video_path(job_id)
    cache_is_valid = mixed_video.exists() and _volume_matches(job_id, volume)
    if not cache_is_valid:
        _mix_voiceover(str(dubbed_video), str(source_audio), str(mixed_video), volume)
        _save_volume(job_id, volume)

    return FileResponse(
        path=str(mixed_video),
        media_type="video/mp4",
        filename=f"{job_id}_voiceover.mp4",
    )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _mix_voiceover(video_path: str, audio_path: str, output_path: str, volume: float) -> None:
    """Mix original audio into the dubbed video using ffmpeg.

    The dubbed track is kept at full volume, the original track is scaled by
    ``volume``, and the two are mixed for the duration of the dubbed track.
    The video stream is copied and the mixed audio is encoded as AAC.

    ffmpeg writes to a uniquely named temporary file next to ``output_path``,
    which is moved into place only after ffmpeg succeeds. A failed or
    concurrent run therefore never leaves a truncated file at ``output_path``
    that a later request could serve from cache.

    Args:
        video_path: Dubbed video (input 0).
        audio_path: Original audio track (input 1).
        output_path: Destination of the mixed video.
        volume: Gain applied to the original audio.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import os
    import uuid

    # Keep the extension so ffmpeg selects the same container format.
    stem, ext = os.path.splitext(output_path)
    tmp_path = f"{stem}.{uuid.uuid4().hex}.tmp{ext or '.mp4'}"
    try:
        subprocess.run([
            FFMPEG_EXE,
            "-i", video_path,
            "-i", audio_path,
            "-filter_complex",
            f"[0:a]volume=1.0[dub];[1:a]volume={volume}[orig];[dub][orig]amix=inputs=2:duration=first[out]",
            "-map", "0:v",
            "-map", "[out]",
            "-c:v", "copy",
            "-c:a", "aac",
            "-movflags", "+faststart",
            "-y", tmp_path,
        ], check=True, capture_output=True)
        # Atomic on the same filesystem: readers see the old or the new file.
        os.replace(tmp_path, output_path)
    finally:
        # No-op after a successful replace; removes the partial file on failure.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _volume_matches(job_id: str, volume: float) -> bool:
    """Tell whether the cached voice-over mix was produced at ``volume``.

    Reads the ``.vol`` file stored next to the cached voice-over video and
    compares its value with ``volume`` using a tolerance of 0.001. A missing,
    unreadable or non-numeric file counts as a mismatch.
    """
    marker = voiceover_video_path(job_id).with_suffix(".vol")
    if not marker.exists():
        return False
    try:
        recorded = float(marker.read_text().strip())
    except (ValueError, OSError):
        return False
    return abs(recorded - volume) < 0.001
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _save_volume(job_id: str, volume: float) -> None:
    """Record the mix volume in the ``.vol`` file next to the cached voice-over video."""
    marker = voiceover_video_path(job_id).with_suffix(".vol")
    marker.write_text(str(volume))
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@router.get("/{job_id}/srt", response_class=FileResponse)
def download_srt(job_id: str):
    """Send the SRT subtitle file of a finished job as ``<job_id>.srt``.

    Raises:
        HTTPException: 404/409 from the job-status guard; 404 when no SRT
            file exists (for instance when the job ran with subtitles=false).
    """
    _assert_done(job_id)
    srt_file = output_srt_path(job_id)
    if srt_file.exists():
        return FileResponse(
            path=str(srt_file),
            media_type="text/plain; charset=utf-8",
            filename=f"{job_id}.srt",
        )
    raise HTTPException(
        status_code=404,
        detail="SRT file not found. The job may have been created with subtitles=false.",
    )
|