@laitszkin/apollo-toolkit 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +62 -0
- package/CHANGELOG.md +100 -0
- package/LICENSE +21 -0
- package/README.md +144 -0
- package/align-project-documents/SKILL.md +94 -0
- package/align-project-documents/agents/openai.yaml +4 -0
- package/analyse-app-logs/LICENSE +21 -0
- package/analyse-app-logs/README.md +126 -0
- package/analyse-app-logs/SKILL.md +121 -0
- package/analyse-app-logs/agents/openai.yaml +4 -0
- package/analyse-app-logs/references/investigation-checklist.md +58 -0
- package/analyse-app-logs/references/log-signal-patterns.md +52 -0
- package/answering-questions-with-research/SKILL.md +46 -0
- package/answering-questions-with-research/agents/openai.yaml +4 -0
- package/bin/apollo-toolkit.js +7 -0
- package/commit-and-push/LICENSE +21 -0
- package/commit-and-push/README.md +26 -0
- package/commit-and-push/SKILL.md +70 -0
- package/commit-and-push/agents/openai.yaml +4 -0
- package/commit-and-push/references/branch-naming.md +15 -0
- package/commit-and-push/references/commit-messages.md +19 -0
- package/deep-research-topics/LICENSE +21 -0
- package/deep-research-topics/README.md +43 -0
- package/deep-research-topics/SKILL.md +84 -0
- package/deep-research-topics/agents/openai.yaml +4 -0
- package/develop-new-features/LICENSE +21 -0
- package/develop-new-features/README.md +52 -0
- package/develop-new-features/SKILL.md +105 -0
- package/develop-new-features/agents/openai.yaml +4 -0
- package/develop-new-features/references/testing-e2e.md +35 -0
- package/develop-new-features/references/testing-integration.md +42 -0
- package/develop-new-features/references/testing-property-based.md +44 -0
- package/develop-new-features/references/testing-unit.md +37 -0
- package/discover-edge-cases/CHANGELOG.md +19 -0
- package/discover-edge-cases/LICENSE +21 -0
- package/discover-edge-cases/README.md +87 -0
- package/discover-edge-cases/SKILL.md +124 -0
- package/discover-edge-cases/agents/openai.yaml +4 -0
- package/discover-edge-cases/references/architecture-edge-cases.md +41 -0
- package/discover-edge-cases/references/code-edge-cases.md +46 -0
- package/docs-to-voice/.env.example +106 -0
- package/docs-to-voice/CHANGELOG.md +71 -0
- package/docs-to-voice/LICENSE +21 -0
- package/docs-to-voice/README.md +118 -0
- package/docs-to-voice/SKILL.md +107 -0
- package/docs-to-voice/agents/openai.yaml +4 -0
- package/docs-to-voice/scripts/docs_to_voice.py +1385 -0
- package/docs-to-voice/scripts/docs_to_voice.sh +11 -0
- package/docs-to-voice/tests/test_docs_to_voice_api_max_chars.py +210 -0
- package/docs-to-voice/tests/test_docs_to_voice_sentence_timeline.py +115 -0
- package/docs-to-voice/tests/test_docs_to_voice_settings.py +43 -0
- package/docs-to-voice/tests/test_docs_to_voice_speech_rate.py +57 -0
- package/enhance-existing-features/CHANGELOG.md +35 -0
- package/enhance-existing-features/LICENSE +21 -0
- package/enhance-existing-features/README.md +54 -0
- package/enhance-existing-features/SKILL.md +120 -0
- package/enhance-existing-features/agents/openai.yaml +4 -0
- package/enhance-existing-features/references/e2e-tests.md +25 -0
- package/enhance-existing-features/references/integration-tests.md +30 -0
- package/enhance-existing-features/references/property-based-tests.md +33 -0
- package/enhance-existing-features/references/unit-tests.md +29 -0
- package/feature-propose/LICENSE +21 -0
- package/feature-propose/README.md +23 -0
- package/feature-propose/SKILL.md +107 -0
- package/feature-propose/agents/openai.yaml +4 -0
- package/feature-propose/references/enhancement-features.md +25 -0
- package/feature-propose/references/important-features.md +25 -0
- package/feature-propose/references/mvp-features.md +25 -0
- package/feature-propose/references/performance-features.md +25 -0
- package/financial-research/SKILL.md +208 -0
- package/financial-research/agents/openai.yaml +4 -0
- package/financial-research/assets/weekly_market_report_template.md +45 -0
- package/fix-github-issues/SKILL.md +98 -0
- package/fix-github-issues/agents/openai.yaml +4 -0
- package/fix-github-issues/scripts/list_issues.py +148 -0
- package/fix-github-issues/tests/test_list_issues.py +127 -0
- package/generate-spec/LICENSE +21 -0
- package/generate-spec/README.md +61 -0
- package/generate-spec/SKILL.md +96 -0
- package/generate-spec/agents/openai.yaml +4 -0
- package/generate-spec/references/templates/checklist.md +78 -0
- package/generate-spec/references/templates/spec.md +55 -0
- package/generate-spec/references/templates/tasks.md +35 -0
- package/generate-spec/scripts/create-specs +123 -0
- package/harden-app-security/CHANGELOG.md +27 -0
- package/harden-app-security/LICENSE +21 -0
- package/harden-app-security/README.md +46 -0
- package/harden-app-security/SKILL.md +127 -0
- package/harden-app-security/agents/openai.yaml +4 -0
- package/harden-app-security/references/agent-attack-catalog.md +117 -0
- package/harden-app-security/references/common-software-attack-catalog.md +168 -0
- package/harden-app-security/references/red-team-extreme-scenarios.md +81 -0
- package/harden-app-security/references/risk-checklist.md +78 -0
- package/harden-app-security/references/security-test-patterns-agent.md +101 -0
- package/harden-app-security/references/security-test-patterns-finance.md +88 -0
- package/harden-app-security/references/test-snippets.md +73 -0
- package/improve-observability/SKILL.md +114 -0
- package/improve-observability/agents/openai.yaml +4 -0
- package/learn-skill-from-conversations/CHANGELOG.md +15 -0
- package/learn-skill-from-conversations/LICENSE +22 -0
- package/learn-skill-from-conversations/README.md +47 -0
- package/learn-skill-from-conversations/SKILL.md +85 -0
- package/learn-skill-from-conversations/agents/openai.yaml +4 -0
- package/learn-skill-from-conversations/scripts/extract_recent_conversations.py +369 -0
- package/learn-skill-from-conversations/tests/test_extract_recent_conversations.py +176 -0
- package/learning-error-book/SKILL.md +112 -0
- package/learning-error-book/agents/openai.yaml +4 -0
- package/learning-error-book/assets/error_book_template.md +66 -0
- package/learning-error-book/scripts/render_markdown_to_pdf.py +367 -0
- package/lib/cli.js +338 -0
- package/lib/installer.js +225 -0
- package/maintain-project-constraints/SKILL.md +109 -0
- package/maintain-project-constraints/agents/openai.yaml +4 -0
- package/maintain-skill-catalog/README.md +18 -0
- package/maintain-skill-catalog/SKILL.md +66 -0
- package/maintain-skill-catalog/agents/openai.yaml +4 -0
- package/novel-to-short-video/CHANGELOG.md +53 -0
- package/novel-to-short-video/LICENSE +21 -0
- package/novel-to-short-video/README.md +63 -0
- package/novel-to-short-video/SKILL.md +233 -0
- package/novel-to-short-video/agents/openai.yaml +4 -0
- package/novel-to-short-video/references/plan-template.md +71 -0
- package/novel-to-short-video/references/roles-json.md +41 -0
- package/open-github-issue/LICENSE +21 -0
- package/open-github-issue/README.md +97 -0
- package/open-github-issue/SKILL.md +119 -0
- package/open-github-issue/agents/openai.yaml +4 -0
- package/open-github-issue/scripts/open_github_issue.py +380 -0
- package/open-github-issue/tests/test_open_github_issue.py +159 -0
- package/open-source-pr-workflow/CHANGELOG.md +32 -0
- package/open-source-pr-workflow/LICENSE +21 -0
- package/open-source-pr-workflow/README.md +23 -0
- package/open-source-pr-workflow/SKILL.md +123 -0
- package/open-source-pr-workflow/agents/openai.yaml +4 -0
- package/openai-text-to-image-storyboard/.env.example +10 -0
- package/openai-text-to-image-storyboard/CHANGELOG.md +49 -0
- package/openai-text-to-image-storyboard/LICENSE +21 -0
- package/openai-text-to-image-storyboard/README.md +99 -0
- package/openai-text-to-image-storyboard/SKILL.md +107 -0
- package/openai-text-to-image-storyboard/agents/openai.yaml +4 -0
- package/openai-text-to-image-storyboard/scripts/generate_storyboard_images.py +763 -0
- package/package.json +36 -0
- package/record-spending/SKILL.md +113 -0
- package/record-spending/agents/openai.yaml +4 -0
- package/record-spending/references/account-format.md +33 -0
- package/record-spending/references/workbook-layout.md +84 -0
- package/resolve-review-comments/SKILL.md +122 -0
- package/resolve-review-comments/agents/openai.yaml +4 -0
- package/resolve-review-comments/references/adoption-criteria.md +23 -0
- package/resolve-review-comments/scripts/review_threads.py +425 -0
- package/resolve-review-comments/tests/test_review_threads.py +74 -0
- package/review-change-set/LICENSE +21 -0
- package/review-change-set/README.md +55 -0
- package/review-change-set/SKILL.md +103 -0
- package/review-change-set/agents/openai.yaml +4 -0
- package/review-codebases/LICENSE +21 -0
- package/review-codebases/README.md +67 -0
- package/review-codebases/SKILL.md +109 -0
- package/review-codebases/agents/openai.yaml +4 -0
- package/scripts/install_skills.ps1 +283 -0
- package/scripts/install_skills.sh +262 -0
- package/scripts/validate_openai_agent_config.py +194 -0
- package/scripts/validate_skill_frontmatter.py +110 -0
- package/specs-to-project-docs/LICENSE +21 -0
- package/specs-to-project-docs/README.md +57 -0
- package/specs-to-project-docs/SKILL.md +111 -0
- package/specs-to-project-docs/agents/openai.yaml +4 -0
- package/specs-to-project-docs/references/templates/architecture.md +29 -0
- package/specs-to-project-docs/references/templates/configuration.md +29 -0
- package/specs-to-project-docs/references/templates/developer-guide.md +33 -0
- package/specs-to-project-docs/references/templates/docs-index.md +39 -0
- package/specs-to-project-docs/references/templates/features.md +25 -0
- package/specs-to-project-docs/references/templates/getting-started.md +38 -0
- package/specs-to-project-docs/references/templates/readme.md +49 -0
- package/systematic-debug/LICENSE +21 -0
- package/systematic-debug/README.md +81 -0
- package/systematic-debug/SKILL.md +59 -0
- package/systematic-debug/agents/openai.yaml +4 -0
- package/text-to-short-video/.env.example +36 -0
- package/text-to-short-video/LICENSE +21 -0
- package/text-to-short-video/README.md +82 -0
- package/text-to-short-video/SKILL.md +221 -0
- package/text-to-short-video/agents/openai.yaml +4 -0
- package/text-to-short-video/scripts/enforce_video_aspect_ratio.py +350 -0
- package/version-release/CHANGELOG.md +53 -0
- package/version-release/LICENSE +21 -0
- package/version-release/README.md +28 -0
- package/version-release/SKILL.md +94 -0
- package/version-release/agents/openai.yaml +4 -0
- package/version-release/references/branch-naming.md +15 -0
- package/version-release/references/changelog-writing.md +8 -0
- package/version-release/references/commit-messages.md +19 -0
- package/version-release/references/readme-writing.md +12 -0
- package/version-release/references/semantic-versioning.md +12 -0
- package/video-production/CHANGELOG.md +104 -0
- package/video-production/LICENSE +18 -0
- package/video-production/README.md +68 -0
- package/video-production/SKILL.md +213 -0
- package/video-production/agents/openai.yaml +4 -0
- package/video-production/references/plan-template.md +54 -0
- package/video-production/references/roles-json.md +41 -0
- package/weekly-financial-event-report/SKILL.md +195 -0
- package/weekly-financial-event-report/agents/openai.yaml +4 -0
- package/weekly-financial-event-report/assets/financial_event_report_template.md +53 -0
|
@@ -0,0 +1,1385 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Convert text or text files into audio and sentence timelines."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import base64
|
|
6
|
+
import datetime as dt
|
|
7
|
+
import http.client
|
|
8
|
+
import json
|
|
9
|
+
import math
|
|
10
|
+
import os
|
|
11
|
+
import pathlib
|
|
12
|
+
import re
|
|
13
|
+
import shutil
|
|
14
|
+
import subprocess
|
|
15
|
+
import sys
|
|
16
|
+
import tempfile
|
|
17
|
+
import urllib.error
|
|
18
|
+
import urllib.parse
|
|
19
|
+
import urllib.request
|
|
20
|
+
import wave
|
|
21
|
+
|
|
22
|
+
# aifc is optional: the module has been removed from the standard library in
# newer Python versions, so AIFF support degrades gracefully when it is None.
try:
    import aifc  # type: ignore
except Exception:  # pragma: no cover
    aifc = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Default DashScope (Alibaba Cloud Model Studio) multimodal-generation endpoint
# used for the "api" TTS mode.
DEFAULT_API_ENDPOINT = (
    "https://dashscope-intl.aliyuncs.com/api/v1/services/"
    "aigc/multimodal-generation/generation"
)
DEFAULT_API_MODEL = "qwen3-tts"
DEFAULT_API_VOICE = "Cherry"
# NOTE(review): presumably the text length used when probing the API's
# per-request character limit -- confirm against the call sites, which are
# outside this chunk.
DEFAULT_API_MAX_CHARS_PROBE_LENGTH = 5000
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DocsToVoiceError(Exception):
    """Raised for user-facing failures; the message is shown to the CLI user."""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_args(argv):
    """Parse CLI arguments for the docs-to-voice converter.

    Requires --project-dir plus exactly one of --text / --input-file.
    Returns the populated argparse.Namespace.
    """
    parser = argparse.ArgumentParser(
        prog="docs_to_voice.py",
        description="Convert text into speech and generate timeline JSON/SRT files.",
    )

    # Output location and environment configuration.
    parser.add_argument("--project-dir", required=True, help="Root project directory")
    parser.add_argument("--project-name", help="Folder name under DIR/audio/")
    parser.add_argument("--output-name", help="Output filename")
    parser.add_argument("--env-file", help="Path to .env file")
    # Synthesis backend selection and tuning (macOS `say` vs remote API).
    parser.add_argument("--mode", help="TTS mode: say|api")
    parser.add_argument("--voice", help="macOS say voice")
    parser.add_argument("--rate", help="macOS say rate (WPM)")
    parser.add_argument(
        "--speech-rate",
        help="Speech rate multiplier applied after synthesis (e.g. 1.2 faster, 0.8 slower)",
    )
    parser.add_argument("--api-endpoint", help="Model Studio TTS endpoint")
    parser.add_argument("--api-model", help="Model Studio model name")
    parser.add_argument("--api-voice", help="Model Studio voice")
    parser.add_argument(
        "--max-chars",
        help="Max chars per TTS request before auto chunking (0 disables chunking)",
    )
    parser.add_argument(
        "--no-auto-prosody",
        action="store_true",
        help="Disable punctuation pause enhancement in say mode",
    )
    parser.add_argument(
        "--force", action="store_true", help="Overwrite output if it already exists"
    )

    # Exactly one text source must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--text", help="Raw text input")
    input_group.add_argument("--input-file", help="Path to input text file")

    return parser.parse_args(argv)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def trim(value):
    """Return *value* with leading and trailing whitespace removed."""
    stripped = value.strip()
    return stripped
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def strip_wrapping_quotes(value):
    """Drop one matching pair of surrounding single or double quotes, if present."""
    if len(value) < 2:
        return value
    first, last = value[0], value[-1]
    if first == last and first in ('"', "'"):
        return value[1:-1]
    return value
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def load_env_file(file_path):
    """Parse KEY=VALUE lines (optionally `export`-prefixed) from *file_path*.

    Missing files yield an empty dict. Blank lines, comments, and lines that
    are not assignments are skipped; later assignments override earlier ones.
    """
    parsed = {}
    if not file_path.is_file():
        return parsed

    assignment = re.compile(r"^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$")
    content = file_path.read_text(encoding="utf-8", errors="replace")
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue  # blank line or comment
        match = assignment.match(line)
        if match is None:
            continue  # not a KEY=VALUE assignment
        value = match.group(2).strip()
        # Strip one matching pair of wrapping quotes, mirroring shell-style env files.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
            value = value[1:-1]
        parsed[match.group(1)] = value
    return parsed
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def resolve_setting(cli_value, env_key, env_values, default=""):
    """Resolve a setting with precedence: CLI flag > .env file > process env > default."""
    if cli_value is not None and str(cli_value).strip():
        return cli_value
    for candidate in (env_values.get(env_key, ""), os.environ.get(env_key)):
        if candidate:
            return candidate
    return default
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def normalize_mode(raw_mode):
    """Return the normalized TTS mode ("say" or "api"); default is "say".

    Raises DocsToVoiceError for any other value.
    """
    mode = (raw_mode or "say").strip().lower()
    if mode in ("say", "api"):
        return mode
    raise DocsToVoiceError("--mode must be one of: say, api")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def ensure_command(name, hint):
    """Raise DocsToVoiceError(*hint*) when *name* is not found on PATH."""
    if shutil.which(name) is None:
        raise DocsToVoiceError(hint)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def read_input_text(args):
    """Return the input text: the contents of --input-file when given, else --text.

    Raises DocsToVoiceError when the input file does not exist.
    """
    if not args.input_file:
        return args.text or ""

    path = pathlib.Path(args.input_file).expanduser()
    if not path.is_absolute():
        path = (pathlib.Path.cwd() / path).resolve()
    if not path.is_file():
        raise DocsToVoiceError("Input file not found: {0}".format(path))
    return path.read_text(encoding="utf-8", errors="replace")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def normalize_project_dir(project_dir):
    """Expand ~ and return an absolute, resolved Path for the project root."""
    candidate = pathlib.Path(project_dir).expanduser()
    if candidate.is_absolute():
        return candidate.resolve()
    return (pathlib.Path.cwd() / candidate).resolve()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def extract_extension_from_url(source_url):
    """Return the lowercase file extension from a URL's path, or "" when absent."""
    url_path = urllib.parse.urlparse(source_url).path
    name = pathlib.Path(url_path).name
    _, dot, suffix = name.rpartition(".")
    if not dot:
        return ""
    return suffix.strip().lower()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def extract_extension_from_audio_format(raw_format):
    """Map a MIME type or format label to a lowercase extension (e.g. "audio/mpeg" -> "mp3")."""
    if not raw_format:
        return ""

    value = raw_format.strip().lower().lstrip(".")
    # Drop any MIME parameters ("; rate=...") and the leading type ("audio/").
    value = value.split(";", 1)[0].strip()
    value = value.split("/")[-1]

    aliases = {
        "x-wav": "wav",
        "mpeg": "mp3",
        "x-m4a": "m4a",
        "x-aiff": "aiff",
    }
    return aliases.get(value, value)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def determine_api_extension(api_result):
    """Pick an output extension for an API response: URL suffix, then audio_format, then "wav"."""
    ext = ""
    audio_url = api_result.get("audio_url")
    if audio_url:
        ext = extract_extension_from_url(audio_url)
    if not ext:
        ext = extract_extension_from_audio_format(api_result.get("audio_format", ""))
    return ext or "wav"
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def split_sentences(raw_text):
    """Split text into sentences on CJK/ASCII terminators, line by line.

    The terminator stays attached to its sentence; blank lines are skipped and
    an unterminated tail on a line becomes its own sentence.
    """
    terminators = set("。!?!?;;")
    sentences = []

    for raw_line in raw_text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        buffer = []
        for char in line:
            buffer.append(char)
            if char in terminators:
                piece = "".join(buffer).strip()
                if piece:
                    sentences.append(piece)
                buffer = []

        remainder = "".join(buffer).strip()
        if remainder:
            sentences.append(remainder)

    return sentences
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def extract_timeline_sentences(source_text):
    """Return timeline sentences, falling back to the whole trimmed text.

    Raises DocsToVoiceError when the text is empty after trimming.
    """
    sentences = split_sentences(source_text)
    if sentences:
        return sentences

    fallback = source_text.strip()
    if fallback:
        return [fallback]

    raise DocsToVoiceError("No text content found for timeline generation.")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def read_duration_seconds(file_path):
    """Best-effort audio duration in seconds; None when it cannot be determined.

    Tries the stdlib wave/aifc readers first, then parses macOS `afinfo` output.
    """
    suffix = file_path.suffix.lower()

    # Stdlib readers for uncompressed formats.
    try:
        if suffix == ".wav":
            with wave.open(str(file_path), "rb") as handle:
                rate = handle.getframerate()
                if rate > 0:
                    return handle.getnframes() / float(rate)
        elif suffix in {".aiff", ".aif", ".aifc"} and aifc is not None:
            with aifc.open(str(file_path), "rb") as handle:
                rate = handle.getframerate()
                if rate > 0:
                    return handle.getnframes() / float(rate)
    except Exception:
        pass  # fall through to afinfo

    # macOS `afinfo` fallback; absent or failing afinfo means "unknown".
    try:
        proc = subprocess.run(
            ["afinfo", str(file_path)],
            check=False,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        return None
    if proc.returncode != 0:
        return None

    combined = "{0}\n{1}".format(proc.stdout, proc.stderr)
    duration_patterns = (
        r"estimated duration:\s*([0-9.]+)\s*sec",
        r"duration:\s*([0-9.]+)\s*sec",
        r"duration:\s*([0-9.]+)",
    )
    for pattern in duration_patterns:
        match = re.search(pattern, combined, flags=re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                return None
    return None
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def sentence_weight(sentence):
    """Heuristic speaking-time weight for a sentence; CJK characters weigh most.

    Whitespace is ignored; the result is floored at 1.0.
    """
    compact = re.sub(r"\s+", "", sentence)
    if not compact:
        return 1.0

    weight = 0.0
    for char in compact:
        if "A" <= char <= "Z" or "a" <= char <= "z" or "0" <= char <= "9":
            weight += 0.55  # Latin letters / digits
        elif "\u4e00" <= char <= "\u9fff":
            weight += 1.0  # CJK ideographs
        elif char in ",,、::":
            weight += 0.25  # minor pause punctuation
        elif char in "。.!!??;;":
            weight += 0.45  # sentence-final punctuation
        else:
            weight += 0.65  # everything else
    return max(weight, 1.0)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def srt_time(seconds):
    """Format *seconds* as an SRT timestamp HH:MM:SS,mmm (negative values clamp to zero)."""
    total_ms = int(round(max(seconds, 0.0) * 1000))
    secs, millis = divmod(total_ms, 1000)
    minutes, secs = divmod(secs, 60)
    hours, minutes = divmod(minutes, 60)
    return "{0:02}:{1:02}:{2:02},{3:03}".format(hours, minutes, secs, millis)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def write_sentence_timeline_files(
    source_text,
    audio_path,
    sentence_durations=None,
    timing_mode_hint=None,
):
    """Write `<audio>.timeline.json` and `<audio>.srt` next to *audio_path*.

    Sentence timings come from *sentence_durations* when one duration per
    sentence is supplied; otherwise they are apportioned from the audio file's
    measured duration using per-sentence heuristic weights. When no duration
    can be measured at all, an estimate is derived from the weights.
    """
    sentences = extract_timeline_sentences(source_text)
    duration_seconds = read_duration_seconds(audio_path)

    entries = []
    timing_mode = "duration-weighted"

    # Path 1: explicit per-sentence durations (must match the sentence count).
    if sentence_durations is not None and len(sentence_durations) == len(sentences):
        normalized_durations = []
        for raw_duration in sentence_durations:
            try:
                parsed = float(raw_duration)
            except (TypeError, ValueError):
                parsed = 0.0  # unparsable entries count as zero-length
            normalized_durations.append(max(parsed, 0.0))

        duration_total = sum(normalized_durations)
        if duration_total > 0:
            if duration_seconds is None or duration_seconds <= 0:
                duration_seconds = duration_total

            # Scale the supplied durations so they exactly fill the audio length.
            scale = 1.0
            if duration_seconds and duration_seconds > 0:
                scale = duration_seconds / duration_total

            cursor = 0.0
            for index, sentence in enumerate(sentences):
                # The last sentence is pinned to the total duration to avoid
                # rounding drift.
                if index == len(sentences) - 1:
                    end = duration_seconds
                else:
                    end = cursor + (normalized_durations[index] * scale)
                end = max(end, cursor)

                entries.append(
                    {
                        "index": index + 1,
                        "text": sentence,
                        "start_seconds": round(cursor, 3),
                        "end_seconds": round(end, 3),
                        "start_ms": int(round(cursor * 1000)),
                        "end_ms": int(round(end * 1000)),
                    }
                )
                cursor = end

            timing_mode = timing_mode_hint or "sentence-audio"

    # Path 2 (fallback): apportion the total duration by heuristic weights.
    if not entries:
        weights = [sentence_weight(sentence) for sentence in sentences]
        total_weight = sum(weights)
        if total_weight <= 0:
            total_weight = float(len(sentences))

        if duration_seconds is None or duration_seconds <= 0:
            # No measurable audio length: estimate one from the weights.
            timing_mode = "estimated"
            duration_seconds = max(total_weight * 0.26, 0.4)
        else:
            timing_mode = "duration-weighted"

        cursor = 0.0
        for index, sentence in enumerate(sentences):
            if index == len(sentences) - 1:
                end = duration_seconds
            else:
                portion = weights[index] / total_weight
                end = cursor + (duration_seconds * portion)
            end = max(end, cursor)

            entries.append(
                {
                    "index": index + 1,
                    "text": sentence,
                    "start_seconds": round(cursor, 3),
                    "end_seconds": round(end, 3),
                    "start_ms": int(round(cursor * 1000)),
                    "end_ms": int(round(end * 1000)),
                }
            )
            cursor = end

    # Force the final entry to end exactly at the audio duration.
    if entries:
        entries[-1]["end_seconds"] = round(duration_seconds, 3)
        entries[-1]["end_ms"] = int(round(duration_seconds * 1000))

    timeline_base = audio_path.with_suffix("")
    timeline_json_path = timeline_base.with_suffix(".timeline.json")
    timeline_srt_path = timeline_base.with_suffix(".srt")

    json_payload = {
        "audio_file": audio_path.name,
        "audio_path": str(audio_path),
        "audio_duration_seconds": round(duration_seconds, 3),
        "timing_mode": timing_mode,
        "generated_at": dt.datetime.now(dt.timezone.utc)
        .isoformat()
        .replace("+00:00", "Z"),
        "sentences": entries,
    }

    timeline_json_path.write_text(
        json.dumps(json_payload, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    # Standard SRT layout: index, time range, text, blank separator line.
    srt_lines = []
    for entry in entries:
        srt_lines.append(str(entry["index"]))
        srt_lines.append(
            "{0} --> {1}".format(
                srt_time(entry["start_seconds"]), srt_time(entry["end_seconds"])
            )
        )
        srt_lines.append(entry["text"])
        srt_lines.append("")

    timeline_srt_path.write_text("\n".join(srt_lines).strip() + "\n", encoding="utf-8")
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def apply_plaintext_prosody_rules(segment):
    """Insert macOS `say` [[slnc N]] pause markers after punctuation and newlines.

    Runs of newlines collapse into a single pause (longer for blank lines);
    repeated spaces/tabs in the result are collapsed to one space.
    """
    pieces = []
    position = 0
    total = len(segment)

    while position < total:
        char = segment[position]

        if char == "\n":
            # Measure the newline run: a blank line gets a longer pause.
            run_end = position
            while run_end < total and segment[run_end] == "\n":
                run_end += 1
            if run_end - position >= 2:
                pieces.append("[[slnc 260]] ")
            else:
                pieces.append("[[slnc 90]] ")
            position = run_end
            continue

        if char in ",、,:;:;":
            pieces.append("{0} [[slnc 120]] ".format(char))
        elif char in "。.":
            pieces.append("{0} [[slnc 180]] ".format(char))
        elif char in "??":
            pieces.append("{0} [[slnc 190]] ".format(char))
        elif char in "!!":
            pieces.append("{0} [[slnc 150]] ".format(char))
        else:
            pieces.append(char)
        position += 1

    return re.sub(r"[ \t]{2,}", " ", "".join(pieces))
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def build_auto_prosody_text(raw_text):
    """Apply prosody rules to plain text while leaving existing [[...]] directives intact."""
    converted = []
    for index, part in enumerate(re.split(r"(\[\[[\s\S]*?\]\])", raw_text)):
        # Odd indexes are the captured [[...]] groups from re.split.
        is_directive = index % 2 == 1 and part.startswith("[[") and part.endswith("]]")
        if is_directive:
            converted.append(part)
        else:
            converted.append(apply_plaintext_prosody_rules(part))
    return "".join(converted)
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def api_text_length_units(raw_text):
    """Text length in API units: CJK ideographs count as 2, all other characters as 1."""
    return sum(2 if "\u4e00" <= char <= "\u9fff" else 1 for char in raw_text)
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def split_oversized_text(raw_text, max_chars, length_func):
    """Greedily chunk *raw_text* so each piece stays within *max_chars* units.

    *length_func* measures one character; non-positive measurements count as 1.
    Pieces that are empty after stripping are dropped.
    """
    pieces = []
    buffer = []
    buffer_units = 0

    for char in raw_text:
        units = length_func(char)
        if units <= 0:
            units = 1  # defensive: never let a character be "free"

        if buffer and buffer_units + units > max_chars:
            # Current buffer is full: emit it and start a new one with this char.
            piece = "".join(buffer).strip()
            if piece:
                pieces.append(piece)
            buffer = [char]
            buffer_units = units
        else:
            buffer.append(char)
            buffer_units += units

    trailing = "".join(buffer).strip()
    if trailing:
        pieces.append(trailing)
    return pieces
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def split_text_into_api_sentence_requests(source_text, max_chars, length_func):
    """Return (sentences, request_items) for per-sentence API synthesis.

    Each request item is ``{"sentence_index": i, "text": part}``; sentences
    longer than *max_chars* units are split into multiple parts.
    """
    if length_func is None:
        length_func = len

    sentences = extract_timeline_sentences(source_text)
    request_items = []

    for sentence_index, sentence in enumerate(sentences):
        if max_chars and length_func(sentence) > max_chars:
            parts = split_oversized_text(sentence, max_chars, length_func) or [sentence]
        else:
            parts = [sentence]
        request_items.extend(
            {"sentence_index": sentence_index, "text": part} for part in parts
        )

    return sentences, request_items
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def split_text_for_tts(source_text, max_chars, length_func=None):
    """Split *source_text* into chunks of at most *max_chars* units for TTS.

    Text is normalized to \\n line endings, split into paragraphs on blank
    lines, then into sentences on terminator punctuation. Sentences are packed
    greedily (joined with a single space); an individual sentence longer than
    the limit is hard-split by split_oversized_text. A falsy *max_chars*
    disables chunking, and empty text yields an empty list.
    """
    if length_func is None:
        length_func = len

    # Normalize CRLF/CR to LF before any splitting.
    text = source_text.replace("\r\n", "\n").replace("\r", "\n").strip()
    if not text:
        return []

    # Short enough (or chunking disabled): single chunk.
    if not max_chars or length_func(text) <= max_chars:
        return [text]

    chunks = []
    current = ""
    paragraphs = [part.strip() for part in re.split(r"\n{2,}", text) if part.strip()]

    for paragraph in paragraphs:
        # Split after sentence terminators, keeping the terminator attached.
        sentences = [
            item.strip()
            for item in re.split(r"(?<=[。!?!?;;.!?])", paragraph)
            if item.strip()
        ]
        if not sentences:
            sentences = [paragraph]

        for sentence in sentences:
            if length_func(sentence) > max_chars:
                # Oversized sentence: flush the running chunk, then hard-split it.
                if current:
                    chunks.append(current.strip())
                    current = ""

                for piece in split_oversized_text(sentence, max_chars, length_func):
                    if piece:
                        chunks.append(piece)
                continue

            if not current:
                current = sentence
                continue

            # Try to pack this sentence onto the running chunk.
            candidate = "{0} {1}".format(current, sentence)
            if length_func(candidate) <= max_chars:
                current = candidate
            else:
                chunks.append(current.strip())
                current = sentence

    if current:
        chunks.append(current.strip())

    return chunks
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def concat_wav_files(part_paths, output_path):
    """Concatenate a sequence of WAV chunk files into one output WAV.

    All chunks must share channel count, sample width, frame rate, and
    compression type; otherwise DocsToVoiceError is raised. The first
    chunk's parameters are used for the output file.
    """
    def signature(reader):
        # The four format fields that must agree across every chunk.
        return (
            reader.getnchannels(),
            reader.getsampwidth(),
            reader.getframerate(),
            reader.getcomptype(),
        )

    with wave.open(str(part_paths[0]), "rb") as head:
        params = head.getparams()
        expected = signature(head)
        payloads = [head.readframes(head.getnframes())]

    for chunk_path in part_paths[1:]:
        with wave.open(str(chunk_path), "rb") as reader:
            if signature(reader) != expected:
                raise DocsToVoiceError("Chunk WAV formats do not match; cannot concatenate.")
            payloads.append(reader.readframes(reader.getnframes()))

    with wave.open(str(output_path), "wb") as writer:
        writer.setparams(params)
        for payload in payloads:
            writer.writeframes(payload)
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def concat_aiff_files(part_paths, output_path):
    """Concatenate AIFF chunk files into one output AIFF via the aifc module.

    Mirrors concat_wav_files: all chunks must share channel count, sample
    width, frame rate, and compression type.

    Raises:
        DocsToVoiceError: if aifc is unavailable or chunk formats differ.
    """
    # NOTE(review): aifc was removed from the stdlib in Python 3.13; the
    # None check implies the import is guarded at the top of the file —
    # verify. Callers fall back to ffmpeg when possible.
    if aifc is None:
        raise DocsToVoiceError("AIFF concatenation requires Python aifc module.")

    # Take the format parameters and frames from the first chunk.
    with aifc.open(str(part_paths[0]), "rb") as first_file:
        params = first_file.getparams()
        frames = [first_file.readframes(first_file.getnframes())]

    # Every remaining chunk must match the first chunk's format exactly.
    for part_path in part_paths[1:]:
        with aifc.open(str(part_path), "rb") as current:
            if (
                current.getnchannels() != params.nchannels
                or current.getsampwidth() != params.sampwidth
                or current.getframerate() != params.framerate
                or current.getcomptype() != params.comptype
            ):
                raise DocsToVoiceError("Chunk AIFF formats do not match; cannot concatenate.")
            frames.append(current.readframes(current.getnframes()))

    # Write all collected frames under the first chunk's parameters.
    with aifc.open(str(output_path), "wb") as output:
        output.setparams(params)
        for frame in frames:
            output.writeframes(frame)
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def concat_with_ffmpeg(part_paths, output_path):
    """Concatenate audio chunks with ffmpeg's concat demuxer.

    Writes a temporary list file ("file '<path>'" per line), runs ffmpeg
    with ``-f concat -safe 0``, and forces a PCM codec for .wav/.aiff
    outputs so the container matches the extension.

    Raises:
        DocsToVoiceError: if ffmpeg is missing or exits non-zero.
    """
    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        raise DocsToVoiceError("ffmpeg is required for concatenating this audio format.")

    # delete=False: the file must survive past this `with` so ffmpeg can
    # read it; it is unlinked in the `finally` below.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".txt", delete=False
    ) as handle:
        list_file = pathlib.Path(handle.name)
        for part_path in part_paths:
            # Escape single quotes for the concat-demuxer quoting rules.
            escaped = str(part_path).replace("'", "'\\''")
            handle.write("file '{0}'\n".format(escaped))

    # Pick an explicit PCM codec for WAV/AIFF so ffmpeg does not guess.
    codec_args = []
    ext = output_path.suffix.lower()
    if ext == ".wav":
        codec_args = ["-c:a", "pcm_s16le"]
    elif ext in {".aiff", ".aif", ".aifc"}:
        codec_args = ["-c:a", "pcm_s16be"]

    command = [
        ffmpeg,
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-f",
        "concat",
        "-safe",
        "0",
        "-i",
        str(list_file),
    ] + codec_args + [str(output_path)]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as exc:
        raise DocsToVoiceError(
            "ffmpeg failed while concatenating chunks (exit {0}).".format(exc.returncode)
        )
    finally:
        # Best-effort cleanup of the temporary list file.
        try:
            list_file.unlink()
        except FileNotFoundError:
            pass
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
def concat_audio_files(part_paths, output_path):
    """Join chunk audio files into output_path, picking the best strategy.

    Single chunk: plain file copy. WAV: stdlib wave concat with ffmpeg
    fallback. AIFF family: ffmpeg when available, otherwise the aifc
    fallback. Anything else: ffmpeg only.

    Raises:
        DocsToVoiceError: when no chunks exist or no strategy succeeds.
    """
    if not part_paths:
        raise DocsToVoiceError("No chunk audio generated for concatenation.")

    # One chunk needs no concatenation at all.
    if len(part_paths) == 1:
        shutil.copyfile(str(part_paths[0]), str(output_path))
        return

    suffix = output_path.suffix.lower()

    if suffix == ".wav":
        # Prefer the dependency-free stdlib path; ffmpeg is the fallback.
        try:
            concat_wav_files(part_paths, output_path)
        except Exception:
            concat_with_ffmpeg(part_paths, output_path)
        return

    if suffix in {".aiff", ".aif", ".aifc"}:
        if shutil.which("ffmpeg"):
            concat_with_ffmpeg(part_paths, output_path)
            return
        try:
            concat_aiff_files(part_paths, output_path)
        except Exception:
            raise DocsToVoiceError(
                "AIFF chunk concatenation failed without ffmpeg. "
                "Install ffmpeg or use --output-name with .wav."
            )
        return

    # Unknown container: only ffmpeg knows how to stitch it.
    concat_with_ffmpeg(part_paths, output_path)
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def build_atempo_filter_chain(speech_rate):
    """Build an ffmpeg audio-filter string realizing speech_rate via atempo.

    ffmpeg's atempo filter only accepts factors in [0.5, 2.0], so rates
    outside that window are decomposed into a chain of 0.5x / 2.0x stages
    plus one residual stage whose product equals speech_rate.
    """
    def render(factor):
        # Trim trailing zeros but keep a decimal point (e.g. "2" -> "2.0").
        token = "{0:.6f}".format(factor).rstrip("0").rstrip(".")
        return "atempo={0}".format(token if "." in token else token + ".0")

    rate = float(speech_rate)
    stages = []
    while rate < 0.5:
        stages.append(0.5)
        rate /= 0.5
    while rate > 2.0:
        stages.append(2.0)
        rate /= 2.0
    stages.append(rate)

    return ",".join(render(stage) for stage in stages)
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def apply_speech_rate_to_audio(output_path, speech_rate):
    """Re-render output_path at speech_rate using ffmpeg's atempo filter.

    No-op when speech_rate is None or effectively 1.0. The result is
    written to a sibling temp file and atomically moved over output_path.

    Raises:
        DocsToVoiceError: if ffmpeg is missing, fails, or produces an
        empty file.
    """
    # Skip the whole pipeline when no audible change would result.
    if speech_rate is None or abs(speech_rate - 1.0) < 1e-9:
        return

    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        raise DocsToVoiceError("--speech-rate requires ffmpeg to be installed.")

    # Create the temp file in the same directory so the final replace()
    # stays on one filesystem (atomic rename). delete=False because ffmpeg
    # writes to it after this `with` closes the handle.
    with tempfile.NamedTemporaryFile(
        suffix=output_path.suffix,
        prefix="docs-to-voice-rate-",
        dir=str(output_path.parent),
        delete=False,
    ) as handle:
        temp_output_path = pathlib.Path(handle.name)

    command = [
        ffmpeg,
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        str(output_path),
        "-filter:a",
        build_atempo_filter_chain(speech_rate),
        str(temp_output_path),
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as exc:
        # Clean up the partial temp file before surfacing the failure.
        if temp_output_path.exists():
            temp_output_path.unlink()
        raise DocsToVoiceError(
            "ffmpeg failed while applying --speech-rate (exit {0}).".format(
                exc.returncode
            )
        )

    # Guard against ffmpeg "succeeding" but emitting nothing usable.
    if not temp_output_path.is_file() or temp_output_path.stat().st_size == 0:
        if temp_output_path.exists():
            temp_output_path.unlink()
        raise DocsToVoiceError("Failed to apply --speech-rate to output audio.")

    # Atomic swap: the original is only replaced once the new file is good.
    temp_output_path.replace(output_path)
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def parse_api_error_message(raw_payload):
    """Extract a short human-readable error from an API error payload.

    Tries to parse raw_payload as JSON and combine its "code" and
    "message" fields; falls back to the first 400 characters of the raw
    payload when the body is not JSON or carries neither field.
    """
    try:
        data = json.loads(raw_payload)
    except Exception:
        return raw_payload[:400]

    # Fix: json.loads accepts non-object documents ("null", "[1]", "5");
    # calling .get on those raised AttributeError in the original.
    if not isinstance(data, dict):
        return raw_payload[:400]

    message = data.get("message")
    code = data.get("code")
    if message and code:
        return "{0}: {1}".format(code, message)
    if message:
        return message
    if code:
        return code
    return raw_payload[:400]
|
|
805
|
+
|
|
806
|
+
|
|
807
|
+
def parse_positive_int(raw_value):
    """Best-effort conversion of raw_value to a positive int.

    Accepts ints, floats (truncated), and digit strings (commas stripped,
    e.g. "2,000"). Returns None for None, bools, non-positive values,
    non-numeric strings, and non-finite floats.
    """
    if raw_value is None:
        return None
    # bool is a subclass of int; True must not parse as 1.
    if isinstance(raw_value, bool):
        return None
    if isinstance(raw_value, (int, float)):
        # Fix: int(float("nan")) / int(float("inf")) raise instead of
        # returning the documented None — guard non-finite floats.
        if isinstance(raw_value, float) and not math.isfinite(raw_value):
            return None
        parsed = int(raw_value)
        if parsed > 0:
            return parsed
        return None

    # Strings: drop thousands separators, then require pure digits
    # (rejects signs, decimals, and embedded whitespace).
    value = str(raw_value).strip().replace(",", "")
    if not value.isdigit():
        return None

    parsed = int(value)
    if parsed <= 0:
        return None
    return parsed
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
def extract_max_chars_from_text(raw_text):
    """Scan raw_text (typically an API error) for a character-limit figure.

    Tries several English and Chinese phrasings of "maximum N characters"
    and returns the first positive integer found, or None.
    """
    if not raw_text:
        return None

    patterns = (
        r"range of input length should be \[\s*\d+\s*,\s*([\d,]+)\s*\]",
        r"max(?:imum)?\s*(?:input\s*)?(?:text\s*)?(?:length|characters?|chars?)\s*(?:is|:|=)\s*([\d,]+)",
        r"(?:cannot exceed|must be less than or equal to|must be <=?|up to)\s*([\d,]+)\s*(?:characters?|chars?)",
        r"(?:不超過|不能超過|上限為|上限为)\s*([\d,]+)\s*(?:個?字元|個?字符|字元|字符)",
    )
    for pattern in patterns:
        found = re.search(pattern, raw_text, flags=re.IGNORECASE)
        if found is None:
            continue
        limit = parse_positive_int(found.group(1))
        if limit:
            return limit

    return None
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
def fetch_json_payload(url, headers, timeout=30):
    """GET ``url`` with ``headers`` and return the decoded JSON body.

    Network and JSON errors propagate to the caller (callers here wrap
    the call in a broad try/except).
    """
    request = urllib.request.Request(url, method="GET", headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        # errors="replace" keeps decoding total even for malformed bytes.
        payload = response.read().decode("utf-8", errors="replace")
        return json.loads(payload)
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
def extract_model_entry_max_chars(model_entry):
    """Pull a max-input-character limit out of a model catalog entry.

    Checks a fixed sequence of keys on the entry itself, then its
    "model_info" and "inference_metadata" sub-dicts; falls back to mining
    the free-text description. Returns a positive int or None.
    """
    model_info = model_entry.get("model_info") or {}
    inference_metadata = model_entry.get("inference_metadata") or {}

    # (container, key) pairs in priority order — must match the original
    # candidate ordering exactly.
    lookup_plan = (
        (model_entry, "max_input_chars"),
        (model_entry, "max_input_characters"),
        (model_entry, "max_input_length"),
        (model_entry, "max_text_length"),
        (model_info, "max_input_chars"),
        (model_info, "max_input_characters"),
        (model_info, "max_input_length"),
        (model_info, "max_text_length"),
        (model_info, "max_input_tokens"),
        (inference_metadata, "max_input_chars"),
        (inference_metadata, "max_input_length"),
    )
    for container, key in lookup_plan:
        limit = parse_positive_int(container.get(key))
        if limit:
            return limit

    return extract_max_chars_from_text(model_entry.get("description", ""))
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def fetch_api_model_max_chars(api_endpoint, api_key, model):
    """Look up ``model`` in the provider's model catalog and return its
    max-input-character limit, or None when it cannot be determined.

    Pages through GET <base>/api/v1/models until the model is found or
    pagination is exhausted. Any network/parse failure returns None —
    discovery is strictly best-effort.
    """
    parsed = urllib.parse.urlparse(api_endpoint)
    if not parsed.scheme or not parsed.netloc:
        return None

    base_url = "{0}://{1}".format(parsed.scheme, parsed.netloc)
    headers = {"Authorization": "Bearer {0}".format(api_key)}

    page_no = 1
    page_size = 100
    while True:
        query = urllib.parse.urlencode({"page_no": page_no, "page_size": page_size})
        url = "{0}/api/v1/models?{1}".format(base_url, query)

        try:
            payload = fetch_json_payload(url, headers=headers)
        except Exception:
            # Best-effort: any failure just means "limit unknown".
            return None

        output = payload.get("output") or {}
        models = output.get("models") or []
        for model_entry in models:
            if (model_entry.get("model") or "").strip() != model:
                continue
            # Found our model; delegate limit extraction to the helper.
            return extract_model_entry_max_chars(model_entry)

        # Stop when the reported total is missing/zero or we have paged
        # past it; otherwise fetch the next page.
        total = parse_positive_int(output.get("total")) or 0
        if total <= 0:
            return None
        if page_no * page_size >= total:
            return None
        page_no += 1
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def probe_api_max_chars(api_endpoint, api_key, model, voice):
    """Discover the API's input limit by deliberately overshooting it.

    Sends a synthesis request of DEFAULT_API_MAX_CHARS_PROBE_LENGTH CJK
    characters and mines the resulting error message for the advertised
    limit. Returns None when the request unexpectedly succeeds (no error
    to mine) — note this spends one real TTS call in that case.
    """
    probe_text = "測" * DEFAULT_API_MAX_CHARS_PROBE_LENGTH

    try:
        request_model_studio_audio(
            api_endpoint=api_endpoint,
            api_key=api_key,
            model=model,
            voice=voice,
            text=probe_text,
        )
    except DocsToVoiceError as exc:
        # The rejection message usually states the real limit.
        return extract_max_chars_from_text(str(exc))

    return None
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
def discover_api_max_chars(api_endpoint, api_key, model, voice):
    """Determine the API's max input characters for ``model``.

    Prefers the cheap model-catalog lookup; falls back to the oversized
    probe request when the catalog yields nothing. Returns a positive int
    or None.
    """
    from_catalog = fetch_api_model_max_chars(
        api_endpoint=api_endpoint,
        api_key=api_key,
        model=model,
    )
    if from_catalog:
        return from_catalog

    return probe_api_max_chars(
        api_endpoint=api_endpoint,
        api_key=api_key,
        model=model,
        voice=voice,
    )
|
|
946
|
+
|
|
947
|
+
|
|
948
|
+
def request_model_studio_audio(api_endpoint, api_key, model, voice, text):
    """POST a synthesis request to the Model Studio TTS endpoint.

    Returns:
        dict with "audio_url", "audio_data" (base64), and "audio_format";
        at least one of url/data is guaranteed non-empty.

    Raises:
        DocsToVoiceError: on HTTP errors (with the parsed API message),
        network failures, non-JSON responses, or responses missing audio.
    """
    payload = {
        "model": model,
        "input": {
            "text": text,
            "voice": voice,
        },
    }

    # ensure_ascii=False keeps CJK text readable and compact on the wire.
    data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    request = urllib.request.Request(
        api_endpoint,
        method="POST",
        data=data,
        headers={
            "Authorization": "Bearer {0}".format(api_key),
            "Content-Type": "application/json",
        },
    )

    try:
        # Long timeout: synthesis of a large chunk can take a while.
        with urllib.request.urlopen(request, timeout=300) as response:
            raw_payload = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # HTTP errors carry a JSON body with code/message — surface it.
        raw_payload = exc.read().decode("utf-8", errors="replace")
        detail = parse_api_error_message(raw_payload)
        raise DocsToVoiceError(
            "Model Studio TTS request failed (HTTP {0}): {1}".format(exc.code, detail)
        )
    except urllib.error.URLError as exc:
        reason = getattr(exc, "reason", exc)
        raise DocsToVoiceError(
            "Model Studio TTS request failed: {0}".format(reason)
        )
    except http.client.HTTPException as exc:
        raise DocsToVoiceError(
            "Model Studio TTS request failed: {0}".format(exc)
        )

    try:
        response_json = json.loads(raw_payload)
    except json.JSONDecodeError:
        raise DocsToVoiceError("API response is not valid JSON.")

    # `or {}` / `or ""` guards against explicit nulls in the response.
    output = response_json.get("output") or {}
    audio = output.get("audio") or {}
    audio_url = audio.get("url") or ""
    audio_data = audio.get("data") or ""
    audio_format = audio.get("format") or audio.get("mime_type") or ""

    if not audio_url and not audio_data:
        raise DocsToVoiceError(
            "API response does not contain output.audio.url or output.audio.data"
        )

    return {
        "audio_url": audio_url,
        "audio_data": audio_data,
        "audio_format": audio_format,
    }
|
|
1008
|
+
|
|
1009
|
+
|
|
1010
|
+
def download_binary(url, output_path):
    """Download ``url`` and write the raw bytes to ``output_path``.

    Raises:
        DocsToVoiceError: wrapping any network-level failure.
    """
    try:
        with urllib.request.urlopen(url, timeout=300) as response:
            payload = response.read()
    except urllib.error.URLError as exc:
        reason = getattr(exc, "reason", exc)
        raise DocsToVoiceError("Failed to download audio URL: {0}".format(reason))
    except http.client.HTTPException as exc:
        raise DocsToVoiceError("Failed to download audio URL: {0}".format(exc))

    # Only write after a fully successful read, so output_path is never
    # left holding a truncated download.
    output_path.write_bytes(payload)
|
|
1021
|
+
|
|
1022
|
+
|
|
1023
|
+
def write_base64_audio(raw_base64_data, output_path):
    """Decode a base64 audio payload and write it to ``output_path``.

    Raises:
        DocsToVoiceError: when the payload is not valid base64.
    """
    try:
        # validate=True rejects payloads with non-alphabet characters
        # instead of silently discarding them.
        decoded = base64.b64decode(raw_base64_data, validate=True)
    except Exception:
        raise DocsToVoiceError("API returned invalid output.audio.data payload.")

    output_path.write_bytes(decoded)
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
def ensure_output_not_exists(output_path, force):
    """Refuse to overwrite an existing output unless ``force`` is set.

    Raises:
        DocsToVoiceError: when output_path exists and force is falsy.
    """
    if force:
        return
    if not output_path.exists():
        return
    raise DocsToVoiceError(
        "Output already exists: {0} (use --force to overwrite)".format(output_path)
    )
|
|
1037
|
+
|
|
1038
|
+
|
|
1039
|
+
def run_say_mode(output_path, text, voice, rate):
    """Synthesize ``text`` to ``output_path`` with the macOS ``say`` command.

    The text is passed via a temp file (-f) to avoid command-line length
    limits and quoting issues.

    Args:
        output_path: Destination audio path (say infers format from it).
        text: Text to speak.
        voice: Optional say voice name (-v).
        rate: Optional words-per-minute rate (-r).

    Raises:
        DocsToVoiceError: when say is missing or exits non-zero.
    """
    ensure_command("say", "macOS 'say' command not found.")

    # delete=False: say reads the file after this `with` closes it; it is
    # unlinked in the `finally` below.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".txt", delete=False
    ) as handle:
        handle.write(text)
        temp_text_path = pathlib.Path(handle.name)

    command = ["say", "-o", str(output_path)]
    if voice:
        command.extend(["-v", voice])
    if rate is not None:
        command.extend(["-r", str(rate)])
    command.extend(["-f", str(temp_text_path)])

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as exc:
        raise DocsToVoiceError("say mode failed with exit code {0}".format(exc.returncode))
    finally:
        # Always clean up the temp text file, success or failure.
        try:
            temp_text_path.unlink()
        except FileNotFoundError:
            pass
|
|
1064
|
+
|
|
1065
|
+
|
|
1066
|
+
def choose_output_name(base_name, has_extension, mode, api_result):
    """Resolve the final output filename.

    User-supplied extensions win; otherwise say mode gets .aiff and api
    mode takes the extension implied by the API result.
    """
    if has_extension:
        return base_name
    if mode == "say":
        return "{0}.aiff".format(base_name)
    return "{0}.{1}".format(base_name, determine_api_extension(api_result))
|
|
1075
|
+
|
|
1076
|
+
|
|
1077
|
+
def validate_rate(raw_rate):
    """Validate the --rate flag (say words-per-minute).

    Returns None when unset, the int value when positive, and raises
    DocsToVoiceError otherwise.
    """
    if raw_rate is None:
        return None
    if raw_rate.isdigit():
        parsed = int(raw_rate)
        if parsed > 0:
            return parsed
    raise DocsToVoiceError("--rate must be a positive integer.")
|
|
1083
|
+
|
|
1084
|
+
|
|
1085
|
+
def validate_speech_rate(raw_value):
    """Validate the --speech-rate flag as a positive finite float.

    Returns None for unset/blank values; raises DocsToVoiceError for
    non-numeric, non-finite, or non-positive input.
    """
    if raw_value is None:
        return None

    text = str(raw_value).strip()
    if not text:
        return None

    try:
        rate = float(text)
    except ValueError:
        raise DocsToVoiceError("--speech-rate must be a positive number.")

    if math.isfinite(rate) and rate > 0:
        return rate
    raise DocsToVoiceError("--speech-rate must be a positive number.")
|
|
1101
|
+
|
|
1102
|
+
|
|
1103
|
+
def scale_sentence_durations(sentence_durations, speech_rate):
    """Rescale per-sentence durations after a speech-rate change.

    Faster speech shortens each duration by the rate factor; negative
    inputs clamp to 0. The list is returned unchanged when empty/None or
    when the rate is effectively 1.0.
    """
    unchanged = (
        not sentence_durations
        or speech_rate is None
        or abs(speech_rate - 1.0) < 1e-9
    )
    if unchanged:
        return sentence_durations

    scaled = []
    for duration in sentence_durations:
        scaled.append(max(float(duration), 0.0) / speech_rate)
    return scaled
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
def validate_max_chars(raw_value):
    """Validate the --max-chars setting.

    Returns None for unset/blank/zero values (zero means "disabled" — see
    is_max_chars_disabled), the positive int otherwise; raises
    DocsToVoiceError for non-digit input.
    """
    if raw_value is None:
        return None

    text = str(raw_value).strip()
    if not text:
        return None
    if not text.isdigit():
        raise DocsToVoiceError("--max-chars must be a non-negative integer.")

    limit = int(text)
    return limit if limit > 0 else None
|
|
1123
|
+
|
|
1124
|
+
|
|
1125
|
+
def is_max_chars_disabled(raw_value):
    """Return True when the user explicitly disabled chunking with "0"."""
    return raw_value is not None and str(raw_value).strip() == "0"
|
|
1129
|
+
|
|
1130
|
+
|
|
1131
|
+
def main(argv=None):
    """CLI entry point: convert document text to a single audio file.

    Resolves configuration (CLI > env file), splits the input text, runs
    either macOS ``say`` or the Model Studio API per chunk, concatenates
    the chunks, optionally rescales playback speed, writes sentence
    timeline files, and prints the final audio path.

    Returns:
        0 on success; configuration/IO failures raise DocsToVoiceError.
    """
    args = parse_args(argv or sys.argv[1:])

    script_dir = pathlib.Path(__file__).resolve().parent
    skill_dir = script_dir.parent

    # --- Resolve the .env file: explicit flag (relative to CWD) or the
    # skill directory's default .env.
    env_file = args.env_file
    if env_file:
        env_path = pathlib.Path(env_file).expanduser()
        if not env_path.is_absolute():
            env_path = (pathlib.Path.cwd() / env_path).resolve()
    else:
        env_path = skill_dir / ".env"

    env_values = load_env_file(env_path)

    # --- Settings: CLI flag wins, then env var, then default.
    mode = normalize_mode(resolve_setting(args.mode, "DOCS_TO_VOICE_MODE", env_values, "say"))
    say_voice = resolve_setting(args.voice, "DOCS_TO_VOICE_VOICE", env_values)
    api_endpoint = resolve_setting(
        args.api_endpoint,
        "DOCS_TO_VOICE_API_ENDPOINT",
        env_values,
        DEFAULT_API_ENDPOINT,
    )
    api_model = resolve_setting(
        args.api_model,
        "DOCS_TO_VOICE_API_MODEL",
        env_values,
        DEFAULT_API_MODEL,
    )
    api_voice = resolve_setting(
        args.api_voice,
        "DOCS_TO_VOICE_API_VOICE",
        env_values,
        DEFAULT_API_VOICE,
    )
    api_key = resolve_setting(None, "DASHSCOPE_API_KEY", env_values)
    raw_max_chars = resolve_setting(args.max_chars, "DOCS_TO_VOICE_MAX_CHARS", env_values)
    raw_speech_rate = resolve_setting(
        args.speech_rate,
        "DOCS_TO_VOICE_SPEECH_RATE",
        env_values,
    )
    max_chars = validate_max_chars(raw_max_chars)
    # "0" means the user explicitly disabled chunk splitting.
    max_chars_disabled = is_max_chars_disabled(raw_max_chars)

    rate = validate_rate(args.rate)
    speech_rate = validate_speech_rate(raw_speech_rate)

    if mode == "api" and not api_key:
        raise DocsToVoiceError("DASHSCOPE_API_KEY is required for api mode.")

    source_text = read_input_text(args)
    if not source_text.strip():
        raise DocsToVoiceError("No text content found for conversion.")

    # --- Auto-discover the API character limit when none was configured.
    if mode == "api" and max_chars is None and not max_chars_disabled:
        discovered_max_chars = discover_api_max_chars(
            api_endpoint=api_endpoint,
            api_key=api_key,
            model=api_model,
            voice=api_voice,
        )
        if discovered_max_chars:
            max_chars = discovered_max_chars

    # Sentence-level durations for the timeline files; None means the
    # timeline writer falls back to its own estimation.
    timeline_sentence_durations = None
    timeline_timing_mode_hint = None

    # --- Split input: api mode keeps per-sentence request bookkeeping so
    # chunk durations can be mapped back to sentences; say mode only
    # needs plain chunks.
    if mode == "api":
        api_sentences, api_request_items = split_text_into_api_sentence_requests(
            source_text=source_text,
            max_chars=max_chars,
            length_func=api_text_length_units,
        )
        if not api_request_items:
            raise DocsToVoiceError("No text content found for conversion.")
    else:
        text_chunks = split_text_for_tts(
            source_text,
            max_chars,
            length_func=None,
        )
        if not text_chunks:
            raise DocsToVoiceError("No text content found for conversion.")

    project_dir = normalize_project_dir(args.project_dir)
    project_name = args.project_name or project_dir.name
    if not project_name:
        raise DocsToVoiceError("Unable to determine project name.")

    output_dir = project_dir / "audio" / project_name
    output_dir.mkdir(parents=True, exist_ok=True)

    # Default output name is timestamped to avoid collisions.
    output_name = args.output_name or "voice-{0}".format(dt.datetime.now().strftime("%Y%m%d-%H%M%S"))
    output_name_has_extension = "." in output_name

    if mode == "say":
        # Optionally inject prosody markers unless explicitly disabled.
        if not args.no_auto_prosody:
            request_chunks = [build_auto_prosody_text(chunk) for chunk in text_chunks]
        else:
            request_chunks = text_chunks

        final_output_name = choose_output_name(
            output_name,
            output_name_has_extension,
            mode,
            api_result={},
        )
        output_path = output_dir / final_output_name
        ensure_output_not_exists(output_path, args.force)

        if len(request_chunks) == 1:
            run_say_mode(output_path, request_chunks[0], say_voice, rate)
        else:
            # Multiple chunks: synthesize each to a temp part, then join.
            with tempfile.TemporaryDirectory(prefix="docs-to-voice-say-") as temp_dir:
                temp_dir_path = pathlib.Path(temp_dir)
                part_ext = output_path.suffix or ".aiff"
                part_paths = []

                for index, chunk_text in enumerate(request_chunks, start=1):
                    part_path = temp_dir_path / "part-{0:04d}{1}".format(index, part_ext)
                    run_say_mode(part_path, chunk_text, say_voice, rate)
                    part_paths.append(part_path)

                concat_audio_files(part_paths, output_path)
    else:
        # --- API mode: request every chunk, tracking per-sentence audio
        # durations for the timeline output.
        with tempfile.TemporaryDirectory(prefix="docs-to-voice-api-") as temp_dir:
            temp_dir_path = pathlib.Path(temp_dir)
            part_paths = []
            part_ext = ""
            sentence_durations = [0.0 for _ in api_sentences]
            sentence_duration_known = [True for _ in api_sentences]

            for index, request_item in enumerate(api_request_items, start=1):
                chunk_text = request_item["text"]
                api_result = request_model_studio_audio(
                    api_endpoint=api_endpoint,
                    api_key=api_key,
                    model=api_model,
                    voice=api_voice,
                    text=chunk_text,
                )

                # All chunks must come back in the same audio format.
                current_ext = determine_api_extension(api_result)
                if not part_ext:
                    part_ext = current_ext
                elif current_ext != part_ext:
                    raise DocsToVoiceError(
                        "API returned inconsistent chunk formats ({0} vs {1}).".format(
                            part_ext, current_ext
                        )
                    )

                part_path = temp_dir_path / "part-{0:04d}.{1}".format(index, part_ext)
                # Prefer the hosted URL; fall back to inline base64 data.
                if api_result.get("audio_url"):
                    download_binary(api_result["audio_url"], part_path)
                else:
                    write_base64_audio(api_result.get("audio_data", ""), part_path)

                if not part_path.is_file() or part_path.stat().st_size == 0:
                    raise DocsToVoiceError("Failed to generate audio chunk {0}.".format(index))
                part_paths.append(part_path)

                # Accumulate the chunk's duration onto its source sentence;
                # mark the sentence unknown when measurement fails.
                sentence_index = request_item["sentence_index"]
                part_duration = read_duration_seconds(part_path)
                if part_duration is None or part_duration <= 0:
                    sentence_duration_known[sentence_index] = False
                else:
                    sentence_durations[sentence_index] += part_duration

            final_output_name = output_name
            if not output_name_has_extension:
                final_output_name = "{0}.{1}".format(output_name, part_ext or "wav")

            output_path = output_dir / final_output_name
            ensure_output_not_exists(output_path, args.force)

            # A user-forced extension must match the actual chunk format
            # when concatenation is required.
            requested_ext = output_path.suffix.lower().lstrip(".")
            if len(part_paths) > 1 and requested_ext and requested_ext != part_ext:
                raise DocsToVoiceError(
                    "Output extension .{0} does not match chunk audio format .{1}.".format(
                        requested_ext, part_ext
                    )
                )

            concat_audio_files(part_paths, output_path)

            unknown_sentence_indexes = [
                index
                for index, is_known in enumerate(sentence_duration_known)
                if not is_known
            ]

            if not unknown_sentence_indexes and sum(sentence_durations) > 0:
                # Every sentence was measured directly from chunk audio.
                timeline_sentence_durations = sentence_durations
                timeline_timing_mode_hint = "sentence-audio"
            elif unknown_sentence_indexes:
                # Distribute the unaccounted remainder of the final audio
                # across unmeasured sentences, weighted by text length.
                output_duration_seconds = read_duration_seconds(output_path)
                known_total = sum(
                    value
                    for index, value in enumerate(sentence_durations)
                    if sentence_duration_known[index]
                )
                remaining_duration = None
                if (
                    output_duration_seconds is not None
                    and output_duration_seconds > known_total
                ):
                    remaining_duration = output_duration_seconds - known_total

                if remaining_duration and remaining_duration > 0:
                    unknown_weights = [
                        sentence_weight(api_sentences[index])
                        for index in unknown_sentence_indexes
                    ]
                    total_unknown_weight = sum(unknown_weights)
                    if total_unknown_weight > 0:
                        for weight_index, sentence_index in enumerate(
                            unknown_sentence_indexes
                        ):
                            sentence_durations[sentence_index] += (
                                remaining_duration
                                * (unknown_weights[weight_index] / total_unknown_weight)
                            )

                timeline_sentence_durations = sentence_durations
                timeline_timing_mode_hint = "sentence-audio-mixed"

    if not output_path.is_file() or output_path.stat().st_size == 0:
        raise DocsToVoiceError("Failed to generate audio file.")

    # Speed change also compresses/stretches the sentence timeline.
    if speech_rate is not None and abs(speech_rate - 1.0) > 1e-9:
        apply_speech_rate_to_audio(output_path, speech_rate)
        timeline_sentence_durations = scale_sentence_durations(
            timeline_sentence_durations,
            speech_rate,
        )

    write_sentence_timeline_files(
        source_text=source_text,
        audio_path=output_path,
        sentence_durations=timeline_sentence_durations,
        timing_mode_hint=timeline_timing_mode_hint,
    )
    print(str(output_path))
    return 0
|
|
1378
|
+
|
|
1379
|
+
|
|
1380
|
+
if __name__ == "__main__":
    # CLI wrapper: report expected tool errors cleanly on stderr with
    # exit code 1; let unexpected exceptions traceback normally.
    try:
        raise SystemExit(main())
    except DocsToVoiceError as exc:
        print("[ERROR] {0}".format(exc), file=sys.stderr)
        raise SystemExit(1)
|