@laitszkin/apollo-toolkit 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. package/AGENTS.md +62 -0
  2. package/CHANGELOG.md +100 -0
  3. package/LICENSE +21 -0
  4. package/README.md +144 -0
  5. package/align-project-documents/SKILL.md +94 -0
  6. package/align-project-documents/agents/openai.yaml +4 -0
  7. package/analyse-app-logs/LICENSE +21 -0
  8. package/analyse-app-logs/README.md +126 -0
  9. package/analyse-app-logs/SKILL.md +121 -0
  10. package/analyse-app-logs/agents/openai.yaml +4 -0
  11. package/analyse-app-logs/references/investigation-checklist.md +58 -0
  12. package/analyse-app-logs/references/log-signal-patterns.md +52 -0
  13. package/answering-questions-with-research/SKILL.md +46 -0
  14. package/answering-questions-with-research/agents/openai.yaml +4 -0
  15. package/bin/apollo-toolkit.js +7 -0
  16. package/commit-and-push/LICENSE +21 -0
  17. package/commit-and-push/README.md +26 -0
  18. package/commit-and-push/SKILL.md +70 -0
  19. package/commit-and-push/agents/openai.yaml +4 -0
  20. package/commit-and-push/references/branch-naming.md +15 -0
  21. package/commit-and-push/references/commit-messages.md +19 -0
  22. package/deep-research-topics/LICENSE +21 -0
  23. package/deep-research-topics/README.md +43 -0
  24. package/deep-research-topics/SKILL.md +84 -0
  25. package/deep-research-topics/agents/openai.yaml +4 -0
  26. package/develop-new-features/LICENSE +21 -0
  27. package/develop-new-features/README.md +52 -0
  28. package/develop-new-features/SKILL.md +105 -0
  29. package/develop-new-features/agents/openai.yaml +4 -0
  30. package/develop-new-features/references/testing-e2e.md +35 -0
  31. package/develop-new-features/references/testing-integration.md +42 -0
  32. package/develop-new-features/references/testing-property-based.md +44 -0
  33. package/develop-new-features/references/testing-unit.md +37 -0
  34. package/discover-edge-cases/CHANGELOG.md +19 -0
  35. package/discover-edge-cases/LICENSE +21 -0
  36. package/discover-edge-cases/README.md +87 -0
  37. package/discover-edge-cases/SKILL.md +124 -0
  38. package/discover-edge-cases/agents/openai.yaml +4 -0
  39. package/discover-edge-cases/references/architecture-edge-cases.md +41 -0
  40. package/discover-edge-cases/references/code-edge-cases.md +46 -0
  41. package/docs-to-voice/.env.example +106 -0
  42. package/docs-to-voice/CHANGELOG.md +71 -0
  43. package/docs-to-voice/LICENSE +21 -0
  44. package/docs-to-voice/README.md +118 -0
  45. package/docs-to-voice/SKILL.md +107 -0
  46. package/docs-to-voice/agents/openai.yaml +4 -0
  47. package/docs-to-voice/scripts/docs_to_voice.py +1385 -0
  48. package/docs-to-voice/scripts/docs_to_voice.sh +11 -0
  49. package/docs-to-voice/tests/test_docs_to_voice_api_max_chars.py +210 -0
  50. package/docs-to-voice/tests/test_docs_to_voice_sentence_timeline.py +115 -0
  51. package/docs-to-voice/tests/test_docs_to_voice_settings.py +43 -0
  52. package/docs-to-voice/tests/test_docs_to_voice_speech_rate.py +57 -0
  53. package/enhance-existing-features/CHANGELOG.md +35 -0
  54. package/enhance-existing-features/LICENSE +21 -0
  55. package/enhance-existing-features/README.md +54 -0
  56. package/enhance-existing-features/SKILL.md +120 -0
  57. package/enhance-existing-features/agents/openai.yaml +4 -0
  58. package/enhance-existing-features/references/e2e-tests.md +25 -0
  59. package/enhance-existing-features/references/integration-tests.md +30 -0
  60. package/enhance-existing-features/references/property-based-tests.md +33 -0
  61. package/enhance-existing-features/references/unit-tests.md +29 -0
  62. package/feature-propose/LICENSE +21 -0
  63. package/feature-propose/README.md +23 -0
  64. package/feature-propose/SKILL.md +107 -0
  65. package/feature-propose/agents/openai.yaml +4 -0
  66. package/feature-propose/references/enhancement-features.md +25 -0
  67. package/feature-propose/references/important-features.md +25 -0
  68. package/feature-propose/references/mvp-features.md +25 -0
  69. package/feature-propose/references/performance-features.md +25 -0
  70. package/financial-research/SKILL.md +208 -0
  71. package/financial-research/agents/openai.yaml +4 -0
  72. package/financial-research/assets/weekly_market_report_template.md +45 -0
  73. package/fix-github-issues/SKILL.md +98 -0
  74. package/fix-github-issues/agents/openai.yaml +4 -0
  75. package/fix-github-issues/scripts/list_issues.py +148 -0
  76. package/fix-github-issues/tests/test_list_issues.py +127 -0
  77. package/generate-spec/LICENSE +21 -0
  78. package/generate-spec/README.md +61 -0
  79. package/generate-spec/SKILL.md +96 -0
  80. package/generate-spec/agents/openai.yaml +4 -0
  81. package/generate-spec/references/templates/checklist.md +78 -0
  82. package/generate-spec/references/templates/spec.md +55 -0
  83. package/generate-spec/references/templates/tasks.md +35 -0
  84. package/generate-spec/scripts/create-specs +123 -0
  85. package/harden-app-security/CHANGELOG.md +27 -0
  86. package/harden-app-security/LICENSE +21 -0
  87. package/harden-app-security/README.md +46 -0
  88. package/harden-app-security/SKILL.md +127 -0
  89. package/harden-app-security/agents/openai.yaml +4 -0
  90. package/harden-app-security/references/agent-attack-catalog.md +117 -0
  91. package/harden-app-security/references/common-software-attack-catalog.md +168 -0
  92. package/harden-app-security/references/red-team-extreme-scenarios.md +81 -0
  93. package/harden-app-security/references/risk-checklist.md +78 -0
  94. package/harden-app-security/references/security-test-patterns-agent.md +101 -0
  95. package/harden-app-security/references/security-test-patterns-finance.md +88 -0
  96. package/harden-app-security/references/test-snippets.md +73 -0
  97. package/improve-observability/SKILL.md +114 -0
  98. package/improve-observability/agents/openai.yaml +4 -0
  99. package/learn-skill-from-conversations/CHANGELOG.md +15 -0
  100. package/learn-skill-from-conversations/LICENSE +22 -0
  101. package/learn-skill-from-conversations/README.md +47 -0
  102. package/learn-skill-from-conversations/SKILL.md +85 -0
  103. package/learn-skill-from-conversations/agents/openai.yaml +4 -0
  104. package/learn-skill-from-conversations/scripts/extract_recent_conversations.py +369 -0
  105. package/learn-skill-from-conversations/tests/test_extract_recent_conversations.py +176 -0
  106. package/learning-error-book/SKILL.md +112 -0
  107. package/learning-error-book/agents/openai.yaml +4 -0
  108. package/learning-error-book/assets/error_book_template.md +66 -0
  109. package/learning-error-book/scripts/render_markdown_to_pdf.py +367 -0
  110. package/lib/cli.js +338 -0
  111. package/lib/installer.js +225 -0
  112. package/maintain-project-constraints/SKILL.md +109 -0
  113. package/maintain-project-constraints/agents/openai.yaml +4 -0
  114. package/maintain-skill-catalog/README.md +18 -0
  115. package/maintain-skill-catalog/SKILL.md +66 -0
  116. package/maintain-skill-catalog/agents/openai.yaml +4 -0
  117. package/novel-to-short-video/CHANGELOG.md +53 -0
  118. package/novel-to-short-video/LICENSE +21 -0
  119. package/novel-to-short-video/README.md +63 -0
  120. package/novel-to-short-video/SKILL.md +233 -0
  121. package/novel-to-short-video/agents/openai.yaml +4 -0
  122. package/novel-to-short-video/references/plan-template.md +71 -0
  123. package/novel-to-short-video/references/roles-json.md +41 -0
  124. package/open-github-issue/LICENSE +21 -0
  125. package/open-github-issue/README.md +97 -0
  126. package/open-github-issue/SKILL.md +119 -0
  127. package/open-github-issue/agents/openai.yaml +4 -0
  128. package/open-github-issue/scripts/open_github_issue.py +380 -0
  129. package/open-github-issue/tests/test_open_github_issue.py +159 -0
  130. package/open-source-pr-workflow/CHANGELOG.md +32 -0
  131. package/open-source-pr-workflow/LICENSE +21 -0
  132. package/open-source-pr-workflow/README.md +23 -0
  133. package/open-source-pr-workflow/SKILL.md +123 -0
  134. package/open-source-pr-workflow/agents/openai.yaml +4 -0
  135. package/openai-text-to-image-storyboard/.env.example +10 -0
  136. package/openai-text-to-image-storyboard/CHANGELOG.md +49 -0
  137. package/openai-text-to-image-storyboard/LICENSE +21 -0
  138. package/openai-text-to-image-storyboard/README.md +99 -0
  139. package/openai-text-to-image-storyboard/SKILL.md +107 -0
  140. package/openai-text-to-image-storyboard/agents/openai.yaml +4 -0
  141. package/openai-text-to-image-storyboard/scripts/generate_storyboard_images.py +763 -0
  142. package/package.json +36 -0
  143. package/record-spending/SKILL.md +113 -0
  144. package/record-spending/agents/openai.yaml +4 -0
  145. package/record-spending/references/account-format.md +33 -0
  146. package/record-spending/references/workbook-layout.md +84 -0
  147. package/resolve-review-comments/SKILL.md +122 -0
  148. package/resolve-review-comments/agents/openai.yaml +4 -0
  149. package/resolve-review-comments/references/adoption-criteria.md +23 -0
  150. package/resolve-review-comments/scripts/review_threads.py +425 -0
  151. package/resolve-review-comments/tests/test_review_threads.py +74 -0
  152. package/review-change-set/LICENSE +21 -0
  153. package/review-change-set/README.md +55 -0
  154. package/review-change-set/SKILL.md +103 -0
  155. package/review-change-set/agents/openai.yaml +4 -0
  156. package/review-codebases/LICENSE +21 -0
  157. package/review-codebases/README.md +67 -0
  158. package/review-codebases/SKILL.md +109 -0
  159. package/review-codebases/agents/openai.yaml +4 -0
  160. package/scripts/install_skills.ps1 +283 -0
  161. package/scripts/install_skills.sh +262 -0
  162. package/scripts/validate_openai_agent_config.py +194 -0
  163. package/scripts/validate_skill_frontmatter.py +110 -0
  164. package/specs-to-project-docs/LICENSE +21 -0
  165. package/specs-to-project-docs/README.md +57 -0
  166. package/specs-to-project-docs/SKILL.md +111 -0
  167. package/specs-to-project-docs/agents/openai.yaml +4 -0
  168. package/specs-to-project-docs/references/templates/architecture.md +29 -0
  169. package/specs-to-project-docs/references/templates/configuration.md +29 -0
  170. package/specs-to-project-docs/references/templates/developer-guide.md +33 -0
  171. package/specs-to-project-docs/references/templates/docs-index.md +39 -0
  172. package/specs-to-project-docs/references/templates/features.md +25 -0
  173. package/specs-to-project-docs/references/templates/getting-started.md +38 -0
  174. package/specs-to-project-docs/references/templates/readme.md +49 -0
  175. package/systematic-debug/LICENSE +21 -0
  176. package/systematic-debug/README.md +81 -0
  177. package/systematic-debug/SKILL.md +59 -0
  178. package/systematic-debug/agents/openai.yaml +4 -0
  179. package/text-to-short-video/.env.example +36 -0
  180. package/text-to-short-video/LICENSE +21 -0
  181. package/text-to-short-video/README.md +82 -0
  182. package/text-to-short-video/SKILL.md +221 -0
  183. package/text-to-short-video/agents/openai.yaml +4 -0
  184. package/text-to-short-video/scripts/enforce_video_aspect_ratio.py +350 -0
  185. package/version-release/CHANGELOG.md +53 -0
  186. package/version-release/LICENSE +21 -0
  187. package/version-release/README.md +28 -0
  188. package/version-release/SKILL.md +94 -0
  189. package/version-release/agents/openai.yaml +4 -0
  190. package/version-release/references/branch-naming.md +15 -0
  191. package/version-release/references/changelog-writing.md +8 -0
  192. package/version-release/references/commit-messages.md +19 -0
  193. package/version-release/references/readme-writing.md +12 -0
  194. package/version-release/references/semantic-versioning.md +12 -0
  195. package/video-production/CHANGELOG.md +104 -0
  196. package/video-production/LICENSE +18 -0
  197. package/video-production/README.md +68 -0
  198. package/video-production/SKILL.md +213 -0
  199. package/video-production/agents/openai.yaml +4 -0
  200. package/video-production/references/plan-template.md +54 -0
  201. package/video-production/references/roles-json.md +41 -0
  202. package/weekly-financial-event-report/SKILL.md +195 -0
  203. package/weekly-financial-event-report/agents/openai.yaml +4 -0
  204. package/weekly-financial-event-report/assets/financial_event_report_template.md +53 -0
@@ -0,0 +1,1385 @@
1
+ #!/usr/bin/env python3
2
+ """Convert text or text files into audio and sentence timelines."""
3
+
4
+ import argparse
5
+ import base64
6
+ import datetime as dt
7
+ import http.client
8
+ import json
9
+ import math
10
+ import os
11
+ import pathlib
12
+ import re
13
+ import shutil
14
+ import subprocess
15
+ import sys
16
+ import tempfile
17
+ import urllib.error
18
+ import urllib.parse
19
+ import urllib.request
20
+ import wave
21
+
22
+ try:
23
+ import aifc # type: ignore
24
+ except Exception: # pragma: no cover
25
+ aifc = None
26
+
27
+
28
# Alibaba Cloud Model Studio (DashScope intl) multimodal-generation endpoint
# used for API-mode TTS requests.
DEFAULT_API_ENDPOINT = (
    "https://dashscope-intl.aliyuncs.com/api/v1/services/"
    "aigc/multimodal-generation/generation"
)
# Default model and voice for API-mode synthesis.
DEFAULT_API_MODEL = "qwen3-tts"
DEFAULT_API_VOICE = "Cherry"
# NOTE(review): presumably the text length used when probing the service's
# per-request character limit — confirm against the callers of this constant.
DEFAULT_API_MAX_CHARS_PROBE_LENGTH = 5000
35
+
36
+
37
class DocsToVoiceError(Exception):
    """User-facing error for CLI failures (raised with human-readable messages)."""
39
+
40
+
41
def parse_args(argv):
    """Parse command-line arguments for the docs-to-voice converter.

    Exactly one of --text / --input-file must be supplied; --project-dir is
    mandatory. Returns the argparse namespace.
    """
    cli = argparse.ArgumentParser(
        prog="docs_to_voice.py",
        description="Convert text into speech and generate timeline JSON/SRT files.",
    )

    cli.add_argument("--project-dir", required=True, help="Root project directory")
    cli.add_argument("--project-name", help="Folder name under DIR/audio/")
    cli.add_argument("--output-name", help="Output filename")
    cli.add_argument("--env-file", help="Path to .env file")
    cli.add_argument("--mode", help="TTS mode: say|api")
    cli.add_argument("--voice", help="macOS say voice")
    cli.add_argument("--rate", help="macOS say rate (WPM)")
    cli.add_argument(
        "--speech-rate",
        help="Speech rate multiplier applied after synthesis (e.g. 1.2 faster, 0.8 slower)",
    )
    cli.add_argument("--api-endpoint", help="Model Studio TTS endpoint")
    cli.add_argument("--api-model", help="Model Studio model name")
    cli.add_argument("--api-voice", help="Model Studio voice")
    cli.add_argument(
        "--max-chars",
        help="Max chars per TTS request before auto chunking (0 disables chunking)",
    )
    cli.add_argument(
        "--no-auto-prosody",
        action="store_true",
        help="Disable punctuation pause enhancement in say mode",
    )
    cli.add_argument(
        "--force", action="store_true", help="Overwrite output if it already exists"
    )

    # Raw text and input file are mutually exclusive; one is required.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("--text", help="Raw text input")
    source.add_argument("--input-file", help="Path to input text file")

    return cli.parse_args(argv)
79
+
80
+
81
def trim(value):
    """Return *value* with leading and trailing whitespace removed."""
    return value.strip()
83
+
84
+
85
def strip_wrapping_quotes(value):
    """Drop one matching pair of surrounding single or double quotes, if any."""
    if len(value) < 2:
        return value
    first, last = value[0], value[-1]
    if first == last and first in ('"', "'"):
        return value[1:-1]
    return value
89
+
90
+
91
def load_env_file(file_path):
    """Parse KEY=VALUE pairs (optionally `export`-prefixed) from a .env file.

    Returns a dict of the parsed values; a missing file yields an empty dict.
    Blank lines, comment lines, and lines that do not match the KEY=VALUE
    shape are skipped. Wrapping quotes around values are removed.
    """
    if not file_path.is_file():
        return {}

    pattern = re.compile(r"^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$")
    parsed = {}
    content = file_path.read_text(encoding="utf-8", errors="replace")
    for raw_line in content.splitlines():
        line = trim(raw_line)
        if not line or line.startswith("#"):
            continue
        found = pattern.match(line)
        if found is None:
            continue
        parsed[found.group(1)] = strip_wrapping_quotes(trim(found.group(2)))

    return parsed
109
+
110
+
111
def resolve_setting(cli_value, env_key, env_values, default=""):
    """Resolve a setting by precedence: CLI flag, .env file, process env, default.

    A CLI value counts only when non-blank; file/env values count when truthy.
    """
    if cli_value is not None and str(cli_value).strip():
        return cli_value

    from_file = env_values.get(env_key, "")
    if from_file:
        return from_file

    from_process = os.environ.get(env_key)
    if from_process:
        return from_process

    return default
124
+
125
+
126
def normalize_mode(raw_mode):
    """Return the TTS mode as lowercase 'say' or 'api'; empty/None defaults to 'say'."""
    mode = (raw_mode or "say").strip().lower()
    if mode in ("say", "api"):
        return mode
    raise DocsToVoiceError("--mode must be one of: say, api")
131
+
132
+
133
def ensure_command(name, hint):
    """Raise DocsToVoiceError(*hint*) unless *name* resolves to an executable."""
    if shutil.which(name) is None:
        raise DocsToVoiceError(hint)
137
+
138
+
139
def read_input_text(args):
    """Return the input text: contents of --input-file when given, else --text.

    Relative input paths are resolved against the current working directory.
    Raises DocsToVoiceError when the file does not exist.
    """
    if not args.input_file:
        return args.text or ""

    source = pathlib.Path(args.input_file).expanduser()
    if not source.is_absolute():
        source = (pathlib.Path.cwd() / source).resolve()
    if not source.is_file():
        raise DocsToVoiceError("Input file not found: {0}".format(source))
    return source.read_text(encoding="utf-8", errors="replace")
148
+
149
+
150
def normalize_project_dir(project_dir):
    """Expand `~`, absolutize against the CWD, and resolve the project directory."""
    candidate = pathlib.Path(project_dir).expanduser()
    if candidate.is_absolute():
        return candidate.resolve()
    return (pathlib.Path.cwd() / candidate).resolve()
155
+
156
+
157
def extract_extension_from_url(source_url):
    """Return the lowercase extension of the URL's path component, or ''."""
    filename = pathlib.Path(urllib.parse.urlparse(source_url).path).name
    if "." not in filename:
        return ""
    return filename.rsplit(".", 1)[-1].strip().lower()
163
+
164
+
165
def extract_extension_from_audio_format(raw_format):
    """Normalize an audio-format/MIME hint (e.g. 'audio/mpeg') to an extension.

    Strips dots and MIME parameters, takes the subtype after '/', and maps
    a few well-known MIME subtypes to conventional extensions.
    """
    if not raw_format:
        return ""

    value = raw_format.strip().lower().lstrip(".")
    value = value.split(";", 1)[0].strip()
    if "/" in value:
        value = value.rsplit("/", 1)[-1]

    aliases = {
        "x-wav": "wav",
        "mpeg": "mp3",
        "x-m4a": "m4a",
        "x-aiff": "aiff",
    }
    return aliases.get(value, value)
182
+
183
+
184
def determine_api_extension(api_result):
    """Pick the output extension from an API payload: URL first, then format, else 'wav'."""
    ext = ""
    audio_url = api_result.get("audio_url")
    if audio_url:
        ext = extract_extension_from_url(audio_url)
    if not ext:
        ext = extract_extension_from_audio_format(api_result.get("audio_format", ""))
    return ext or "wav"
193
+
194
+
195
def split_sentences(raw_text):
    """Split text into sentences on CJK/ASCII terminators, line by line.

    Blank lines are skipped; a trailing fragment without a terminator on a
    line is kept as its own sentence.
    """
    terminators = set("。!?!?;;")
    result = []

    for raw_line in raw_text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        buffer = []
        for char in line:
            buffer.append(char)
            if char in terminators:
                piece = "".join(buffer).strip()
                if piece:
                    result.append(piece)
                buffer = []

        remainder = "".join(buffer).strip()
        if remainder:
            result.append(remainder)

    return result
218
+
219
+
220
def extract_timeline_sentences(source_text):
    """Return timeline sentences, falling back to the whole text as one sentence.

    Raises DocsToVoiceError when the text is empty/whitespace-only.
    """
    sentences = split_sentences(source_text)
    if sentences:
        return sentences

    stripped = source_text.strip()
    if stripped:
        return [stripped]

    raise DocsToVoiceError("No text content found for timeline generation.")
231
+
232
+
233
def read_duration_seconds(file_path):
    """Best-effort audio duration in seconds, or None when it cannot be read.

    Tries the stdlib wave/aifc readers first; on any failure falls back to
    parsing the output of the `afinfo` tool (macOS audio-file utility).
    """
    suffix = file_path.suffix.lower()

    try:
        if suffix == ".wav":
            with wave.open(str(file_path), "rb") as wav_file:
                frame_rate = wav_file.getframerate()
                if frame_rate > 0:
                    return wav_file.getnframes() / float(frame_rate)
        # aifc may be absent (removed from newer stdlib), hence the None guard.
        if suffix in {".aiff", ".aif", ".aifc"} and aifc is not None:
            with aifc.open(str(file_path), "rb") as aiff_file:
                frame_rate = aiff_file.getframerate()
                if frame_rate > 0:
                    return aiff_file.getnframes() / float(frame_rate)
    except Exception:
        # Corrupt/unsupported headers fall through to the afinfo probe below.
        pass

    try:
        proc = subprocess.run(
            ["afinfo", str(file_path)],
            check=False,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # afinfo not installed — no further fallback available.
        return None

    if proc.returncode != 0:
        return None

    # Scan both streams; afinfo's duration line placement varies.
    payload = "{0}\n{1}".format(proc.stdout, proc.stderr)
    patterns = (
        r"estimated duration:\s*([0-9.]+)\s*sec",
        r"duration:\s*([0-9.]+)\s*sec",
        r"duration:\s*([0-9.]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, payload, flags=re.IGNORECASE)
        if not match:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            return None

    return None
279
+
280
+
281
def sentence_weight(sentence):
    """Estimate a relative speech-duration weight for a sentence (minimum 1.0).

    Whitespace is ignored; each remaining character contributes a weight by
    class: ASCII alphanumerics are fast, CJK ideographs are slow, and
    punctuation contributes pause-sized amounts.
    """
    compact = re.sub(r"\s+", "", sentence)
    if not compact:
        return 1.0

    weight = 0.0
    for char in compact:
        if char.isascii() and char.isalnum():
            weight += 0.55  # Latin letters / digits
        elif "\u4e00" <= char <= "\u9fff":
            weight += 1.0  # CJK ideographs
        elif char in ",,、::":
            weight += 0.25  # minor pauses
        elif char in "。.!!??;;":
            weight += 0.45  # sentence-final pauses
        else:
            weight += 0.65

    return max(weight, 1.0)
300
+
301
+
302
+ def srt_time(seconds):
303
+ millis = int(round(max(seconds, 0.0) * 1000))
304
+ hours, millis = divmod(millis, 3_600_000)
305
+ minutes, millis = divmod(millis, 60_000)
306
+ secs, millis = divmod(millis, 1_000)
307
+ return "{0:02}:{1:02}:{2:02},{3:03}".format(hours, minutes, secs, millis)
308
+
309
+
310
def write_sentence_timeline_files(
    source_text,
    audio_path,
    sentence_durations=None,
    timing_mode_hint=None,
):
    """Write <audio>.timeline.json and <audio>.srt beside *audio_path*.

    Sentence boundaries come from *source_text*. Timing uses, in order of
    preference: measured per-sentence durations (scaled to the real audio
    length), then the audio duration distributed by per-sentence weights,
    then a pure estimate when no duration can be read.

    Args:
        source_text: Text the audio was synthesized from.
        audio_path: pathlib.Path of the synthesized audio file.
        sentence_durations: Optional per-sentence durations; used only when
            the list length matches the sentence count.
        timing_mode_hint: Optional label stored as "timing_mode" when the
            per-sentence path is taken (defaults to "sentence-audio").

    Raises:
        DocsToVoiceError: when no sentences can be extracted.
    """
    sentences = extract_timeline_sentences(source_text)
    duration_seconds = read_duration_seconds(audio_path)

    entries = []
    timing_mode = "duration-weighted"

    # Preferred path: real per-sentence durations, scaled so their sum
    # matches the measured audio duration.
    if sentence_durations is not None and len(sentence_durations) == len(sentences):
        normalized_durations = []
        for raw_duration in sentence_durations:
            try:
                parsed = float(raw_duration)
            except (TypeError, ValueError):
                parsed = 0.0
            normalized_durations.append(max(parsed, 0.0))

        duration_total = sum(normalized_durations)
        if duration_total > 0:
            if duration_seconds is None or duration_seconds <= 0:
                duration_seconds = duration_total

            scale = 1.0
            if duration_seconds and duration_seconds > 0:
                scale = duration_seconds / duration_total

            cursor = 0.0
            for index, sentence in enumerate(sentences):
                # Pin the last sentence to the full duration so rounding
                # drift cannot leave a gap at the end.
                if index == len(sentences) - 1:
                    end = duration_seconds
                else:
                    end = cursor + (normalized_durations[index] * scale)
                end = max(end, cursor)

                entries.append(
                    {
                        "index": index + 1,
                        "text": sentence,
                        "start_seconds": round(cursor, 3),
                        "end_seconds": round(end, 3),
                        "start_ms": int(round(cursor * 1000)),
                        "end_ms": int(round(end * 1000)),
                    }
                )
                cursor = end

            timing_mode = timing_mode_hint or "sentence-audio"

    # Fallback path: spread the total duration across sentences by weight,
    # estimating a duration when none could be measured.
    if not entries:
        weights = [sentence_weight(sentence) for sentence in sentences]
        total_weight = sum(weights)
        if total_weight <= 0:
            total_weight = float(len(sentences))

        if duration_seconds is None or duration_seconds <= 0:
            timing_mode = "estimated"
            # ~0.26 s per weight unit, floored at 0.4 s total.
            duration_seconds = max(total_weight * 0.26, 0.4)
        else:
            timing_mode = "duration-weighted"

        cursor = 0.0
        for index, sentence in enumerate(sentences):
            if index == len(sentences) - 1:
                end = duration_seconds
            else:
                portion = weights[index] / total_weight
                end = cursor + (duration_seconds * portion)
            end = max(end, cursor)

            entries.append(
                {
                    "index": index + 1,
                    "text": sentence,
                    "start_seconds": round(cursor, 3),
                    "end_seconds": round(end, 3),
                    "start_ms": int(round(cursor * 1000)),
                    "end_ms": int(round(end * 1000)),
                }
            )
            cursor = end

    if entries:
        # Force the final boundary to the exact audio duration.
        entries[-1]["end_seconds"] = round(duration_seconds, 3)
        entries[-1]["end_ms"] = int(round(duration_seconds * 1000))

    timeline_base = audio_path.with_suffix("")
    timeline_json_path = timeline_base.with_suffix(".timeline.json")
    timeline_srt_path = timeline_base.with_suffix(".srt")

    json_payload = {
        "audio_file": audio_path.name,
        "audio_path": str(audio_path),
        "audio_duration_seconds": round(duration_seconds, 3),
        "timing_mode": timing_mode,
        # UTC timestamp in RFC 3339 "Z" form.
        "generated_at": dt.datetime.now(dt.timezone.utc)
        .isoformat()
        .replace("+00:00", "Z"),
        "sentences": entries,
    }

    timeline_json_path.write_text(
        json.dumps(json_payload, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    srt_lines = []
    for entry in entries:
        srt_lines.append(str(entry["index"]))
        srt_lines.append(
            "{0} --> {1}".format(
                srt_time(entry["start_seconds"]), srt_time(entry["end_seconds"])
            )
        )
        srt_lines.append(entry["text"])
        srt_lines.append("")

    timeline_srt_path.write_text("\n".join(srt_lines).strip() + "\n", encoding="utf-8")
431
+
432
+
433
def apply_plaintext_prosody_rules(segment):
    """Insert macOS `say` [[slnc N]] pause markers into plain text.

    A run of newlines becomes a single pause (longer when the run marks a
    blank line); common punctuation keeps its character and gains a trailing
    pause scaled to its strength. Runs of spaces/tabs introduced by the
    markers are collapsed afterwards.
    """
    pieces = []
    position = 0
    total = len(segment)

    while position < total:
        char = segment[position]

        if char == "\n":
            run_end = position
            while run_end < total and segment[run_end] == "\n":
                run_end += 1
            # Blank line (2+ newlines) gets a longer pause than a line break.
            if run_end - position >= 2:
                pieces.append("[[slnc 260]] ")
            else:
                pieces.append("[[slnc 90]] ")
            position = run_end
            continue

        if char in ",、,:;:;":
            pieces.append("{0} [[slnc 120]] ".format(char))
        elif char in "。.":
            pieces.append("{0} [[slnc 180]] ".format(char))
        elif char in "??":
            pieces.append("{0} [[slnc 190]] ".format(char))
        elif char in "!!":
            pieces.append("{0} [[slnc 150]] ".format(char))
        else:
            pieces.append(char)

        position += 1

    return re.sub(r"[ \t]{2,}", " ", "".join(pieces))
467
+
468
+
469
def build_auto_prosody_text(raw_text):
    """Apply prosody rules to plain text while leaving [[...]] say-commands intact.

    The capturing split yields existing [[...]] commands at odd indices;
    those are passed through verbatim, everything else is enhanced.
    """
    converted = []
    for index, part in enumerate(re.split(r"(\[\[[\s\S]*?\]\])", raw_text)):
        is_command = index % 2 == 1 and part.startswith("[[") and part.endswith("]]")
        converted.append(part if is_command else apply_plaintext_prosody_rules(part))
    return "".join(converted)
480
+
481
+
482
def api_text_length_units(raw_text):
    """Length of *raw_text* in API units: CJK ideographs count as two, others as one."""
    return sum(2 if "\u4e00" <= char <= "\u9fff" else 1 for char in raw_text)
490
+
491
+
492
def split_oversized_text(raw_text, max_chars, length_func):
    """Hard-split *raw_text* into stripped pieces of at most *max_chars* units.

    *length_func* gives the unit cost of a single character; non-positive
    costs are treated as 1 so the loop always makes progress. Pieces that
    strip to empty are dropped.
    """
    pieces = []
    buffer = []
    used_units = 0

    for char in raw_text:
        cost = length_func(char)
        if cost <= 0:
            cost = 1

        if buffer and used_units + cost > max_chars:
            flushed = "".join(buffer).strip()
            if flushed:
                pieces.append(flushed)
            buffer = [char]
            used_units = cost
        else:
            buffer.append(char)
            used_units += cost

    remainder = "".join(buffer).strip()
    if remainder:
        pieces.append(remainder)

    return pieces
519
+
520
+
521
def split_text_into_api_sentence_requests(source_text, max_chars, length_func):
    """Map timeline sentences to per-request items, splitting oversized sentences.

    Returns (sentences, request_items); each item records the index of the
    sentence it belongs to so chunked audio can be re-grouped per sentence.
    """
    if length_func is None:
        length_func = len

    sentences = extract_timeline_sentences(source_text)
    request_items = []

    for sentence_index, sentence in enumerate(sentences):
        if max_chars and length_func(sentence) > max_chars:
            parts = split_oversized_text(sentence, max_chars, length_func) or [sentence]
        else:
            parts = [sentence]

        request_items.extend(
            {"sentence_index": sentence_index, "text": part} for part in parts
        )

    return sentences, request_items
540
+
541
+
542
def split_text_for_tts(source_text, max_chars, length_func=None):
    """Chunk text for TTS requests, preferring paragraph/sentence boundaries.

    Returns [] for empty input, or the whole text as a single chunk when it
    fits (or when *max_chars* is falsy). Adjacent sentences are merged with
    a space while they fit; sentences longer than *max_chars* are hard-split.
    """
    if length_func is None:
        length_func = len

    text = source_text.replace("\r\n", "\n").replace("\r", "\n").strip()
    if not text:
        return []
    if not max_chars or length_func(text) <= max_chars:
        return [text]

    chunks = []
    pending = ""
    paragraphs = [part.strip() for part in re.split(r"\n{2,}", text) if part.strip()]

    for paragraph in paragraphs:
        sentences = [
            item.strip()
            for item in re.split(r"(?<=[。!?!?;;.!?])", paragraph)
            if item.strip()
        ] or [paragraph]

        for sentence in sentences:
            if length_func(sentence) > max_chars:
                # Flush the running chunk, then hard-split the long sentence.
                if pending:
                    chunks.append(pending.strip())
                    pending = ""
                chunks.extend(
                    piece
                    for piece in split_oversized_text(sentence, max_chars, length_func)
                    if piece
                )
                continue

            if not pending:
                pending = sentence
                continue

            merged = "{0} {1}".format(pending, sentence)
            if length_func(merged) <= max_chars:
                pending = merged
            else:
                chunks.append(pending.strip())
                pending = sentence

    if pending:
        chunks.append(pending.strip())

    return chunks
592
+
593
+
594
def concat_wav_files(part_paths, output_path):
    """Concatenate WAV chunks into *output_path*.

    All parts must share channel count, sample width, frame rate, and
    compression type; a mismatch raises before any output is written.
    Frames are streamed in bounded blocks instead of being buffered whole
    in memory, so long recordings do not balloon RSS.

    Raises:
        DocsToVoiceError: when chunk WAV formats are inconsistent.
    """
    with wave.open(str(part_paths[0]), "rb") as first_file:
        params = first_file.getparams()

    # Validate every chunk up front so a mismatch never leaves a partial file.
    for part_path in part_paths[1:]:
        with wave.open(str(part_path), "rb") as current:
            if (
                current.getnchannels() != params.nchannels
                or current.getsampwidth() != params.sampwidth
                or current.getframerate() != params.framerate
                or current.getcomptype() != params.comptype
            ):
                raise DocsToVoiceError("Chunk WAV formats do not match; cannot concatenate.")

    with wave.open(str(output_path), "wb") as output:
        output.setparams(params)
        for part_path in part_paths:
            with wave.open(str(part_path), "rb") as current:
                # Copy in fixed-size frame blocks to keep memory bounded.
                while True:
                    block = current.readframes(65536)
                    if not block:
                        break
                    output.writeframes(block)
614
+
615
+
616
def concat_aiff_files(part_paths, output_path):
    """Concatenate AIFF chunks into *output_path* via the stdlib aifc module.

    Requires the aifc module (guarded at import time); all chunks must share
    the first chunk's channel count, sample width, frame rate, and
    compression type or DocsToVoiceError is raised.
    """
    if aifc is None:
        raise DocsToVoiceError("AIFF concatenation requires Python aifc module.")

    with aifc.open(str(part_paths[0]), "rb") as head:
        params = head.getparams()
        collected = [head.readframes(head.getnframes())]

    for part_path in part_paths[1:]:
        with aifc.open(str(part_path), "rb") as current:
            compatible = (
                current.getnchannels() == params.nchannels
                and current.getsampwidth() == params.sampwidth
                and current.getframerate() == params.framerate
                and current.getcomptype() == params.comptype
            )
            if not compatible:
                raise DocsToVoiceError("Chunk AIFF formats do not match; cannot concatenate.")
            collected.append(current.readframes(current.getnframes()))

    with aifc.open(str(output_path), "wb") as output:
        output.setparams(params)
        for chunk in collected:
            output.writeframes(chunk)
639
+
640
+
641
def concat_with_ffmpeg(part_paths, output_path):
    """Concatenate audio chunks with ffmpeg's concat demuxer.

    Writes a temporary list file, runs `ffmpeg -f concat`, and removes the
    list file afterwards. PCM codecs are forced for WAV/AIFF outputs so the
    result stays uncompressed.

    Raises:
        DocsToVoiceError: when ffmpeg is missing or exits non-zero.
    """
    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        raise DocsToVoiceError("ffmpeg is required for concatenating this audio format.")

    # The concat demuxer reads input paths from a list file; single quotes
    # inside paths must be escaped per ffmpeg's quoting rules.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".txt", delete=False
    ) as handle:
        list_file = pathlib.Path(handle.name)
        for part_path in part_paths:
            escaped = str(part_path).replace("'", "'\\''")
            handle.write("file '{0}'\n".format(escaped))

    codec_args = []
    ext = output_path.suffix.lower()
    if ext == ".wav":
        codec_args = ["-c:a", "pcm_s16le"]
    elif ext in {".aiff", ".aif", ".aifc"}:
        codec_args = ["-c:a", "pcm_s16be"]

    command = [
        ffmpeg,
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-f",
        "concat",
        "-safe",
        "0",
        "-i",
        str(list_file),
    ] + codec_args + [str(output_path)]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as exc:
        # Chain the original error so the ffmpeg failure stays debuggable.
        raise DocsToVoiceError(
            "ffmpeg failed while concatenating chunks (exit {0}).".format(exc.returncode)
        ) from exc
    finally:
        try:
            list_file.unlink()
        except FileNotFoundError:
            pass
686
+
687
+
688
def concat_audio_files(part_paths, output_path):
    """Join chunk audio files into a single file at *output_path*.

    A single chunk is copied directly.  WAV prefers the stdlib writer with an
    ffmpeg fallback; AIFF prefers ffmpeg with a stdlib fallback; every other
    extension goes straight to ffmpeg.
    """
    if not part_paths:
        raise DocsToVoiceError("No chunk audio generated for concatenation.")

    # One chunk means there is nothing to concatenate.
    if len(part_paths) == 1:
        shutil.copyfile(str(part_paths[0]), str(output_path))
        return

    extension = output_path.suffix.lower()

    if extension == ".wav":
        # Stdlib first; any failure (format mismatch, odd encoding) falls
        # back to ffmpeg which can transcode.
        try:
            concat_wav_files(part_paths, output_path)
        except Exception:
            concat_with_ffmpeg(part_paths, output_path)
        return

    if extension in {".aiff", ".aif", ".aifc"}:
        if shutil.which("ffmpeg"):
            concat_with_ffmpeg(part_paths, output_path)
            return
        try:
            concat_aiff_files(part_paths, output_path)
        except Exception:
            raise DocsToVoiceError(
                "AIFF chunk concatenation failed without ffmpeg. "
                "Install ffmpeg or use --output-name with .wav."
            )
        return

    # Unknown container: only ffmpeg knows how to stitch it.
    concat_with_ffmpeg(part_paths, output_path)
717
+
718
+
719
def build_atempo_filter_chain(speech_rate):
    """Build an ffmpeg ``atempo`` filter chain for *speech_rate*.

    ffmpeg's atempo filter only accepts factors in [0.5, 2.0], so rates
    outside that range are decomposed into a product of in-range factors,
    rendered as a comma-separated chain like ``atempo=2.0,atempo=1.5``.
    """
    tempo_factors = []
    leftover = float(speech_rate)

    # Peel off 0.5x steps until the remainder reaches the lower bound...
    while leftover < 0.5:
        tempo_factors.append(0.5)
        leftover /= 0.5

    # ...and 2.0x steps until it falls within the upper bound.
    while leftover > 2.0:
        tempo_factors.append(2.0)
        leftover /= 2.0

    tempo_factors.append(leftover)

    # Render each factor with trailing zeros trimmed but always keeping a
    # decimal point (e.g. 2.0 -> "2.0", 1.5 -> "1.5").
    rendered = []
    for tempo in tempo_factors:
        label = "{0:.6f}".format(tempo).rstrip("0").rstrip(".")
        if "." not in label:
            label += ".0"
        rendered.append("atempo={0}".format(label))
    return ",".join(rendered)
740
+
741
+
742
def apply_speech_rate_to_audio(output_path, speech_rate):
    """Re-render *output_path* at *speech_rate* tempo using ffmpeg's atempo.

    A rate of None or ~1.0 is a no-op.  The filtered audio is written to a
    temp file next to the output, then atomically moved over the original.
    Raises DocsToVoiceError when ffmpeg is missing or fails.
    """
    # No-op when the rate is unset or effectively 1.0 (float tolerance).
    if speech_rate is None or abs(speech_rate - 1.0) < 1e-9:
        return

    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        raise DocsToVoiceError("--speech-rate requires ffmpeg to be installed.")

    # Create the temp file in the same directory so the final replace() is a
    # same-filesystem rename; the file is closed immediately and only the
    # name is kept for ffmpeg to overwrite (-y).
    with tempfile.NamedTemporaryFile(
        suffix=output_path.suffix,
        prefix="docs-to-voice-rate-",
        dir=str(output_path.parent),
        delete=False,
    ) as handle:
        temp_output_path = pathlib.Path(handle.name)

    command = [
        ffmpeg,
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        str(output_path),
        "-filter:a",
        build_atempo_filter_chain(speech_rate),
        str(temp_output_path),
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as exc:
        # Clean up the partial temp file before surfacing the failure.
        if temp_output_path.exists():
            temp_output_path.unlink()
        raise DocsToVoiceError(
            "ffmpeg failed while applying --speech-rate (exit {0}).".format(
                exc.returncode
            )
        )

    # ffmpeg can exit 0 yet produce nothing usable; treat empty output as failure.
    if not temp_output_path.is_file() or temp_output_path.stat().st_size == 0:
        if temp_output_path.exists():
            temp_output_path.unlink()
        raise DocsToVoiceError("Failed to apply --speech-rate to output audio.")

    # Atomic swap: the original is only replaced once the new file is valid.
    temp_output_path.replace(output_path)
788
+
789
+
790
def parse_api_error_message(raw_payload):
    """Extract a short human-readable error from an API error payload.

    Parses *raw_payload* as a JSON object and combines its ``code`` and
    ``message`` fields.  Invalid JSON — or JSON that is not an object —
    falls back to the raw payload truncated to 400 characters.
    """
    try:
        data = json.loads(raw_payload)
    except Exception:
        return raw_payload[:400]

    # Valid JSON may still be a list/string/number; .get() would raise
    # AttributeError on those, so fall back to the raw text instead.
    if not isinstance(data, dict):
        return raw_payload[:400]

    message = data.get("message")
    code = data.get("code")
    if message and code:
        return "{0}: {1}".format(code, message)
    if message:
        return message
    if code:
        return code
    return raw_payload[:400]
805
+
806
+
807
def parse_positive_int(raw_value):
    """Coerce *raw_value* to a positive int, returning None when impossible.

    Accepts ints and floats (floats truncate toward zero) and digit strings
    with optional thousands commas ("1,000").  Booleans, non-positive
    values, and anything non-numeric map to None.
    """
    # bool is a subclass of int, so it must be rejected before the numeric branch.
    if raw_value is None or isinstance(raw_value, bool):
        return None

    if isinstance(raw_value, (int, float)):
        truncated = int(raw_value)
        return truncated if truncated > 0 else None

    digits = str(raw_value).strip().replace(",", "")
    if not digits.isdigit():
        return None

    number = int(digits)
    return number if number > 0 else None
826
+
827
+
828
def extract_max_chars_from_text(raw_text):
    """Scan an error/description string for a maximum-input-length figure.

    Tries several English and Chinese phrasings the API is known to use and
    returns the first positive integer found, or None when nothing matches.
    """
    if not raw_text:
        return None

    known_patterns = (
        r"range of input length should be \[\s*\d+\s*,\s*([\d,]+)\s*\]",
        r"max(?:imum)?\s*(?:input\s*)?(?:text\s*)?(?:length|characters?|chars?)\s*(?:is|:|=)\s*([\d,]+)",
        r"(?:cannot exceed|must be less than or equal to|must be <=?|up to)\s*([\d,]+)\s*(?:characters?|chars?)",
        r"(?:不超過|不能超過|上限為|上限为)\s*([\d,]+)\s*(?:個?字元|個?字符|字元|字符)",
    )
    for known_pattern in known_patterns:
        found = re.search(known_pattern, raw_text, flags=re.IGNORECASE)
        if found is None:
            continue
        # The captured group may carry thousands commas; the shared parser
        # strips them and rejects non-positive values.
        limit = parse_positive_int(found.group(1))
        if limit:
            return limit

    return None
847
+
848
+
849
def fetch_json_payload(url, headers, timeout=30):
    """GET *url* with *headers* and decode the response body as JSON."""
    http_request = urllib.request.Request(url, method="GET", headers=headers)
    with urllib.request.urlopen(http_request, timeout=timeout) as http_response:
        body = http_response.read().decode("utf-8", errors="replace")
    return json.loads(body)
854
+
855
+
856
def extract_model_entry_max_chars(model_entry):
    """Pull a max-input-characters limit out of one /models catalogue entry.

    Probes a series of known field names on the entry itself and on its
    nested ``model_info``/``inference_metadata`` objects, then falls back to
    parsing the free-text description.
    """
    nested_info = model_entry.get("model_info") or {}
    nested_metadata = model_entry.get("inference_metadata") or {}

    # Field names vary across API versions; try them in priority order.
    field_values = (
        model_entry.get("max_input_chars"),
        model_entry.get("max_input_characters"),
        model_entry.get("max_input_length"),
        model_entry.get("max_text_length"),
        nested_info.get("max_input_chars"),
        nested_info.get("max_input_characters"),
        nested_info.get("max_input_length"),
        nested_info.get("max_text_length"),
        nested_info.get("max_input_tokens"),
        nested_metadata.get("max_input_chars"),
        nested_metadata.get("max_input_length"),
    )
    for field_value in field_values:
        limit = parse_positive_int(field_value)
        if limit:
            return limit

    # Last resort: the limit may only appear in the human-readable description.
    return extract_max_chars_from_text(model_entry.get("description", ""))
879
+
880
+
881
def fetch_api_model_max_chars(api_endpoint, api_key, model):
    """Look up *model*'s max-input-chars limit via the /api/v1/models listing.

    Pages through the model catalogue hosted on the endpoint's origin and
    returns the limit for the matching model.  Returns None when the
    endpoint URL is unusable, a request fails, or the model is not listed.
    """
    endpoint_parts = urllib.parse.urlparse(api_endpoint)
    if not endpoint_parts.scheme or not endpoint_parts.netloc:
        return None

    origin = "{0}://{1}".format(endpoint_parts.scheme, endpoint_parts.netloc)
    auth_headers = {"Authorization": "Bearer {0}".format(api_key)}

    page_no = 1
    page_size = 100
    while True:
        query_string = urllib.parse.urlencode(
            {"page_no": page_no, "page_size": page_size}
        )
        listing_url = "{0}/api/v1/models?{1}".format(origin, query_string)

        try:
            listing = fetch_json_payload(listing_url, headers=auth_headers)
        except Exception:
            # Discovery is best-effort: any network/parse failure means "unknown".
            return None

        listing_output = listing.get("output") or {}
        for entry in listing_output.get("models") or []:
            if (entry.get("model") or "").strip() == model:
                return extract_model_entry_max_chars(entry)

        # Stop when the catalogue is exhausted or the reported total is unusable.
        total_models = parse_positive_int(listing_output.get("total")) or 0
        if total_models <= 0 or page_no * page_size >= total_models:
            return None
        page_no += 1
913
+
914
+
915
def probe_api_max_chars(api_endpoint, api_key, model, voice):
    """Discover the input limit by sending a deliberately oversized request.

    When the TTS API rejects the probe, its error message often states the
    actual limit; extract and return it.  Returns None when the probe
    unexpectedly succeeds or the error message carries no limit.
    """
    oversized_text = "測" * DEFAULT_API_MAX_CHARS_PROBE_LENGTH

    try:
        request_model_studio_audio(
            api_endpoint=api_endpoint,
            api_key=api_key,
            model=model,
            voice=voice,
            text=oversized_text,
        )
    except DocsToVoiceError as exc:
        # The wrapped API error text is where the limit figure lives.
        return extract_max_chars_from_text(str(exc))

    return None
930
+
931
+
932
def discover_api_max_chars(api_endpoint, api_key, model, voice):
    """Resolve the API's max-input-chars limit for *model*.

    Prefers the documented value from the model catalogue; when that yields
    nothing, falls back to probing the TTS endpoint with an oversized request.
    """
    catalogue_limit = fetch_api_model_max_chars(
        api_endpoint=api_endpoint,
        api_key=api_key,
        model=model,
    )
    if catalogue_limit:
        return catalogue_limit
    return probe_api_max_chars(
        api_endpoint=api_endpoint,
        api_key=api_key,
        model=model,
        voice=voice,
    )
946
+
947
+
948
def request_model_studio_audio(api_endpoint, api_key, model, voice, text):
    """POST one TTS request to the Model Studio endpoint and parse the reply.

    Returns a dict with ``audio_url``, ``audio_data`` (base64), and
    ``audio_format`` — at least one of url/data is guaranteed non-empty.
    Raises DocsToVoiceError on HTTP, network, protocol, or payload errors.
    """
    payload = {
        "model": model,
        "input": {
            "text": text,
            "voice": voice,
        },
    }

    # ensure_ascii=False keeps CJK text readable/compact on the wire.
    data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    request = urllib.request.Request(
        api_endpoint,
        method="POST",
        data=data,
        headers={
            "Authorization": "Bearer {0}".format(api_key),
            "Content-Type": "application/json",
        },
    )

    try:
        # Long timeout: synthesis of a large chunk can take minutes.
        with urllib.request.urlopen(request, timeout=300) as response:
            raw_payload = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # The error body carries the API's code/message; surface a short form.
        raw_payload = exc.read().decode("utf-8", errors="replace")
        detail = parse_api_error_message(raw_payload)
        raise DocsToVoiceError(
            "Model Studio TTS request failed (HTTP {0}): {1}".format(exc.code, detail)
        )
    except urllib.error.URLError as exc:
        # DNS/connection-level failure; "reason" may be an OSError or string.
        reason = getattr(exc, "reason", exc)
        raise DocsToVoiceError(
            "Model Studio TTS request failed: {0}".format(reason)
        )
    except http.client.HTTPException as exc:
        # Protocol-level failure (e.g. malformed response) below urllib.
        raise DocsToVoiceError(
            "Model Studio TTS request failed: {0}".format(exc)
        )

    try:
        response_json = json.loads(raw_payload)
    except json.JSONDecodeError:
        raise DocsToVoiceError("API response is not valid JSON.")

    # Tolerate missing/null intermediate objects with `or {}` / `or ""`.
    output = response_json.get("output") or {}
    audio = output.get("audio") or {}
    audio_url = audio.get("url") or ""
    audio_data = audio.get("data") or ""
    audio_format = audio.get("format") or audio.get("mime_type") or ""

    if not audio_url and not audio_data:
        raise DocsToVoiceError(
            "API response does not contain output.audio.url or output.audio.data"
        )

    return {
        "audio_url": audio_url,
        "audio_data": audio_data,
        "audio_format": audio_format,
    }
1008
+
1009
+
1010
def download_binary(url, output_path):
    """Download *url* and write its raw bytes to *output_path*.

    Raises DocsToVoiceError on any network or HTTP-protocol failure.
    """
    try:
        with urllib.request.urlopen(url, timeout=300) as remote:
            body = remote.read()
    except urllib.error.URLError as exc:
        # Covers HTTPError too (it subclasses URLError).
        reason = getattr(exc, "reason", exc)
        raise DocsToVoiceError("Failed to download audio URL: {0}".format(reason))
    except http.client.HTTPException as exc:
        raise DocsToVoiceError("Failed to download audio URL: {0}".format(exc))

    output_path.write_bytes(body)
1021
+
1022
+
1023
def write_base64_audio(raw_base64_data, output_path):
    """Decode a base64 audio payload and write the bytes to *output_path*.

    Raises DocsToVoiceError when the payload is not valid base64.
    """
    try:
        # validate=True rejects any non-alphabet character instead of skipping it.
        decoded = base64.b64decode(raw_base64_data, validate=True)
    except Exception:
        raise DocsToVoiceError("API returned invalid output.audio.data payload.")

    output_path.write_bytes(decoded)
1030
+
1031
+
1032
def ensure_output_not_exists(output_path, force):
    """Refuse to clobber an existing output file unless *force* is set."""
    if force or not output_path.exists():
        return
    raise DocsToVoiceError(
        "Output already exists: {0} (use --force to overwrite)".format(output_path)
    )
1037
+
1038
+
1039
def run_say_mode(output_path, text, voice, rate):
    """Render *text* to *output_path* using the macOS ``say`` command.

    The text goes through a temp file (sidestepping argv length limits);
    ``say`` is invoked with the optional voice and words-per-minute rate,
    and the temp file is always removed afterwards.
    """
    ensure_command("say", "macOS 'say' command not found.")

    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=".txt", delete=False
    ) as handle:
        handle.write(text)
        text_file = pathlib.Path(handle.name)

    say_command = ["say", "-o", str(output_path)]
    if voice:
        say_command += ["-v", voice]
    if rate is not None:
        say_command += ["-r", str(rate)]
    say_command += ["-f", str(text_file)]

    try:
        subprocess.run(say_command, check=True)
    except subprocess.CalledProcessError as exc:
        raise DocsToVoiceError("say mode failed with exit code {0}".format(exc.returncode))
    finally:
        # Best-effort cleanup of the temp text file.
        try:
            text_file.unlink()
        except FileNotFoundError:
            pass
1064
+
1065
+
1066
def choose_output_name(base_name, has_extension, mode, api_result):
    """Return the final output filename, appending an extension when missing.

    A name that already carries an extension is kept as-is; ``say`` mode
    always produces AIFF; API mode derives the extension from the response.
    """
    if has_extension:
        return base_name
    if mode == "say":
        return "{0}.aiff".format(base_name)
    return "{0}.{1}".format(base_name, determine_api_extension(api_result))
1075
+
1076
+
1077
def validate_rate(raw_rate):
    """Parse the --rate flag into a positive int (None when unset).

    Raises DocsToVoiceError for non-numeric or non-positive values.
    """
    if raw_rate is None:
        return None
    if raw_rate.isdigit() and int(raw_rate) > 0:
        return int(raw_rate)
    raise DocsToVoiceError("--rate must be a positive integer.")
1083
+
1084
+
1085
def validate_speech_rate(raw_value):
    """Parse the --speech-rate setting into a positive finite float.

    None or a blank/whitespace value means "unset" and yields None; anything
    non-numeric, non-finite (inf/nan), or <= 0 raises DocsToVoiceError.
    """
    if raw_value is None:
        return None

    trimmed = str(raw_value).strip()
    if not trimmed:
        return None

    try:
        rate = float(trimmed)
    except ValueError:
        raise DocsToVoiceError("--speech-rate must be a positive number.")

    if math.isfinite(rate) and rate > 0:
        return rate
    raise DocsToVoiceError("--speech-rate must be a positive number.")
1101
+
1102
+
1103
def scale_sentence_durations(sentence_durations, speech_rate):
    """Rescale per-sentence durations to match a tempo change.

    Playing audio at *speech_rate* divides every duration by that factor.
    A missing/empty list or an effectively-1.0 rate passes through untouched;
    negative input values are clamped to zero before scaling.
    """
    unchanged = (
        not sentence_durations
        or speech_rate is None
        or abs(speech_rate - 1.0) < 1e-9
    )
    if unchanged:
        return sentence_durations
    return [max(float(duration), 0.0) / speech_rate for duration in sentence_durations]
1107
+
1108
+
1109
def validate_max_chars(raw_value):
    """Parse the --max-chars setting into a positive int.

    None, a blank value, and "0" (explicit disable) all yield None; a value
    containing non-digit characters raises DocsToVoiceError.
    """
    if raw_value is None:
        return None

    trimmed = str(raw_value).strip()
    if not trimmed:
        return None
    if not trimmed.isdigit():
        raise DocsToVoiceError("--max-chars must be a non-negative integer.")

    limit = int(trimmed)
    return limit if limit > 0 else None
1123
+
1124
+
1125
def is_max_chars_disabled(raw_value):
    """Return True when the user explicitly disabled chunking with "0"."""
    return raw_value is not None and str(raw_value).strip() == "0"
1129
+
1130
+
1131
def main(argv=None):
    """CLI entry point: convert input text to one audio file and print its path.

    Pipeline: resolve settings (flags > env file) -> read and chunk the text
    -> synthesize each chunk (macOS `say` or Model Studio API) -> concatenate
    chunks -> optionally retime with --speech-rate -> write sentence-timeline
    sidecar files.  Returns 0 on success; raises DocsToVoiceError on failure.
    """
    args = parse_args(argv or sys.argv[1:])

    # The .env default lives one directory above this script (the skill root).
    script_dir = pathlib.Path(__file__).resolve().parent
    skill_dir = script_dir.parent

    env_file = args.env_file
    if env_file:
        env_path = pathlib.Path(env_file).expanduser()
        if not env_path.is_absolute():
            # Relative --env-file paths are resolved against the CWD, not the script.
            env_path = (pathlib.Path.cwd() / env_path).resolve()
    else:
        env_path = skill_dir / ".env"

    env_values = load_env_file(env_path)

    # Each setting falls back: CLI flag -> env value -> hard default.
    mode = normalize_mode(resolve_setting(args.mode, "DOCS_TO_VOICE_MODE", env_values, "say"))
    say_voice = resolve_setting(args.voice, "DOCS_TO_VOICE_VOICE", env_values)
    api_endpoint = resolve_setting(
        args.api_endpoint,
        "DOCS_TO_VOICE_API_ENDPOINT",
        env_values,
        DEFAULT_API_ENDPOINT,
    )
    api_model = resolve_setting(
        args.api_model,
        "DOCS_TO_VOICE_API_MODEL",
        env_values,
        DEFAULT_API_MODEL,
    )
    api_voice = resolve_setting(
        args.api_voice,
        "DOCS_TO_VOICE_API_VOICE",
        env_values,
        DEFAULT_API_VOICE,
    )
    # The API key has no CLI flag; it comes from the environment/.env only.
    api_key = resolve_setting(None, "DASHSCOPE_API_KEY", env_values)
    raw_max_chars = resolve_setting(args.max_chars, "DOCS_TO_VOICE_MAX_CHARS", env_values)
    raw_speech_rate = resolve_setting(
        args.speech_rate,
        "DOCS_TO_VOICE_SPEECH_RATE",
        env_values,
    )
    # "0" disables chunk-size discovery entirely (tracked separately because
    # validate_max_chars maps both "unset" and "0" to None).
    max_chars = validate_max_chars(raw_max_chars)
    max_chars_disabled = is_max_chars_disabled(raw_max_chars)

    rate = validate_rate(args.rate)
    speech_rate = validate_speech_rate(raw_speech_rate)

    if mode == "api" and not api_key:
        raise DocsToVoiceError("DASHSCOPE_API_KEY is required for api mode.")

    source_text = read_input_text(args)
    if not source_text.strip():
        raise DocsToVoiceError("No text content found for conversion.")

    # With no explicit limit (and not disabled), ask the API for its limit.
    if mode == "api" and max_chars is None and not max_chars_disabled:
        discovered_max_chars = discover_api_max_chars(
            api_endpoint=api_endpoint,
            api_key=api_key,
            model=api_model,
            voice=api_voice,
        )
        if discovered_max_chars:
            max_chars = discovered_max_chars

    timeline_sentence_durations = None
    timeline_timing_mode_hint = None

    # API mode splits by sentence (for per-sentence timing); say mode splits
    # by plain character-count chunks.
    if mode == "api":
        api_sentences, api_request_items = split_text_into_api_sentence_requests(
            source_text=source_text,
            max_chars=max_chars,
            length_func=api_text_length_units,
        )
        if not api_request_items:
            raise DocsToVoiceError("No text content found for conversion.")
    else:
        text_chunks = split_text_for_tts(
            source_text,
            max_chars,
            length_func=None,
        )
        if not text_chunks:
            raise DocsToVoiceError("No text content found for conversion.")

    project_dir = normalize_project_dir(args.project_dir)
    project_name = args.project_name or project_dir.name
    if not project_name:
        raise DocsToVoiceError("Unable to determine project name.")

    output_dir = project_dir / "audio" / project_name
    output_dir.mkdir(parents=True, exist_ok=True)

    # Default output name is timestamped so repeated runs never collide.
    output_name = args.output_name or "voice-{0}".format(dt.datetime.now().strftime("%Y%m%d-%H%M%S"))
    output_name_has_extension = "." in output_name

    if mode == "say":
        # Auto-prosody rewrites each chunk for better pacing unless disabled.
        if not args.no_auto_prosody:
            request_chunks = [build_auto_prosody_text(chunk) for chunk in text_chunks]
        else:
            request_chunks = text_chunks

        final_output_name = choose_output_name(
            output_name,
            output_name_has_extension,
            mode,
            api_result={},
        )
        output_path = output_dir / final_output_name
        ensure_output_not_exists(output_path, args.force)

        if len(request_chunks) == 1:
            # Single chunk renders straight to the final path.
            run_say_mode(output_path, request_chunks[0], say_voice, rate)
        else:
            # Multiple chunks render to temp parts, then get concatenated.
            with tempfile.TemporaryDirectory(prefix="docs-to-voice-say-") as temp_dir:
                temp_dir_path = pathlib.Path(temp_dir)
                part_ext = output_path.suffix or ".aiff"
                part_paths = []

                for index, chunk_text in enumerate(request_chunks, start=1):
                    part_path = temp_dir_path / "part-{0:04d}{1}".format(index, part_ext)
                    run_say_mode(part_path, chunk_text, say_voice, rate)
                    part_paths.append(part_path)

                concat_audio_files(part_paths, output_path)
    else:
        # API mode: synthesize each sentence-request, tracking per-sentence
        # durations so a timeline can be written afterwards.
        with tempfile.TemporaryDirectory(prefix="docs-to-voice-api-") as temp_dir:
            temp_dir_path = pathlib.Path(temp_dir)
            part_paths = []
            part_ext = ""
            sentence_durations = [0.0 for _ in api_sentences]
            sentence_duration_known = [True for _ in api_sentences]

            for index, request_item in enumerate(api_request_items, start=1):
                chunk_text = request_item["text"]
                api_result = request_model_studio_audio(
                    api_endpoint=api_endpoint,
                    api_key=api_key,
                    model=api_model,
                    voice=api_voice,
                    text=chunk_text,
                )

                # All chunks must come back in the same container format,
                # since they are concatenated byte-compatibly below.
                current_ext = determine_api_extension(api_result)
                if not part_ext:
                    part_ext = current_ext
                elif current_ext != part_ext:
                    raise DocsToVoiceError(
                        "API returned inconsistent chunk formats ({0} vs {1}).".format(
                            part_ext, current_ext
                        )
                    )

                part_path = temp_dir_path / "part-{0:04d}.{1}".format(index, part_ext)
                # Prefer the hosted URL; fall back to inline base64 audio.
                if api_result.get("audio_url"):
                    download_binary(api_result["audio_url"], part_path)
                else:
                    write_base64_audio(api_result.get("audio_data", ""), part_path)

                if not part_path.is_file() or part_path.stat().st_size == 0:
                    raise DocsToVoiceError("Failed to generate audio chunk {0}.".format(index))
                part_paths.append(part_path)

                # A sentence may span several requests; accumulate durations.
                # Unreadable durations mark the sentence as "unknown" for the
                # weight-based redistribution below.
                sentence_index = request_item["sentence_index"]
                part_duration = read_duration_seconds(part_path)
                if part_duration is None or part_duration <= 0:
                    sentence_duration_known[sentence_index] = False
                else:
                    sentence_durations[sentence_index] += part_duration

            final_output_name = output_name
            if not output_name_has_extension:
                final_output_name = "{0}.{1}".format(output_name, part_ext or "wav")

            output_path = output_dir / final_output_name
            ensure_output_not_exists(output_path, args.force)

            # A user-supplied extension must agree with what the API returned;
            # concatenation does not transcode between formats.
            requested_ext = output_path.suffix.lower().lstrip(".")
            if len(part_paths) > 1 and requested_ext and requested_ext != part_ext:
                raise DocsToVoiceError(
                    "Output extension .{0} does not match chunk audio format .{1}.".format(
                        requested_ext, part_ext
                    )
                )

            concat_audio_files(part_paths, output_path)

            unknown_sentence_indexes = [
                index
                for index, is_known in enumerate(sentence_duration_known)
                if not is_known
            ]

            if not unknown_sentence_indexes and sum(sentence_durations) > 0:
                # Every sentence was measured directly from its chunk audio.
                timeline_sentence_durations = sentence_durations
                timeline_timing_mode_hint = "sentence-audio"
            elif unknown_sentence_indexes:
                # Some sentences lack measured durations: split the leftover
                # output duration across them, weighted by sentence length.
                output_duration_seconds = read_duration_seconds(output_path)
                known_total = sum(
                    value
                    for index, value in enumerate(sentence_durations)
                    if sentence_duration_known[index]
                )
                remaining_duration = None
                if (
                    output_duration_seconds is not None
                    and output_duration_seconds > known_total
                ):
                    remaining_duration = output_duration_seconds - known_total

                if remaining_duration and remaining_duration > 0:
                    unknown_weights = [
                        sentence_weight(api_sentences[index])
                        for index in unknown_sentence_indexes
                    ]
                    total_unknown_weight = sum(unknown_weights)
                    if total_unknown_weight > 0:
                        for weight_index, sentence_index in enumerate(
                            unknown_sentence_indexes
                        ):
                            sentence_durations[sentence_index] += (
                                remaining_duration
                                * (unknown_weights[weight_index] / total_unknown_weight)
                            )

                timeline_sentence_durations = sentence_durations
                timeline_timing_mode_hint = "sentence-audio-mixed"

    if not output_path.is_file() or output_path.stat().st_size == 0:
        raise DocsToVoiceError("Failed to generate audio file.")

    # Tempo change happens last, so timeline durations are scaled to match.
    if speech_rate is not None and abs(speech_rate - 1.0) > 1e-9:
        apply_speech_rate_to_audio(output_path, speech_rate)
        timeline_sentence_durations = scale_sentence_durations(
            timeline_sentence_durations,
            speech_rate,
        )

    write_sentence_timeline_files(
        source_text=source_text,
        audio_path=output_path,
        sentence_durations=timeline_sentence_durations,
        timing_mode_hint=timeline_timing_mode_hint,
    )
    print(str(output_path))
    return 0
1378
+
1379
+
1380
# Script entry point: translate DocsToVoiceError into a clean one-line
# stderr message and exit status 1 instead of an uncaught traceback.
if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except DocsToVoiceError as exc:
        print("[ERROR] {0}".format(exc), file=sys.stderr)
        raise SystemExit(1)