ssmd 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssmd/__init__.py +189 -0
- ssmd/_version.py +34 -0
- ssmd/capabilities.py +277 -0
- ssmd/document.py +918 -0
- ssmd/formatter.py +244 -0
- ssmd/parser.py +1049 -0
- ssmd/parser_types.py +41 -0
- ssmd/py.typed +0 -0
- ssmd/segment.py +720 -0
- ssmd/sentence.py +270 -0
- ssmd/ssml_conversions.py +124 -0
- ssmd/ssml_parser.py +599 -0
- ssmd/types.py +122 -0
- ssmd/utils.py +333 -0
- ssmd/xsampa_to_ipa.txt +174 -0
- ssmd-0.5.3.dist-info/METADATA +1210 -0
- ssmd-0.5.3.dist-info/RECORD +20 -0
- ssmd-0.5.3.dist-info/WHEEL +5 -0
- ssmd-0.5.3.dist-info/licenses/LICENSE +21 -0
- ssmd-0.5.3.dist-info/top_level.txt +1 -0
ssmd/segment.py
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
"""Segment - A piece of text with SSMD attributes.
|
|
2
|
+
|
|
3
|
+
A Segment represents a portion of text with specific formatting and processing
|
|
4
|
+
attributes. Segments are combined to form sentences.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from ssmd.ssml_conversions import (
|
|
12
|
+
PROSODY_PITCH_MAP as PITCH_MAP,
|
|
13
|
+
)
|
|
14
|
+
from ssmd.ssml_conversions import (
|
|
15
|
+
PROSODY_RATE_MAP as RATE_MAP,
|
|
16
|
+
)
|
|
17
|
+
from ssmd.ssml_conversions import (
|
|
18
|
+
PROSODY_VOLUME_MAP as VOLUME_MAP,
|
|
19
|
+
)
|
|
20
|
+
from ssmd.ssml_conversions import (
|
|
21
|
+
SSMD_BREAK_STRENGTH_MAP,
|
|
22
|
+
)
|
|
23
|
+
from ssmd.ssml_conversions import (
|
|
24
|
+
SSMD_PITCH_SHORTHAND as PITCH_TO_SSMD,
|
|
25
|
+
)
|
|
26
|
+
from ssmd.ssml_conversions import (
|
|
27
|
+
SSMD_RATE_SHORTHAND as RATE_TO_SSMD,
|
|
28
|
+
)
|
|
29
|
+
from ssmd.ssml_conversions import (
|
|
30
|
+
SSMD_VOLUME_SHORTHAND as VOLUME_TO_SSMD,
|
|
31
|
+
)
|
|
32
|
+
from ssmd.types import (
|
|
33
|
+
AudioAttrs,
|
|
34
|
+
BreakAttrs,
|
|
35
|
+
PhonemeAttrs,
|
|
36
|
+
ProsodyAttrs,
|
|
37
|
+
SayAsAttrs,
|
|
38
|
+
VoiceAttrs,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
if TYPE_CHECKING:
|
|
42
|
+
from ssmd.capabilities import TTSCapabilities
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Default locale for each bare 2-letter language code
# (language code -> "language-REGION").
LANGUAGE_DEFAULTS = {
    lang: f"{lang}-{region}"
    for lang, region in (
        ("en", "US"),
        ("de", "DE"),
        ("fr", "FR"),
        ("es", "ES"),
        ("it", "IT"),
        ("pt", "PT"),
        ("ru", "RU"),
        ("zh", "CN"),
        ("ja", "JP"),
        ("ko", "KR"),
        ("ar", "SA"),
        ("hi", "IN"),
        ("nl", "NL"),
        ("pl", "PL"),
        ("sv", "SE"),
        ("da", "DK"),
        ("no", "NO"),
        ("fi", "FI"),
    )
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _amazon_effect(effect_name):
    """Return a handler that wraps content in ``<amazon:effect name=...>``."""

    def wrap(text):
        return f'<amazon:effect name="{effect_name}">{text}</amazon:effect>'

    return wrap


# Built-in extension handlers: extension name -> callable that receives the
# already-rendered SSML content and returns it wrapped in extension markup.
DEFAULT_EXTENSIONS = {
    "whisper": _amazon_effect("whispered"),
    "drc": _amazon_effect("drc"),
}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _escape_xml_attr(value: str) -> str:
|
|
76
|
+
"""Escape a value for use in an XML attribute.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
value: The attribute value to escape
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
Escaped string safe for XML attribute
|
|
83
|
+
"""
|
|
84
|
+
return (
|
|
85
|
+
value.replace("&", "&")
|
|
86
|
+
.replace("<", "<")
|
|
87
|
+
.replace(">", ">")
|
|
88
|
+
.replace('"', """)
|
|
89
|
+
.replace("'", "'")
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _escape_xml_text(value: str) -> str:
|
|
94
|
+
"""Escape a value for use in XML text content.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
value: The text content to escape
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
Escaped string safe for XML text
|
|
101
|
+
"""
|
|
102
|
+
return value.replace("&", "&").replace("<", "<").replace(">", ">")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# X-SAMPA to IPA conversion table (lazy-loaded)
|
|
106
|
+
_XSAMPA_TABLE: dict[str, str] | None = None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _load_xsampa_table() -> dict[str, str]:
|
|
110
|
+
"""Load X-SAMPA to IPA conversion table."""
|
|
111
|
+
global _XSAMPA_TABLE
|
|
112
|
+
if _XSAMPA_TABLE is not None:
|
|
113
|
+
return _XSAMPA_TABLE
|
|
114
|
+
|
|
115
|
+
table = {}
|
|
116
|
+
# Try both old and new locations
|
|
117
|
+
table_paths = [
|
|
118
|
+
Path(__file__).parent / "xsampa_to_ipa.txt",
|
|
119
|
+
Path(__file__).parent / "annotations" / "xsampa_to_ipa.txt",
|
|
120
|
+
]
|
|
121
|
+
|
|
122
|
+
for table_file in table_paths:
|
|
123
|
+
if table_file.exists():
|
|
124
|
+
with open(table_file, encoding="utf-8") as f:
|
|
125
|
+
for line in f:
|
|
126
|
+
line = line.strip()
|
|
127
|
+
if line and not line.startswith("#"):
|
|
128
|
+
parts = line.split(maxsplit=1)
|
|
129
|
+
if len(parts) == 2:
|
|
130
|
+
xsampa, ipa = parts
|
|
131
|
+
table[xsampa] = ipa
|
|
132
|
+
break
|
|
133
|
+
|
|
134
|
+
_XSAMPA_TABLE = table
|
|
135
|
+
return table
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def xsampa_to_ipa(xsampa: str) -> str:
    """Translate an X-SAMPA phoneme string into IPA.

    Every symbol of the conversion table is substituted in turn across the
    whole string, longest symbols first, so that multi-character symbols are
    replaced before any shorter symbol they contain.

    Args:
        xsampa: X-SAMPA phoneme string

    Returns:
        IPA phoneme string
    """
    mapping = _load_xsampa_table()
    longest_first = sorted(
        mapping.items(), key=lambda pair: len(pair[0]), reverse=True
    )

    converted = xsampa
    for symbol, ipa in longest_first:
        converted = converted.replace(symbol, ipa)
    return converted
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def expand_language_code(code: str) -> str:
    """Expand a bare language code to its default full locale.

    Codes listed in ``LANGUAGE_DEFAULTS`` are replaced by their mapped
    locale; anything else (including codes that already carry a region) is
    returned unchanged.

    Args:
        code: Language code (e.g., "en", "en-US")

    Returns:
        Full locale code (e.g., "en-US")
    """
    return LANGUAGE_DEFAULTS.get(code, code)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@dataclass
class Segment:
    """A run of text together with the SSMD attributes that apply to it.

    A segment carries its raw text plus optional styling, text-transformation,
    media and platform-specific settings, and the breaks and marks that
    surround it. It can be rendered as SSML, SSMD or plain text.

    Attributes:
        text: Raw text content
        emphasis: Emphasis level (True/"moderate", "strong", "reduced", "none", False)
        prosody: Volume, rate, pitch settings
        language: Language code for this segment
        voice: Voice settings for this segment
        say_as: Text interpretation hints
        substitution: Replacement text (alias)
        phoneme: Pronunciation (phoneme string and its alphabet)
        audio: Audio file to play
        extension: Platform-specific extension name
        breaks_before: Pauses before this segment
        breaks_after: Pauses after this segment
        marks_before: Event markers before this segment
        marks_after: Event markers after this segment
    """

    # The only required field: the text this segment renders.
    text: str

    # --- Styling ---
    emphasis: bool | str = False  # True/"moderate", "strong", "reduced", "none"
    prosody: ProsodyAttrs | None = None
    language: str | None = None
    voice: VoiceAttrs | None = None

    # --- Text transformation ---
    say_as: SayAsAttrs | None = None
    substitution: str | None = None
    phoneme: PhonemeAttrs | None = None

    # --- Media ---
    audio: AudioAttrs | None = None

    # --- Platform-specific ---
    extension: str | None = None

    # --- Breaks and marks (each instance gets its own fresh lists) ---
    breaks_before: list[BreakAttrs] = field(default_factory=list)
    breaks_after: list[BreakAttrs] = field(default_factory=list)
    marks_before: list[str] = field(default_factory=list)
    marks_after: list[str] = field(default_factory=list)
|
|
221
|
+
|
|
222
|
+
def to_ssml(
|
|
223
|
+
self,
|
|
224
|
+
capabilities: "TTSCapabilities | None" = None,
|
|
225
|
+
extensions: dict | None = None,
|
|
226
|
+
) -> str:
|
|
227
|
+
"""Convert segment to SSML.
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
capabilities: TTS engine capabilities for filtering
|
|
231
|
+
extensions: Custom extension handlers
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
SSML string
|
|
235
|
+
"""
|
|
236
|
+
result = ""
|
|
237
|
+
|
|
238
|
+
# Add marks before
|
|
239
|
+
if not capabilities or capabilities.mark:
|
|
240
|
+
for mark in self.marks_before:
|
|
241
|
+
mark_escaped = _escape_xml_attr(mark)
|
|
242
|
+
result += f'<mark name="{mark_escaped}"/>'
|
|
243
|
+
|
|
244
|
+
# Add breaks before
|
|
245
|
+
if not capabilities or capabilities.break_tags:
|
|
246
|
+
for brk in self.breaks_before:
|
|
247
|
+
result += self._break_to_ssml(brk)
|
|
248
|
+
|
|
249
|
+
# Build content with wrappers
|
|
250
|
+
content = self._build_content_ssml(capabilities, extensions)
|
|
251
|
+
result += content
|
|
252
|
+
|
|
253
|
+
# Add breaks after
|
|
254
|
+
if not capabilities or capabilities.break_tags:
|
|
255
|
+
for brk in self.breaks_after:
|
|
256
|
+
result += self._break_to_ssml(brk)
|
|
257
|
+
|
|
258
|
+
# Add marks after
|
|
259
|
+
if not capabilities or capabilities.mark:
|
|
260
|
+
for mark in self.marks_after:
|
|
261
|
+
mark_escaped = _escape_xml_attr(mark)
|
|
262
|
+
result += f'<mark name="{mark_escaped}"/>'
|
|
263
|
+
|
|
264
|
+
return result
|
|
265
|
+
|
|
266
|
+
def _build_content_ssml(
    self,
    capabilities: "TTSCapabilities | None",
    extensions: dict | None,
) -> str:
    """Build the main content SSML with all wrappers.

    Audio segments are rendered through ``_audio_to_ssml`` (or fall back to
    the escaped text when audio is unsupported) and skip every other
    wrapper. Otherwise the escaped text is wrapped, innermost first, in:
    one of substitution / phoneme / say-as, then emphasis, prosody,
    language, voice and finally the extension handler. Each wrapper except
    voice and extension is skipped when ``capabilities`` is given and
    reports the feature as unsupported.

    Args:
        capabilities: TTS capabilities for filtering
        extensions: Custom extension handlers

    Returns:
        SSML content string
    """
    # Audio replaces the text entirely; the text only serves as description.
    if self.audio:
        if capabilities and not capabilities.audio:
            return _escape_xml_text(self.text)  # Fallback to description
        return self._audio_to_ssml(self.audio)

    content = _escape_xml_text(self.text)

    # Substitution, phoneme and say-as are mutually exclusive: the first one
    # that is set wins, even if the capabilities then filter it out.
    if self.substitution:
        if not capabilities or capabilities.substitution:
            alias = _escape_xml_attr(self.substitution)
            content = f'<sub alias="{alias}">{content}</sub>'
    elif self.phoneme:
        if not capabilities or capabilities.phoneme:
            ph = self.phoneme.ph
            # The emitted alphabet is always "ipa", so X-SAMPA input is
            # converted first.
            if self.phoneme.alphabet.lower() in ("x-sampa", "sampa"):
                ph = xsampa_to_ipa(ph)
            ph = _escape_xml_attr(ph)
            content = f'<phoneme alphabet="ipa" ph="{ph}">{content}</phoneme>'
    elif self.say_as:
        if not capabilities or capabilities.say_as:
            content = self._say_as_to_ssml(self.say_as, content)

    if self.emphasis and (not capabilities or capabilities.emphasis):
        content = self._emphasis_to_ssml(content)

    if self.prosody and (not capabilities or capabilities.prosody):
        content = self._prosody_to_ssml(self.prosody, content, capabilities)

    if self.language and (not capabilities or capabilities.language):
        # Escaped like every other attribute value so an unusual language
        # string cannot break out of the attribute.
        lang = _escape_xml_attr(expand_language_code(self.language))
        content = f'<lang xml:lang="{lang}">{content}</lang>'

    # Voice has no capability switch here and is always emitted.
    if self.voice:
        content = self._voice_to_ssml(self.voice, content)

    if self.extension:
        # Caller-supplied handlers override the built-in ones of the same name.
        ext_handlers = {**DEFAULT_EXTENSIONS, **(extensions or {})}
        handler = ext_handlers.get(self.extension)
        if handler:
            content = handler(content)

    return content
|
|
339
|
+
|
|
340
|
+
def _emphasis_to_ssml(self, content: str) -> str:
|
|
341
|
+
"""Convert emphasis to SSML."""
|
|
342
|
+
if self.emphasis is True or self.emphasis == "moderate":
|
|
343
|
+
return f"<emphasis>{content}</emphasis>"
|
|
344
|
+
elif self.emphasis == "strong":
|
|
345
|
+
return f'<emphasis level="strong">{content}</emphasis>'
|
|
346
|
+
elif self.emphasis == "reduced":
|
|
347
|
+
return f'<emphasis level="reduced">{content}</emphasis>'
|
|
348
|
+
elif self.emphasis == "none":
|
|
349
|
+
return f'<emphasis level="none">{content}</emphasis>'
|
|
350
|
+
return content
|
|
351
|
+
|
|
352
|
+
def _prosody_to_ssml(
    self,
    prosody: ProsodyAttrs,
    content: str,
    capabilities: "TTSCapabilities | None",
) -> str:
    """Wrap ``content`` in a ``<prosody>`` element.

    Volume, rate and pitch are emitted in that order. Each value is first
    looked up in its conversion map (values not in the map pass through
    unchanged) and then attribute-escaped. An attribute is skipped when it
    is unset or when ``capabilities`` reports the matching ``prosody_*``
    feature as unsupported; with no attribute left, ``content`` is returned
    unwrapped.
    """
    lookup_tables = (
        ("volume", VOLUME_MAP),
        ("rate", RATE_MAP),
        ("pitch", PITCH_MAP),
    )

    rendered = []
    for name, named_values in lookup_tables:
        raw = getattr(prosody, name)
        if not raw:
            continue
        if capabilities and not getattr(capabilities, f"prosody_{name}"):
            continue
        value = _escape_xml_attr(named_values.get(raw, raw))
        rendered.append(f'{name}="{value}"')

    if not rendered:
        return content
    return f"<prosody {' '.join(rendered)}>{content}</prosody>"
|
|
380
|
+
|
|
381
|
+
def _voice_to_ssml(self, voice: VoiceAttrs, content: str) -> str:
    """Wrap ``content`` in a ``<voice>`` element.

    A voice name takes precedence: when it is set, only the ``name``
    attribute is written. Otherwise whichever of language, gender and
    variant are set are written, in that order. With nothing set,
    ``content`` is returned unwrapped.
    """
    if voice.name:
        selectors = [("name", voice.name)]
    else:
        selectors = []
        if voice.language:
            selectors.append(("language", voice.language))
        if voice.gender:
            selectors.append(("gender", voice.gender))
        if voice.variant:
            # The variant is stringified before escaping.
            selectors.append(("variant", str(voice.variant)))

    if not selectors:
        return content

    rendered = " ".join(
        f'{key}="{_escape_xml_attr(value)}"' for key, value in selectors
    )
    return f"<voice {rendered}>{content}</voice>"
|
|
402
|
+
|
|
403
|
+
def _say_as_to_ssml(self, say_as: SayAsAttrs, content: str) -> str:
    """Wrap ``content`` in a ``<say-as>`` element.

    ``interpret-as`` is always written; ``format`` and ``detail`` follow
    only when they are set. All values are attribute-escaped.
    """
    pairs = [("interpret-as", say_as.interpret_as)]
    if say_as.format:
        pairs.append(("format", say_as.format))
    if say_as.detail:
        # The detail value is stringified before escaping.
        pairs.append(("detail", str(say_as.detail)))

    rendered = " ".join(
        f'{key}="{_escape_xml_attr(value)}"' for key, value in pairs
    )
    return f"<say-as {rendered}>{content}</say-as>"
|
|
416
|
+
|
|
417
|
+
def _audio_to_ssml(self, audio: AudioAttrs) -> str:
    """Render an ``<audio>`` element for this segment.

    ``src`` is always written; the optional playback attributes follow only
    when they are set. The segment text, if any, becomes a ``<desc>`` child
    and ``audio.alt_text``, if any, becomes the fallback text content.

    Args:
        audio: Audio settings to render

    Returns:
        SSML ``<audio>`` element
    """
    attrs = [f'src="{_escape_xml_attr(audio.src)}"']

    optional = (
        ("clipBegin", audio.clip_begin),
        ("clipEnd", audio.clip_end),
        ("speed", audio.speed),
        ("repeatCount", audio.repeat_count),
        ("repeatDur", audio.repeat_dur),
        ("soundLevel", audio.sound_level),
    )
    for xml_name, value in optional:
        if value:
            # str() is a no-op for strings and covers a numeric repeat count.
            attrs.append(f'{xml_name}="{_escape_xml_attr(str(value))}"')

    # The description is element text, so it must be escaped like any other
    # text node; raw "<" or "&" here would produce malformed SSML.
    desc = f"<desc>{_escape_xml_text(self.text)}</desc>" if self.text else ""
    alt = _escape_xml_text(audio.alt_text) if audio.alt_text else ""

    return f"<audio {' '.join(attrs)}>{desc}{alt}</audio>"
|
|
445
|
+
|
|
446
|
+
def _break_to_ssml(self, brk: BreakAttrs) -> str:
    """Render a ``<break/>`` element.

    An explicit time wins over a strength; when neither is set a bare
    ``<break/>`` is returned.
    """
    for attr in ("time", "strength"):
        value = getattr(brk, attr)
        if value:
            return f'<break {attr}="{_escape_xml_attr(value)}"/>'
    return "<break/>"
|
|
455
|
+
|
|
456
|
+
def to_ssmd(self) -> str:
    """Render this segment as SSMD markdown.

    Marks (``@name``) and breaks are placed around the content in the order
    marks-before, breaks-before, content, breaks-after, marks-after, each
    separated from its neighbour by a single space.

    Returns:
        SSMD string
    """
    pieces = [f"@{name} " for name in self.marks_before]
    pieces.extend(self._break_to_ssmd(pause) + " " for pause in self.breaks_before)

    pieces.append(self._build_content_ssmd())

    pieces.extend(" " + self._break_to_ssmd(pause) for pause in self.breaks_after)
    pieces.extend(f" @{name}" for name in self.marks_after)

    return "".join(pieces)
|
|
485
|
+
|
|
486
|
+
def _build_content_ssmd(self) -> str: # noqa: C901
|
|
487
|
+
"""Build SSMD content with markup."""
|
|
488
|
+
text = self.text
|
|
489
|
+
|
|
490
|
+
# Handle audio
|
|
491
|
+
if self.audio:
|
|
492
|
+
return self._audio_to_ssmd(self.audio)
|
|
493
|
+
|
|
494
|
+
# Collect annotations
|
|
495
|
+
annotations = []
|
|
496
|
+
|
|
497
|
+
# Language
|
|
498
|
+
if self.language:
|
|
499
|
+
annotations.append(self.language)
|
|
500
|
+
|
|
501
|
+
# Voice
|
|
502
|
+
if self.voice:
|
|
503
|
+
voice_str = self._voice_to_ssmd_annotation(self.voice)
|
|
504
|
+
if voice_str:
|
|
505
|
+
annotations.append(voice_str)
|
|
506
|
+
|
|
507
|
+
# Say-as
|
|
508
|
+
if self.say_as:
|
|
509
|
+
sa_str = f"as: {self.say_as.interpret_as}"
|
|
510
|
+
if self.say_as.format:
|
|
511
|
+
sa_str += f', format: "{self.say_as.format}"'
|
|
512
|
+
if self.say_as.detail:
|
|
513
|
+
sa_str += f", detail: {self.say_as.detail}"
|
|
514
|
+
annotations.append(sa_str)
|
|
515
|
+
|
|
516
|
+
# Substitution
|
|
517
|
+
if self.substitution:
|
|
518
|
+
annotations.append(f"sub: {self.substitution}")
|
|
519
|
+
|
|
520
|
+
# Phoneme - include alphabet
|
|
521
|
+
if self.phoneme:
|
|
522
|
+
annotations.append(
|
|
523
|
+
f"ph: {self.phoneme.ph}, alphabet: {self.phoneme.alphabet}"
|
|
524
|
+
)
|
|
525
|
+
|
|
526
|
+
# Extension
|
|
527
|
+
if self.extension:
|
|
528
|
+
annotations.append(f"ext: {self.extension}")
|
|
529
|
+
|
|
530
|
+
# Determine if we can use prosody shorthand
|
|
531
|
+
# Shorthand is only used when: single prosody attr AND no other annotations
|
|
532
|
+
use_prosody_shorthand = False
|
|
533
|
+
if self.prosody and not annotations:
|
|
534
|
+
# Check if only one prosody attribute is set
|
|
535
|
+
attrs_set = sum(
|
|
536
|
+
[
|
|
537
|
+
1 if self.prosody.volume else 0,
|
|
538
|
+
1 if self.prosody.rate else 0,
|
|
539
|
+
1 if self.prosody.pitch else 0,
|
|
540
|
+
]
|
|
541
|
+
)
|
|
542
|
+
if attrs_set == 1:
|
|
543
|
+
# Check if the value has a shorthand
|
|
544
|
+
if self.prosody.volume and self.prosody.volume in VOLUME_TO_SSMD:
|
|
545
|
+
use_prosody_shorthand = True
|
|
546
|
+
elif self.prosody.rate and self.prosody.rate in RATE_TO_SSMD:
|
|
547
|
+
use_prosody_shorthand = True
|
|
548
|
+
elif self.prosody.pitch and self.prosody.pitch in PITCH_TO_SSMD:
|
|
549
|
+
use_prosody_shorthand = True
|
|
550
|
+
|
|
551
|
+
# Add prosody to annotations if not using shorthand
|
|
552
|
+
if self.prosody and not use_prosody_shorthand:
|
|
553
|
+
prosody_str = self._prosody_to_ssmd_annotation(self.prosody)
|
|
554
|
+
if prosody_str:
|
|
555
|
+
annotations.append(prosody_str)
|
|
556
|
+
|
|
557
|
+
# Apply emphasis shorthand or include in annotations
|
|
558
|
+
if self.emphasis:
|
|
559
|
+
if annotations:
|
|
560
|
+
# Use annotation form
|
|
561
|
+
if self.emphasis == "none":
|
|
562
|
+
annotations.append("emphasis: none")
|
|
563
|
+
# Other emphasis levels handled by shorthand below
|
|
564
|
+
else:
|
|
565
|
+
# Use shorthand
|
|
566
|
+
if self.emphasis is True or self.emphasis == "moderate":
|
|
567
|
+
text = f"*{text}*"
|
|
568
|
+
elif self.emphasis == "strong":
|
|
569
|
+
text = f"**{text}**"
|
|
570
|
+
elif self.emphasis == "reduced":
|
|
571
|
+
text = f"_{text}_"
|
|
572
|
+
elif self.emphasis == "none":
|
|
573
|
+
annotations.append("emphasis: none")
|
|
574
|
+
|
|
575
|
+
# If we have annotations, wrap in [text](annotations)
|
|
576
|
+
if annotations:
|
|
577
|
+
# If we also have emphasis shorthand, wrap the emphasized text
|
|
578
|
+
if (
|
|
579
|
+
self.emphasis
|
|
580
|
+
and self.emphasis != "none"
|
|
581
|
+
and not any("emphasis:" in a for a in annotations)
|
|
582
|
+
):
|
|
583
|
+
if self.emphasis is True or self.emphasis == "moderate":
|
|
584
|
+
text = f"*{text}*"
|
|
585
|
+
elif self.emphasis == "strong":
|
|
586
|
+
text = f"**{text}**"
|
|
587
|
+
elif self.emphasis == "reduced":
|
|
588
|
+
text = f"_{text}_"
|
|
589
|
+
return f"[{text}]({', '.join(annotations)})"
|
|
590
|
+
|
|
591
|
+
# Apply prosody shorthand if no annotations
|
|
592
|
+
if use_prosody_shorthand and self.prosody:
|
|
593
|
+
text = self._apply_prosody_shorthand(self.prosody, text)
|
|
594
|
+
|
|
595
|
+
return text
|
|
596
|
+
|
|
597
|
+
def _prosody_to_ssmd_annotation(self, prosody: ProsodyAttrs) -> str:
    """Format prosody settings as an SSMD annotation string.

    Produces comma-separated ``v:``, ``r:`` and ``p:`` entries for the
    volume, rate and pitch that are set. Values that look relative or
    unit-qualified (see the predicates below) are written verbatim; other
    values are translated through the reversed conversion map, falling back
    to the value itself when it is not in the map.

    Returns:
        Annotation string, empty when no attribute is set
    """
    specs = (
        (
            "v",
            prosody.volume,
            VOLUME_MAP,
            lambda s: s.startswith(("+", "-")) or s.endswith("dB"),
        ),
        ("r", prosody.rate, RATE_MAP, lambda s: s.endswith("%")),
        (
            "p",
            prosody.pitch,
            PITCH_MAP,
            lambda s: s.startswith(("+", "-")) or s.endswith("%"),
        ),
    )

    entries = []
    for key, value, forward_map, is_verbatim in specs:
        if not value:
            continue
        if not is_verbatim(value):
            # Reverse the map (mapped value -> original key) for the lookup.
            reverse_map = {mapped: original for original, mapped in forward_map.items()}
            value = reverse_map.get(value, value)
        entries.append(f"{key}: {value}")

    return ", ".join(entries)
|
|
628
|
+
|
|
629
|
+
def _apply_prosody_shorthand(self, prosody: ProsodyAttrs, text: str) -> str:
    """Wrap ``text`` in the SSMD shorthand markers for a single prosody value.

    Shorthand only exists for one attribute at a time. When zero or several
    of volume/rate/pitch are set, the annotation form ``[text](...)`` is
    used instead (or ``text`` is returned as-is if there is nothing to
    annotate). When the single set value has no shorthand, ``text`` is
    returned unchanged.
    """
    candidates = (
        (prosody.volume, VOLUME_TO_SSMD),
        (prosody.rate, RATE_TO_SSMD),
        (prosody.pitch, PITCH_TO_SSMD),
    )
    active = [(value, table) for value, table in candidates if value]

    if len(active) != 1:
        # Not exactly one attribute: fall back to the annotation form.
        ann = self._prosody_to_ssmd_annotation(prosody)
        return f"[{text}]({ann})" if ann else text

    value, table = active[0]
    markers = table.get(value)
    if not markers:
        return text
    return f"{markers[0]}{text}{markers[1]}"
|
|
663
|
+
|
|
664
|
+
def _voice_to_ssmd_annotation(self, voice: VoiceAttrs) -> str:
    """Format voice settings as an SSMD annotation string.

    A voice name takes precedence and yields ``voice: <name>`` alone.
    Otherwise the set values among language (written as ``voice:``), gender
    and variant are joined with commas; the result is empty when none is set.
    """
    if voice.name:
        return f"voice: {voice.name}"

    labelled = (
        ("voice", voice.language),
        ("gender", voice.gender),
        ("variant", voice.variant),
    )
    return ", ".join(f"{label}: {value}" for label, value in labelled if value)
|
|
677
|
+
|
|
678
|
+
def _audio_to_ssmd(self, audio: AudioAttrs) -> str:
    """Render this audio segment as SSMD: ``[description](src attrs... alt)``.

    The source comes first, followed by the attributes that are set and
    finally the alt text, all separated by single spaces. The clip range is
    written only when both its begin and end are set. The segment text is
    used as the description and may be empty.
    """
    optional = (
        (
            audio.clip_begin and audio.clip_end,
            f"clip: {audio.clip_begin}-{audio.clip_end}",
        ),
        (audio.speed, f"speed: {audio.speed}"),
        (audio.repeat_count, f"repeat: {audio.repeat_count}"),
        (audio.repeat_dur, f"repeatDur: {audio.repeat_dur}"),
        (audio.sound_level, f"level: {audio.sound_level}"),
        (audio.alt_text, audio.alt_text),
    )

    tokens = [audio.src]
    tokens += [rendered for present, rendered in optional if present]

    return f"[{self.text}]({' '.join(tokens)})"
|
|
701
|
+
|
|
702
|
+
def _break_to_ssmd(self, brk: BreakAttrs) -> str:
    """Render a break in SSMD notation.

    An explicit time is written as ``...<time>``. Otherwise the strength is
    looked up in ``SSMD_BREAK_STRENGTH_MAP``; an unknown or unset strength
    yields ``"...s"``.
    """
    if not brk.time:
        if brk.strength:
            return SSMD_BREAK_STRENGTH_MAP.get(brk.strength, "...s")
        return "...s"
    return f"...{brk.time}"
|
|
709
|
+
|
|
710
|
+
def to_text(self) -> str:
    """Render this segment as plain text.

    Audio segments yield their text (the description). Otherwise a
    substitution, when set, is returned in place of the text because it is
    the spoken alias.

    Returns:
        Plain text with all markup removed
    """
    if not self.audio and self.substitution:
        return self.substitution
    return self.text
|