ssmd 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssmd/__init__.py +189 -0
- ssmd/_version.py +34 -0
- ssmd/capabilities.py +277 -0
- ssmd/document.py +918 -0
- ssmd/formatter.py +244 -0
- ssmd/parser.py +1049 -0
- ssmd/parser_types.py +41 -0
- ssmd/py.typed +0 -0
- ssmd/segment.py +720 -0
- ssmd/sentence.py +270 -0
- ssmd/ssml_conversions.py +124 -0
- ssmd/ssml_parser.py +599 -0
- ssmd/types.py +122 -0
- ssmd/utils.py +333 -0
- ssmd/xsampa_to_ipa.txt +174 -0
- ssmd-0.5.3.dist-info/METADATA +1210 -0
- ssmd-0.5.3.dist-info/RECORD +20 -0
- ssmd-0.5.3.dist-info/WHEEL +5 -0
- ssmd-0.5.3.dist-info/licenses/LICENSE +21 -0
- ssmd-0.5.3.dist-info/top_level.txt +1 -0
ssmd/parser.py
ADDED
|
@@ -0,0 +1,1049 @@
|
|
|
1
|
+
"""SSMD parser - Parse SSMD text into structured Sentence/Segment objects.
|
|
2
|
+
|
|
3
|
+
This module provides functions to parse SSMD markdown into structured data
|
|
4
|
+
that can be used for TTS processing or conversion to SSML.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from ssmd.segment import Segment
|
|
11
|
+
from ssmd.sentence import Sentence
|
|
12
|
+
from ssmd.ssml_conversions import (
|
|
13
|
+
PROSODY_PITCH_MAP,
|
|
14
|
+
PROSODY_RATE_MAP,
|
|
15
|
+
PROSODY_VOLUME_MAP,
|
|
16
|
+
SSMD_BREAK_MARKER_TO_STRENGTH,
|
|
17
|
+
)
|
|
18
|
+
from ssmd.types import (
|
|
19
|
+
DEFAULT_HEADING_LEVELS,
|
|
20
|
+
AudioAttrs,
|
|
21
|
+
BreakAttrs,
|
|
22
|
+
PhonemeAttrs,
|
|
23
|
+
ProsodyAttrs,
|
|
24
|
+
SayAsAttrs,
|
|
25
|
+
VoiceAttrs,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from ssmd.capabilities import TTSCapabilities
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ═══════════════════════════════════════════════════════════════════════════════
# REGEX PATTERNS
# ═══════════════════════════════════════════════════════════════════════════════

# Voice directive: @voice: name or @voice(name)
# Supports: name, language code, gender:, variant:, language:
# group(1) is the raw parameter string consumed by _parse_voice_params().
VOICE_DIRECTIVE_PATTERN = re.compile(
    r"^@voice(?::\s*|\()"
    r"([a-zA-Z0-9_-]+(?:\s*,\s*(?:gender|variant|language):\s*[a-zA-Z0-9_-]+)*)"
    r"\)?\s*$",
    re.MULTILINE,
)

# Emphasis patterns: **strong**, *moderate*, _reduced_.
# The lookarounds on the reduced pattern keep a single underscore from
# matching inside double-underscore (__x-low__) pitch markup.
STRONG_EMPHASIS_PATTERN = re.compile(r"\*\*([^\*]+)\*\*")
MODERATE_EMPHASIS_PATTERN = re.compile(r"\*([^\*]+)\*")
REDUCED_EMPHASIS_PATTERN = re.compile(r"(?<!_)_(?!_)([^_]+?)(?<!_)_(?!_)")

# Annotation pattern: [text](annotation)
ANNOTATION_PATTERN = re.compile(r"\[([^\]]*)\]\(([^\)]+)\)")

# Break pattern: ...500ms, ...2s, ...n, ...w, ...c, ...s, ...p
BREAK_PATTERN = re.compile(r"\.\.\.(\d+(?:s|ms)|[nwcsp])")

# Mark pattern: @name (but not @voice, which is a directive)
MARK_PATTERN = re.compile(r"@(?!voice[:(])(\w+)")

# Heading pattern: # ## ### (up to six levels)
HEADING_PATTERN = re.compile(r"^\s*(#{1,6})\s*(.+)$", re.MULTILINE)

# Prosody shorthand patterns (applied after XML escaping, but we handle raw here)
# In each pattern the backreference \1 requires the closing marker to be
# identical to the opening one.
PROSODY_VOLUME_PATTERN = re.compile(
    r"(?<![a-zA-Z0-9])"
    r"(~~|--|\+\+|-(?!-)|(?<!\+)\+|~)"  # Volume markers
    r"([^~\-+<>_^]+?)"
    r"\1"
    r"(?![a-zA-Z0-9])"
)

PROSODY_RATE_PATTERN = re.compile(
    r"(?<![a-zA-Z0-9])"
    r"(<<|<(?!<)|(?<!>)>|>>)"  # Rate markers
    r"([^<>]+?)"
    r"\1"
    r"(?![a-zA-Z0-9])"
)

PROSODY_PITCH_PATTERN = re.compile(
    r"(?<![a-zA-Z0-9_])"
    r"(__|\^\^|(?<!_)_(?!_)|(?<!\^)\^(?!\^))"  # Pitch markers
    r"([^_^]+?)"
    r"\1"
    r"(?![a-zA-Z0-9_])"
)

# Paragraph break: two or more newlines
PARAGRAPH_PATTERN = re.compile(r"\n\n+")

# Space before punctuation (normalized away by _normalize_text)
SPACE_BEFORE_PUNCT = re.compile(r"\s+([.!?,:;])")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
95
|
+
# MAIN PARSING FUNCTIONS
|
|
96
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _normalize_text(text: str) -> str:
    """Tidy whitespace: no space before punctuation, single spaces only.

    Two passes: first glue punctuation (.!?,:;) back onto the preceding
    word, then collapse every whitespace run to one space and trim both
    ends.
    """
    glued = SPACE_BEFORE_PUNCT.sub(r"\1", text)
    return re.sub(r"\s+", " ", glued).strip()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def parse_ssmd(
    text: str,
    *,
    capabilities: "TTSCapabilities | str | None" = None,
    heading_levels: dict | None = None,
    extensions: dict | None = None,
    sentence_detection: bool = True,
    language: str = "en",
    use_spacy: bool | None = None,
    model_size: str | None = None,
) -> list[Sentence]:
    """Parse SSMD text into a list of Sentences.

    This is the main parsing function. It handles:
    - Voice directives (@voice: name)
    - Paragraph and sentence splitting
    - All SSMD markup (emphasis, annotations, breaks, etc.)

    Args:
        text: SSMD markdown text
        capabilities: TTS capabilities for filtering (optional); a string
            is resolved to a preset via ``ssmd.capabilities.get_preset``
        heading_levels: Custom heading configurations
        extensions: Custom extension handlers
        sentence_detection: If True, split text into sentences
        language: Default language for sentence detection
        use_spacy: If True, use spaCy for sentence detection
        model_size: spaCy model size ("sm", "md", "lg")

    Returns:
        List of Sentence objects; empty list for empty/whitespace input.
    """
    if not text or not text.strip():
        return []

    # Resolve capabilities (string preset name -> TTSCapabilities object)
    caps = _resolve_capabilities(capabilities)

    # Split text into voice blocks; each block carries its @voice settings
    voice_blocks = _split_voice_blocks(text)

    sentences = []

    for voice, block_text in voice_blocks:
        # Split block into paragraphs (two or more consecutive newlines)
        paragraphs = PARAGRAPH_PATTERN.split(block_text)

        for para_idx, paragraph in enumerate(paragraphs):
            paragraph = paragraph.strip()
            if not paragraph:
                continue

            is_last_paragraph = para_idx == len(paragraphs) - 1

            # Split paragraph into sentences if enabled
            if sentence_detection:
                sent_texts = _split_sentences(
                    paragraph,
                    language=language,
                    use_spacy=use_spacy,
                    model_size=model_size,
                )
            else:
                sent_texts = [paragraph]

            for sent_idx, sent_text in enumerate(sent_texts):
                sent_text = sent_text.strip()
                if not sent_text:
                    continue

                is_last_sent_in_para = sent_idx == len(sent_texts) - 1

                # Parse the sentence content into segments
                segments = _parse_segments(
                    sent_text,
                    capabilities=caps,
                    heading_levels=heading_levels,
                    extensions=extensions,
                )

                if segments:
                    sentence = Sentence(
                        segments=segments,
                        voice=voice,
                        # Mark a paragraph boundary only when another
                        # paragraph follows (the final one needs no break).
                        is_paragraph_end=is_last_sent_in_para and not is_last_paragraph,
                    )
                    sentences.append(sentence)

    return sentences
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _resolve_capabilities(
|
|
201
|
+
capabilities: "TTSCapabilities | str | None",
|
|
202
|
+
) -> "TTSCapabilities | None":
|
|
203
|
+
"""Resolve capabilities from string or object."""
|
|
204
|
+
if capabilities is None:
|
|
205
|
+
return None
|
|
206
|
+
if isinstance(capabilities, str):
|
|
207
|
+
from ssmd.capabilities import get_preset
|
|
208
|
+
|
|
209
|
+
return get_preset(capabilities)
|
|
210
|
+
return capabilities
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _split_voice_blocks(text: str) -> list[tuple[VoiceAttrs | None, str]]:
    """Split text into voice blocks.

    A line matching VOICE_DIRECTIVE_PATTERN starts a new block; every
    following line belongs to that voice until the next directive. Text
    before the first directive gets voice None.

    Args:
        text: SSMD text

    Returns:
        List of (voice, text) tuples
    """
    blocks: list[tuple[VoiceAttrs | None, str]] = []
    lines = text.split("\n")

    current_voice: VoiceAttrs | None = None
    current_lines: list[str] = []

    for line in lines:
        # Check if this line is a voice directive
        match = VOICE_DIRECTIVE_PATTERN.match(line)
        if match:
            # Save previous block if any (skip whitespace-only blocks)
            if current_lines:
                block_text = "\n".join(current_lines)
                if block_text.strip():
                    blocks.append((current_voice, block_text))
                current_lines = []

            # Parse new voice from the captured parameter string
            params = match.group(1)
            current_voice = _parse_voice_params(params)
        else:
            current_lines.append(line)

    # Save final block
    if current_lines:
        block_text = "\n".join(current_lines)
        if block_text.strip():
            blocks.append((current_voice, block_text))

    # If no blocks, return entire text with no voice
    if not blocks and text.strip():
        blocks.append((None, text.strip()))

    return blocks
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _parse_voice_params(params: str) -> VoiceAttrs:
    """Parse voice parameters from a @voice directive string.

    The first comma-free token is either a voice name or a language code
    (two lowercase letters, optionally "-XX" region); the remainder may
    contain ``gender:``, ``variant:`` and ``language:`` key/value pairs.
    """
    voice = VoiceAttrs()

    has_gender = "gender:" in params
    has_variant = "variant:" in params
    has_language = "language:" in params

    # Extract voice name or language code (first value before any comma)
    voice_match = re.match(r"([a-zA-Z0-9_-]+)", params)
    if voice_match:
        value = voice_match.group(1)
        # If explicit language: is provided, or gender/variant present
        # with language-like
        # first value, or looks like language code, treat first value as language
        if (has_gender or has_variant) and not has_language:
            # Pattern like "@voice: fr-FR, gender: female" - first value is language
            if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", value):
                voice.language = value
            else:
                voice.name = value
        elif has_language:
            # Explicit language: provided, so first value is the name
            voice.name = value
        elif re.match(r"^[a-z]{2}(-[A-Z]{2})?$", value):
            # Looks like a language code
            voice.language = value
        else:
            # Just a name
            voice.name = value

    # Parse explicit language: parameter (overrides any inferred language)
    lang_match = re.search(r"language:\s*([a-zA-Z0-9_-]+)", params, re.IGNORECASE)
    if lang_match:
        voice.language = lang_match.group(1)

    # Parse gender (restricted to the three supported values)
    gender_match = re.search(r"gender:\s*(male|female|neutral)", params, re.IGNORECASE)
    if gender_match:
        voice.gender = gender_match.group(1).lower()  # type: ignore

    # Parse variant (numeric index)
    variant_match = re.search(r"variant:\s*(\d+)", params)
    if variant_match:
        voice.variant = int(variant_match.group(1))

    return voice
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _split_sentences(
    text: str,
    language: str = "en",
    use_spacy: bool | None = None,
    model_size: str | None = None,
) -> list[str]:
    """Split text into sentences using phrasplit.

    Falls back to a simple regex splitter when phrasplit is not
    installed.
    """
    try:
        from phrasplit import split_text

        # Build model name from language + size (defaults to "sm")
        size = model_size or "sm"
        lang_code = language.split("-")[0] if "-" in language else language

        # Language-specific model patterns: some spaCy languages ship
        # "core_web" models, the rest "core_news".
        web_langs = {
            "en",
            "zh",
        }
        if lang_code in web_langs:
            model = f"{lang_code}_core_web_{size}"
        else:
            model = f"{lang_code}_core_news_{size}"

        segments = split_text(
            text,
            mode="sentence",
            language_model=model,
            apply_corrections=True,
            split_on_colon=True,
            use_spacy=use_spacy,
        )

        # Group segments by sentence id: phrasplit may emit several
        # segments per sentence; concatenate them until the id changes.
        sentences = []
        current = ""
        last_sent_id = None

        for seg in segments:
            if last_sent_id is not None and seg.sentence != last_sent_id:
                if current.strip():
                    sentences.append(current)
                current = ""
            current += seg.text
            last_sent_id = seg.sentence

        if current.strip():
            sentences.append(current)

        # Never return an empty list for non-empty input
        return sentences if sentences else [text]

    except ImportError:
        # Fallback: simple sentence splitting
        return _simple_sentence_split(text)
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _simple_sentence_split(text: str) -> list[str]:
|
|
364
|
+
"""Simple regex-based sentence splitting."""
|
|
365
|
+
# Split on sentence-ending punctuation followed by space or newline
|
|
366
|
+
parts = re.split(r"(?<=[.!?])\s+", text)
|
|
367
|
+
return [p.strip() for p in parts if p.strip()]
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _parse_segments(  # noqa: C901
    text: str,
    capabilities: "TTSCapabilities | None" = None,
    heading_levels: dict | None = None,
    extensions: dict | None = None,
) -> list[Segment]:
    """Parse text into segments with SSMD features.

    Scans the text once with a combined regex; plain text between markup
    tokens becomes bare segments, and each markup token is delegated to
    _handle_markup.
    """
    # Check for heading (a heading line becomes a single styled segment)
    heading_match = HEADING_PATTERN.match(text)
    if heading_match:
        return _parse_heading(heading_match, heading_levels or DEFAULT_HEADING_LEVELS)

    segments: list[Segment] = []
    position = 0

    # Build combined pattern for all markup
    # Order matters: longer patterns first (e.g. ** before *, << before <)
    combined = re.compile(
        r"("
        r"\*\*[^\*]+\*\*"  # **strong**
        r"|\*[^\*]+\*"  # *moderate*
        r"|(?<![_a-zA-Z0-9])_(?!_)[^_]+?(?<!_)_(?![_a-zA-Z0-9])"  # _reduced_
        r"|\[[^\]]*\]\([^\)]+\)"  # [text](annotation)
        r"|\.\.\.(?:\d+(?:s|ms)|[nwcsp])"  # breaks
        r"|@(?!voice[:(])\w+"  # marks
        r"|~~[^~]+~~"  # ~silent~
        r"|--[^-]+--"  # --x-soft--
        r"|\+\+[^+]+\+\+"  # ++x-loud++
        r"|(?<![a-zA-Z0-9+])\+[^+]+\+(?![a-zA-Z0-9+])"  # +loud+
        r"|(?<![a-zA-Z0-9-])-[^-]+-(?![a-zA-Z0-9-])"  # -soft-
        r"|<<[^<>]+<<"  # <<x-slow<<
        r"|(?<![<a-zA-Z0-9])<[^<>]+<(?![<a-zA-Z0-9])"  # <slow<
        r"|>>[^<>]+>>"  # >>x-fast>>
        r"|(?<![>a-zA-Z0-9])>[^<>]+>(?![>a-zA-Z0-9])"  # >fast>
        r"|__[^_]+__"  # __x-low__
        r"|\^\^[^^]+\^\^"  # ^^x-high^^
        r"|(?<![a-zA-Z0-9^])\^[^^]+\^(?![a-zA-Z0-9^])"  # ^high^
        r")"
    )

    # Breaks/marks seen before any segment exists are queued here and
    # attached to the next segment created.
    pending_breaks: list[BreakAttrs] = []
    pending_marks: list[str] = []

    for match in combined.finditer(text):
        # Emit plain text between the previous token and this one
        if match.start() > position:
            plain = _normalize_text(text[position : match.start()])
            if plain:
                seg = Segment(text=plain)
                if pending_breaks:
                    seg.breaks_before = pending_breaks
                    pending_breaks = []
                if pending_marks:
                    seg.marks_before = pending_marks
                    pending_marks = []
                segments.append(seg)

        markup = match.group(0)
        pending_breaks, pending_marks, markup_seg = _handle_markup(
            markup,
            segments,
            pending_breaks,
            pending_marks,
            extensions,
        )
        if markup_seg:
            segments.append(markup_seg)

        position = match.end()

    # Add remaining text after the last markup token
    if position < len(text):
        plain = _normalize_text(text[position:])
        if plain:
            seg = Segment(text=plain)
            _apply_pending(seg, pending_breaks, pending_marks)
            segments.append(seg)

    # If no segments created but we have text, create a plain segment
    if not segments and text.strip():
        seg = Segment(text=text.strip())
        _apply_pending(seg, pending_breaks, pending_marks)
        segments.append(seg)

    return segments
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def _handle_markup(
    markup: str,
    segments: list[Segment],
    pending_breaks: list[BreakAttrs],
    pending_marks: list[str],
    extensions: dict | None,
) -> tuple[list[BreakAttrs], list[str], Segment | None]:
    """Handle a single markup token and return any segment.

    Breaks/marks attach to the previous segment when one exists,
    otherwise they are queued for the next segment. Other markup
    produces a new Segment (returned as the third tuple element), which
    consumes any queued breaks/marks.
    """
    if markup.startswith("..."):
        # Break token: strip the "..." prefix and parse the modifier
        brk = _parse_break(markup[3:])
        if segments:
            segments[-1].breaks_after.append(brk)
        else:
            pending_breaks.append(brk)
        return pending_breaks, pending_marks, None

    if markup.startswith("@"):
        # Mark token (@voice is excluded by the combined regex)
        mark_name = markup[1:]
        if segments:
            segments[-1].marks_after.append(mark_name)
        else:
            pending_marks.append(mark_name)
        return pending_breaks, pending_marks, None

    seg = _segment_from_markup(markup, extensions)
    if seg:
        # The new segment absorbs queued breaks/marks; reset the queues
        _apply_pending(seg, pending_breaks, pending_marks)
        return [], [], seg

    return pending_breaks, pending_marks, None
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def _segment_from_markup(markup: str, extensions: dict | None) -> Segment | None:
    """Build a segment from emphasis, annotation, or prosody markup.

    Returns None when the token looks like emphasis but does not
    fully match its pattern.
    """
    if markup.startswith("**"):
        inner = STRONG_EMPHASIS_PATTERN.match(markup)
        if inner:
            return Segment(text=inner.group(1), emphasis="strong")
        return None

    if markup.startswith("*"):
        inner = MODERATE_EMPHASIS_PATTERN.match(markup)
        if inner:
            # True means moderate (the default emphasis level)
            return Segment(text=inner.group(1), emphasis=True)
        return None

    # Single underscore is reduced emphasis; double underscore is pitch
    # markup and falls through to the prosody shorthand parser.
    if markup.startswith("_") and not markup.startswith("__"):
        inner = REDUCED_EMPHASIS_PATTERN.match(markup)
        if inner:
            return Segment(text=inner.group(1), emphasis="reduced")
        return None

    if markup.startswith("["):
        return _parse_annotation(markup, extensions)

    return _parse_prosody_shorthand(markup)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def _apply_pending(
    seg: Segment,
    pending_breaks: list[BreakAttrs],
    pending_marks: list[str],
) -> None:
    """Attach queued breaks and marks to *seg* as leading attributes.

    Empty queues leave the segment untouched so its defaults survive.
    Copies are stored, so callers may keep mutating their lists.
    """
    if pending_breaks:
        seg.breaks_before = list(pending_breaks)
    if pending_marks:
        seg.marks_before = list(pending_marks)
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def _parse_heading(
    match: re.Match,
    heading_levels: dict,
) -> list[Segment]:
    """Parse heading into segments.

    ``match`` comes from HEADING_PATTERN: group(1) is the run of '#'
    characters (its length is the level), group(2) the heading text.
    Unknown levels fall back to a plain segment.
    """
    level = len(match.group(1))
    text = match.group(2).strip()

    if level not in heading_levels:
        return [Segment(text=text)]

    # Build segment with heading effects configured for this level
    seg = Segment(text=text)

    for effect_type, value in heading_levels[level]:
        if effect_type == "emphasis":
            seg.emphasis = value
        elif effect_type == "pause":
            seg.breaks_after.append(BreakAttrs(time=value))
        elif effect_type == "pause_before":
            seg.breaks_before.append(BreakAttrs(time=value))
        elif effect_type == "prosody" and isinstance(value, dict):
            seg.prosody = ProsodyAttrs(
                volume=value.get("volume"),
                rate=value.get("rate"),
                pitch=value.get("pitch"),
            )

    return [seg]
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def _parse_break(modifier: str) -> BreakAttrs:
    """Translate a break modifier into BreakAttrs.

    Letter markers (n/w/c/s/p) map to break strengths; a duration such
    as "500ms" or "2s" is kept verbatim; any other value is treated as
    a bare millisecond count.
    """
    if modifier in SSMD_BREAK_MARKER_TO_STRENGTH:
        return BreakAttrs(strength=SSMD_BREAK_MARKER_TO_STRENGTH[modifier])
    if modifier.endswith(("s", "ms")):
        return BreakAttrs(time=modifier)
    return BreakAttrs(time=f"{modifier}ms")
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def _parse_annotation(markup: str, extensions: dict | None = None) -> Segment | None:
    """Parse [text](annotation) markup.

    Annotation types are tried in order: audio, extension, voice,
    say-as, phoneme, substitution, emphasis, prosody, language, and
    finally comma-separated combined annotations.
    """
    match = ANNOTATION_PATTERN.match(markup)
    if not match:
        return None

    text = match.group(1)
    params = match.group(2).strip()

    seg = Segment(text=text)

    # Try to identify annotation type
    # Audio (URL or file extension)
    if _is_audio_annotation(params):
        seg.audio = _parse_audio_params(params)
        return seg

    # Extension: ext: name
    ext_match = re.match(r"^ext:\s*(\w+)$", params)
    if ext_match:
        seg.extension = ext_match.group(1)
        return seg

    # Voice: voice: name or voice: lang, gender: X
    if params.startswith("voice:"):
        seg.voice = _parse_voice_annotation(params[6:].strip())
        return seg

    # Say-as: as: type or say-as: type, with optional format/detail
    sayas_match = re.match(
        r"^(?:say-as|as):\s*(\w+)"
        r'(?:\s*,\s*format:\s*["\']?([^"\']+)["\']?)?'
        r"(?:\s*,\s*detail:\s*(\d+))?$",
        params,
    )
    if sayas_match:
        seg.say_as = SayAsAttrs(
            interpret_as=sayas_match.group(1),
            format=sayas_match.group(2),
            detail=sayas_match.group(3),
        )
        return seg

    # Phoneme: ph: or ipa: or sampa:
    # Stop at comma to allow combined annotations like "ph: value, alphabet: ipa"
    ph_match = re.match(r"^(ph|ipa|sampa):\s*([^,]+)", params)
    if ph_match:
        alphabet_type = ph_match.group(1)
        phonemes = ph_match.group(2).strip()

        # Map shorthand alphabet names
        if alphabet_type == "sampa":
            alphabet_type = "x-sampa"
        elif alphabet_type == "ph":
            # Default to ipa when using generic "ph:"
            alphabet_type = "ipa"

        # Check for explicit alphabet specification in remaining params
        remaining = params[ph_match.end() :].strip()
        if remaining.startswith(","):
            remaining = remaining[1:].strip()
            alph_match = re.match(r"^alphabet:\s*([^,]+)", remaining)
            if alph_match:
                specified_alphabet = alph_match.group(1).strip().lower()
                if specified_alphabet in ("ipa", "x-sampa", "sampa"):
                    # Normalize sampa to x-sampa
                    alphabet_type = (
                        "x-sampa"
                        if specified_alphabet == "sampa"
                        else specified_alphabet
                    )

        # Store phonemes as-is - conversion to IPA happens at SSML render time
        seg.phoneme = PhonemeAttrs(ph=phonemes, alphabet=alphabet_type)
        return seg

    # Substitution: sub: alias
    sub_match = re.match(r"^sub:\s*(.+)$", params)
    if sub_match:
        seg.substitution = sub_match.group(1).strip()
        return seg

    # Emphasis: emphasis: level
    emph_match = re.match(
        r"^emphasis:\s*(none|reduced|moderate|strong)$", params, re.IGNORECASE
    )
    if emph_match:
        level = emph_match.group(1).lower()
        # True encodes the default (moderate) level
        seg.emphasis = level if level != "moderate" else True
        return seg

    # Prosody: vrp:, v:, r:, p:, volume:, rate:, pitch:
    if _is_prosody_annotation(params):
        seg.prosody = _parse_prosody_annotation(params)
        return seg

    # Language code: en, en-US, fr-FR, etc. (optionally prefixed "lang:")
    lang_match = re.match(r"^(?:lang:\s*)?([a-z]{2}(?:-[A-Z]{2})?)$", params)
    if lang_match:
        seg.language = lang_match.group(1)
        return seg

    # Combined annotations (comma-separated)
    if "," in params:
        _parse_combined_annotations(seg, params, extensions)

    return seg
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _is_audio_annotation(params: str) -> bool:
|
|
677
|
+
"""Check if params represent an audio annotation."""
|
|
678
|
+
audio_extensions = (".mp3", ".ogg", ".wav", ".m4a", ".aac", ".flac")
|
|
679
|
+
first_part = params.split()[0] if params else ""
|
|
680
|
+
return first_part.startswith(("http://", "https://", "file://")) or any(
|
|
681
|
+
first_part.lower().endswith(ext) for ext in audio_extensions
|
|
682
|
+
)
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def _parse_audio_params(params: str) -> AudioAttrs:
    """Parse audio annotation parameters.

    The first whitespace-separated token is the source URL; the rest
    may contain ``clip:``, ``speed:``, ``repeat:`` and ``level:``
    options. Whatever is left after extracting options becomes the alt
    text.
    """
    parts = params.split()
    url = parts[0]

    audio = AudioAttrs(src=url)

    remaining = " ".join(parts[1:]) if len(parts) > 1 else ""

    # Parse clip: start-end (each bound is a number with an m/s unit)
    clip_match = re.search(
        r"clip:\s*(\d+(?:\.\d+)?[ms]+)-(\d+(?:\.\d+)?[ms]+)", remaining
    )
    if clip_match:
        audio.clip_begin = clip_match.group(1)
        audio.clip_end = clip_match.group(2)
        # Cut the consumed option out of the remaining string
        remaining = remaining[: clip_match.start()] + remaining[clip_match.end() :]

    # Parse speed: percent
    speed_match = re.search(r"speed:\s*(\d+(?:\.\d+)?%)", remaining)
    if speed_match:
        audio.speed = speed_match.group(1)
        remaining = remaining[: speed_match.start()] + remaining[speed_match.end() :]

    # Parse repeat: count
    repeat_match = re.search(r"repeat:\s*(\d+)", remaining)
    if repeat_match:
        audio.repeat_count = int(repeat_match.group(1))
        remaining = remaining[: repeat_match.start()] + remaining[repeat_match.end() :]

    # Parse level: dB
    level_match = re.search(r"level:\s*([+-]?\d+(?:\.\d+)?dB)", remaining)
    if level_match:
        audio.sound_level = level_match.group(1)
        remaining = remaining[: level_match.start()] + remaining[level_match.end() :]

    # Remaining text is alt text (collapse leftover separators first)
    remaining = re.sub(r"[,\s]+", " ", remaining).strip()
    if remaining:
        audio.alt_text = remaining

    return audio
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _parse_voice_annotation(params: str) -> VoiceAttrs:
    """Parse voice annotation parameters.

    Comma-separated form: first part is a name or language code,
    followed by optional ``gender:``/``variant:`` pairs. Without a
    comma the whole string is a single name or language code.
    """
    voice = VoiceAttrs()

    # Check for complex params (with gender/variant)
    if "," in params:
        parts = [p.strip() for p in params.split(",")]
        first = parts[0]

        # First part is name or language (xx or xx-XX means language)
        if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", first):
            voice.language = first
        else:
            voice.name = first

        # Parse remaining parts
        for part in parts[1:]:
            if part.startswith("gender:"):
                voice.gender = part[7:].strip().lower()  # type: ignore
            elif part.startswith("variant:"):
                voice.variant = int(part[8:].strip())
    else:
        # Simple name or language
        if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", params):
            voice.language = params
        else:
            voice.name = params

    return voice
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
def _is_prosody_annotation(params: str) -> bool:
|
|
761
|
+
"""Check if params represent a prosody annotation."""
|
|
762
|
+
return bool(re.match(r"^(?:vrp:|[vrp]:|volume:|rate:|pitch:)", params))
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
def _parse_prosody_annotation(params: str) -> ProsodyAttrs:
    """Parse prosody annotation parameters.

    Supports the compact ``vrp: NNN`` digit form (one digit each for
    volume, rate, pitch) and comma-separated ``key: value`` pairs where
    keys are v/r/p or their long names.
    """
    prosody = ProsodyAttrs()

    volume_map = PROSODY_VOLUME_MAP
    rate_map = PROSODY_RATE_MAP
    pitch_map = PROSODY_PITCH_MAP

    # VRP shorthand: vrp: 555 (each digit indexes into its map)
    vrp_match = re.match(r"^vrp:\s*(\d{1,3})$", params)
    if vrp_match:
        vrp = vrp_match.group(1)
        if len(vrp) >= 1:
            prosody.volume = volume_map.get(vrp[0])
        if len(vrp) >= 2:
            prosody.rate = rate_map.get(vrp[1])
        if len(vrp) >= 3:
            prosody.pitch = pitch_map.get(vrp[2])
        return prosody

    # Individual parameters
    for part in params.split(","):
        part = part.strip()
        if ":" not in part:
            continue

        key, value = part.split(":", 1)
        key = key.strip().lower()
        value = value.strip()

        # Normalize key names; relative (+/-) and unit-suffixed values
        # pass through verbatim, symbolic names go through the maps.
        if key in ("v", "volume"):
            if value.startswith(("+", "-")) or value.endswith(("dB", "%")):
                prosody.volume = value
            else:
                prosody.volume = volume_map.get(value, value)
        elif key in ("r", "rate"):
            if value.endswith("%"):
                prosody.rate = value
            else:
                prosody.rate = rate_map.get(value, value)
        elif key in ("p", "pitch"):
            if value.startswith(("+", "-")) or value.endswith("%"):
                prosody.pitch = value
            else:
                prosody.pitch = pitch_map.get(value, value)

    return prosody
|
|
813
|
+
|
|
814
|
+
|
|
815
|
+
def _parse_prosody_shorthand(markup: str) -> Segment | None:
    """Parse prosody shorthand markup like ++loud++ or <<slow<<.

    Also handles nested emphasis inside prosody, e.g., +**WARNING**+
    """
    # One table per prosody dimension, checked in this order:
    # volume, then rate, then pitch. Within each table the longer
    # markers come first so that e.g. "++" wins over "+".
    # Volume: ~~silent~~, --x-soft--, -soft-, +loud+, ++x-loud++
    # Rate:   <<x-slow<<, <slow<, >fast>, >>x-fast>>
    # Pitch:  __x-low__, ^high^, ^^x-high^^ (single _ is handled by emphasis)
    marker_tables: list[tuple[str, list[tuple[str, str]]]] = [
        (
            "volume",
            [
                ("++", "x-loud"),
                ("~~", "silent"),
                ("--", "x-soft"),
                ("+", "loud"),
                ("-", "soft"),
            ],
        ),
        (
            "rate",
            [
                ("<<", "x-slow"),
                (">>", "x-fast"),
                ("<", "slow"),
                (">", "fast"),
            ],
        ),
        (
            "pitch",
            [
                ("^^", "x-high"),
                ("__", "x-low"),
                ("^", "high"),
            ],
        ),
    ]

    for field, table in marker_tables:
        for marker, level in table:
            escaped = re.escape(marker)
            wrapped = re.match(rf"^{escaped}(.+?){escaped}$", markup)
            if not wrapped:
                continue

            inner = wrapped.group(1)
            prosody = ProsodyAttrs(**{field: level})

            # Nested emphasis inside the prosody wrapper, e.g. +**WARNING**+.
            nested = _check_inner_emphasis(inner)
            if nested:
                return Segment(text=nested[0], emphasis=nested[1], prosody=prosody)
            return Segment(text=inner, prosody=prosody)

    return None
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
def _check_inner_emphasis(text: str) -> tuple[str, str | bool] | None:
    """Check if text is wrapped in emphasis markers.

    Tries strong (``**text**``), moderate (``*text*``), and reduced
    (``_text_``) emphasis in that order.

    Returns (inner_text, emphasis_level) or None if no emphasis found.
    """
    checks: tuple[tuple[re.Pattern, str | bool], ...] = (
        (STRONG_EMPHASIS_PATTERN, "strong"),
        (MODERATE_EMPHASIS_PATTERN, True),
        (REDUCED_EMPHASIS_PATTERN, "reduced"),
    )
    for pattern, level in checks:
        wrapped = pattern.fullmatch(text)
        if wrapped:
            return (wrapped.group(1), level)
    return None
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def _parse_combined_annotations(
    seg: Segment,
    params: str,
    extensions: dict | None = None,
) -> None:
    """Parse combined comma-separated annotations, mutating ``seg`` in place.

    Recognizes bare language codes (``en``, ``en-US``) and prosody
    ``key: value`` pairs. Prosody fields already set on ``seg`` are
    never overwritten.

    NOTE(review): ``extensions`` is accepted but not used in this body —
    presumably kept for signature compatibility; confirm against callers.
    """
    # Split on commas while respecting quoted values.
    for raw_part in _smart_split(params, ","):
        token = raw_part.strip()
        if not token:
            continue

        # Bare language code such as "en" or "en-US"; first one wins.
        if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", token):
            if not seg.language:
                seg.language = token
            continue

        # Prosody key/value pair (short or long form).
        looks_like_prosody = re.match(r"^[vrp]:\s*", token) or re.match(
            r"^(volume|rate|pitch):", token
        )
        if looks_like_prosody:
            parsed = _parse_prosody_annotation(token)
            if seg.prosody:
                # Merge: only fill fields the segment does not already have.
                for field in ("volume", "rate", "pitch"):
                    incoming = getattr(parsed, field)
                    if incoming and not getattr(seg.prosody, field):
                        setattr(seg.prosody, field, incoming)
            else:
                seg.prosody = parsed
|
|
947
|
+
|
|
948
|
+
|
|
949
|
+
def _smart_split(s: str, delimiter: str) -> list[str]:
|
|
950
|
+
"""Split string by delimiter, respecting quoted strings."""
|
|
951
|
+
parts = []
|
|
952
|
+
current = ""
|
|
953
|
+
in_quotes = False
|
|
954
|
+
quote_char = None
|
|
955
|
+
|
|
956
|
+
for char in s:
|
|
957
|
+
if char in ('"', "'") and not in_quotes:
|
|
958
|
+
in_quotes = True
|
|
959
|
+
quote_char = char
|
|
960
|
+
current += char
|
|
961
|
+
elif char == quote_char and in_quotes:
|
|
962
|
+
in_quotes = False
|
|
963
|
+
quote_char = None
|
|
964
|
+
current += char
|
|
965
|
+
elif char == delimiter and not in_quotes:
|
|
966
|
+
parts.append(current)
|
|
967
|
+
current = ""
|
|
968
|
+
else:
|
|
969
|
+
current += char
|
|
970
|
+
|
|
971
|
+
if current:
|
|
972
|
+
parts.append(current)
|
|
973
|
+
|
|
974
|
+
return parts
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
# ═══════════════════════════════════════════════════════════════════════════════
# BACKWARD COMPATIBILITY
# ═══════════════════════════════════════════════════════════════════════════════

# Re-export old names for compatibility: earlier releases exposed the segment
# and sentence types under an "SSMD" prefix, so keep aliases for old imports.
SSMDSegment = Segment
SSMDSentence = Sentence
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
def parse_sentences(
    ssmd_text: str,
    *,
    capabilities: "TTSCapabilities | str | None" = None,
    include_default_voice: bool = True,
    sentence_detection: bool = True,
    language: str = "en",
    model_size: str | None = None,
    spacy_model: str | None = None,
    use_spacy: bool | None = None,
    heading_levels: dict | None = None,
    extensions: dict | None = None,
) -> list[Sentence]:
    """Parse SSMD text into sentences (backward compatible API).

    This is an alias for parse_ssmd() with the old parameter names.

    Args:
        ssmd_text: SSMD formatted text to parse
        capabilities: TTS capabilities or preset name
        include_default_voice: If False, exclude sentences without voice context
        sentence_detection: Enable/disable sentence splitting
        language: Language code for sentence detection
        model_size: Size of spacy model (sm/md/lg)
        spacy_model: Full spacy model name (deprecated, use model_size)
        use_spacy: Force use of spacy for sentence detection
        heading_levels: Custom heading configurations
        extensions: Custom extension handlers

    Returns:
        List of Sentence objects
    """
    # Legacy callers may pass a full spacy model name (e.g. "en_core_web_sm");
    # derive the size suffix from it when model_size is not supplied.
    if model_size:
        effective_size = model_size
    elif spacy_model:
        effective_size = spacy_model.split("_")[-1]
    else:
        effective_size = None

    parsed = parse_ssmd(
        ssmd_text,
        capabilities=capabilities,
        sentence_detection=sentence_detection,
        language=language,
        model_size=effective_size,
        use_spacy=use_spacy,
        heading_levels=heading_levels,
        extensions=extensions,
    )

    if include_default_voice:
        return parsed
    # Filter out sentences that carry no explicit voice context.
    return [sentence for sentence in parsed if sentence.voice is not None]
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
def parse_segments(
    ssmd_text: str,
    *,
    capabilities: "TTSCapabilities | str | None" = None,
    voice_context: VoiceAttrs | None = None,
) -> list[Segment]:
    """Parse SSMD text into segments (backward compatible API).

    Resolves ``capabilities`` (preset name or object) and delegates to the
    internal ``_parse_segments`` helper.

    NOTE(review): ``voice_context`` is accepted but never forwarded to
    ``_parse_segments`` — confirm whether the helper should receive it or
    whether the parameter is retained purely for signature compatibility.
    """
    caps = _resolve_capabilities(capabilities)
    return _parse_segments(ssmd_text, capabilities=caps)
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
def parse_voice_blocks(ssmd_text: str) -> list[tuple[VoiceAttrs | None, str]]:
    """Parse SSMD text into voice blocks (backward compatible API).

    Thin wrapper that delegates to the internal ``_split_voice_blocks``
    helper and returns its (voice, text) pairs unchanged.
    """
    blocks = _split_voice_blocks(ssmd_text)
    return blocks
|