ssmd 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ssmd/parser.py ADDED
@@ -0,0 +1,1049 @@
1
+ """SSMD parser - Parse SSMD text into structured Sentence/Segment objects.
2
+
3
+ This module provides functions to parse SSMD markdown into structured data
4
+ that can be used for TTS processing or conversion to SSML.
5
+ """
6
+
7
+ import re
8
+ from typing import TYPE_CHECKING
9
+
10
+ from ssmd.segment import Segment
11
+ from ssmd.sentence import Sentence
12
+ from ssmd.ssml_conversions import (
13
+ PROSODY_PITCH_MAP,
14
+ PROSODY_RATE_MAP,
15
+ PROSODY_VOLUME_MAP,
16
+ SSMD_BREAK_MARKER_TO_STRENGTH,
17
+ )
18
+ from ssmd.types import (
19
+ DEFAULT_HEADING_LEVELS,
20
+ AudioAttrs,
21
+ BreakAttrs,
22
+ PhonemeAttrs,
23
+ ProsodyAttrs,
24
+ SayAsAttrs,
25
+ VoiceAttrs,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from ssmd.capabilities import TTSCapabilities
30
+
31
+
32
# ═══════════════════════════════════════════════════════════════════════════════
# REGEX PATTERNS
# ═══════════════════════════════════════════════════════════════════════════════

# Voice directive: "@voice: name" or "@voice(name)".
# The single captured group holds the voice name (or language code) plus any
# comma-separated "gender:", "variant:" or "language:" key/value parameters.
VOICE_DIRECTIVE_PATTERN = re.compile(
    r"^@voice(?::\s*|\()"
    r"([a-zA-Z0-9_-]+(?:\s*,\s*(?:gender|variant|language):\s*[a-zA-Z0-9_-]+)*)"
    r"\)?\s*$",
    re.MULTILINE,
)

# Emphasis patterns: **strong**, *moderate*, _reduced_
STRONG_EMPHASIS_PATTERN = re.compile(r"\*\*([^\*]+)\*\*")
MODERATE_EMPHASIS_PATTERN = re.compile(r"\*([^\*]+)\*")
# Single underscores only: the lookarounds reject "__", which is pitch markup.
REDUCED_EMPHASIS_PATTERN = re.compile(r"(?<!_)_(?!_)([^_]+?)(?<!_)_(?!_)")

# Annotation pattern: [text](annotation)
ANNOTATION_PATTERN = re.compile(r"\[([^\]]*)\]\(([^\)]+)\)")

# Break pattern: ...500ms, ...2s, ...n, ...w, ...c, ...s, ...p
BREAK_PATTERN = re.compile(r"\.\.\.(\d+(?:s|ms)|[nwcsp])")

# Mark pattern: @name (but not the @voice directive)
MARK_PATTERN = re.compile(r"@(?!voice[:(])(\w+)")

# Heading pattern: "# Title" through "###### Title" (1-6 hashes)
HEADING_PATTERN = re.compile(r"^\s*(#{1,6})\s*(.+)$", re.MULTILINE)

# Prosody shorthand patterns (applied after XML escaping, but we handle raw here).
# Each pattern requires the same marker on both sides (backreference \1) and
# rejects markers glued to alphanumerics via the outer lookarounds.
PROSODY_VOLUME_PATTERN = re.compile(
    r"(?<![a-zA-Z0-9])"
    r"(~~|--|\+\+|-(?!-)|(?<!\+)\+|~)"  # Volume markers
    r"([^~\-+<>_^]+?)"
    r"\1"
    r"(?![a-zA-Z0-9])"
)

PROSODY_RATE_PATTERN = re.compile(
    r"(?<![a-zA-Z0-9])"
    r"(<<|<(?!<)|(?<!>)>|>>)"  # Rate markers
    r"([^<>]+?)"
    r"\1"
    r"(?![a-zA-Z0-9])"
)

PROSODY_PITCH_PATTERN = re.compile(
    r"(?<![a-zA-Z0-9_])"
    r"(__|\^\^|(?<!_)_(?!_)|(?<!\^)\^(?!\^))"  # Pitch markers
    r"([^_^]+?)"
    r"\1"
    r"(?![a-zA-Z0-9_])"
)

# Paragraph break: two or more consecutive newlines
PARAGRAPH_PATTERN = re.compile(r"\n\n+")

# Space before punctuation (normalized away by _normalize_text)
SPACE_BEFORE_PUNCT = re.compile(r"\s+([.!?,:;])")
92
+
93
+
94
+ # ═══════════════════════════════════════════════════════════════════════════════
95
+ # MAIN PARSING FUNCTIONS
96
+ # ═══════════════════════════════════════════════════════════════════════════════
97
+
98
+
99
def _normalize_text(text: str) -> str:
    """Collapse whitespace and drop any space that precedes punctuation.

    Returns the cleaned text, stripped of leading/trailing whitespace.
    """
    without_gaps = SPACE_BEFORE_PUNCT.sub(r"\1", text)
    collapsed = re.sub(r"\s+", " ", without_gaps)
    return collapsed.strip()
108
+
109
+
110
def parse_ssmd(
    text: str,
    *,
    capabilities: "TTSCapabilities | str | None" = None,
    heading_levels: dict | None = None,
    extensions: dict | None = None,
    sentence_detection: bool = True,
    language: str = "en",
    use_spacy: bool | None = None,
    model_size: str | None = None,
) -> list[Sentence]:
    """Parse SSMD text into a list of Sentences.

    This is the main parsing entry point. It handles voice directives
    (``@voice: name``), paragraph/sentence splitting, and all inline SSMD
    markup (emphasis, annotations, breaks, marks, prosody shorthand).

    Args:
        text: SSMD markdown text.
        capabilities: TTS capabilities object or preset name (optional).
        heading_levels: Custom heading configurations.
        extensions: Custom extension handlers.
        sentence_detection: If True, split paragraphs into sentences.
        language: Default language for sentence detection.
        use_spacy: If True, use spaCy for sentence detection.
        model_size: spaCy model size ("sm", "md", "lg").

    Returns:
        List of Sentence objects.
    """
    if not text or not text.strip():
        return []

    caps = _resolve_capabilities(capabilities)
    result: list[Sentence] = []

    for voice, block_text in _split_voice_blocks(text):
        # Paragraphs are separated by blank lines within a voice block.
        paragraphs = PARAGRAPH_PATTERN.split(block_text)
        last_para_idx = len(paragraphs) - 1

        for para_idx, raw_para in enumerate(paragraphs):
            para = raw_para.strip()
            if not para:
                continue

            if sentence_detection:
                pieces = _split_sentences(
                    para,
                    language=language,
                    use_spacy=use_spacy,
                    model_size=model_size,
                )
            else:
                pieces = [para]

            last_piece_idx = len(pieces) - 1
            for piece_idx, piece in enumerate(pieces):
                piece = piece.strip()
                if not piece:
                    continue

                segments = _parse_segments(
                    piece,
                    capabilities=caps,
                    heading_levels=heading_levels,
                    extensions=extensions,
                )
                if not segments:
                    continue

                # The last sentence of a non-final paragraph carries the
                # paragraph-end flag.
                result.append(
                    Sentence(
                        segments=segments,
                        voice=voice,
                        is_paragraph_end=(
                            piece_idx == last_piece_idx
                            and para_idx != last_para_idx
                        ),
                    )
                )

    return result
198
+
199
+
200
def _resolve_capabilities(
    capabilities: "TTSCapabilities | str | None",
) -> "TTSCapabilities | None":
    """Turn a preset name into a TTSCapabilities object; pass objects through."""
    if capabilities is None:
        return None
    if not isinstance(capabilities, str):
        return capabilities
    # Imported lazily to avoid a hard dependency when no preset is used.
    from ssmd.capabilities import get_preset

    return get_preset(capabilities)
211
+
212
+
213
def _split_voice_blocks(text: str) -> list[tuple[VoiceAttrs | None, str]]:
    """Split SSMD text into blocks at ``@voice`` directive lines.

    Args:
        text: SSMD text.

    Returns:
        List of (voice, text) tuples; ``voice`` is None before the first
        directive.
    """
    blocks: list[tuple[VoiceAttrs | None, str]] = []
    active_voice: VoiceAttrs | None = None
    buffered: list[str] = []

    def flush() -> None:
        # Emit the buffered lines as one block (skipping all-whitespace runs).
        if buffered:
            chunk = "\n".join(buffered)
            if chunk.strip():
                blocks.append((active_voice, chunk))
            buffered.clear()

    for line in text.split("\n"):
        directive = VOICE_DIRECTIVE_PATTERN.match(line)
        if directive is None:
            buffered.append(line)
            continue
        # A directive closes the current block and opens a new voice context.
        flush()
        active_voice = _parse_voice_params(directive.group(1))

    flush()

    # No directives at all: return the whole text with no voice attached.
    if not blocks and text.strip():
        blocks.append((None, text.strip()))

    return blocks
256
+
257
+
258
def _parse_voice_params(params: str) -> VoiceAttrs:
    """Parse the parameter string of a ``@voice`` directive into VoiceAttrs.

    The first comma-separated value is either a voice name or a language
    code ("xx" or "xx-YY"); the remainder may carry ``gender:``,
    ``variant:`` and ``language:`` key/value pairs.
    """
    voice = VoiceAttrs()

    # Which optional key/value parameters appear anywhere in the string.
    has_gender = "gender:" in params
    has_variant = "variant:" in params
    has_language = "language:" in params

    # Extract voice name or language code (first value before any comma)
    voice_match = re.match(r"([a-zA-Z0-9_-]+)", params)
    if voice_match:
        value = voice_match.group(1)
        # Decide whether the first value is a name or a language code.
        # If explicit language: is provided, or gender/variant present
        # with language-like
        # first value, or looks like language code, treat first value as language
        if (has_gender or has_variant) and not has_language:
            # Pattern like "@voice: fr-FR, gender: female" - first value is language
            if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", value):
                voice.language = value
            else:
                voice.name = value
        elif has_language:
            # Explicit language: provided, so first value is the name
            voice.name = value
        elif re.match(r"^[a-z]{2}(-[A-Z]{2})?$", value):
            # Looks like a language code
            voice.language = value
        else:
            # Just a name
            voice.name = value

    # Parse explicit language: parameter (overrides any language inferred above)
    lang_match = re.search(r"language:\s*([a-zA-Z0-9_-]+)", params, re.IGNORECASE)
    if lang_match:
        voice.language = lang_match.group(1)

    # Parse gender (the regex restricts it to male/female/neutral)
    gender_match = re.search(r"gender:\s*(male|female|neutral)", params, re.IGNORECASE)
    if gender_match:
        voice.gender = gender_match.group(1).lower()  # type: ignore

    # Parse variant as an integer
    variant_match = re.search(r"variant:\s*(\d+)", params)
    if variant_match:
        voice.variant = int(variant_match.group(1))

    return voice
305
+
306
+
307
def _split_sentences(
    text: str,
    language: str = "en",
    use_spacy: bool | None = None,
    model_size: str | None = None,
) -> list[str]:
    """Split text into sentences using phrasplit.

    Falls back to a simple regex-based splitter when phrasplit is not
    installed.

    Args:
        text: Text to split.
        language: Language code ("en" or "en-US" style); only the primary
            subtag is used to pick the spaCy model name.
        use_spacy: Passed through to phrasplit.
        model_size: spaCy model size suffix; defaults to "sm".
    """
    try:
        from phrasplit import split_text

        # Build the spaCy model name, e.g. "en_core_web_sm".
        size = model_size or "sm"
        lang_code = language.split("-")[0] if "-" in language else language

        # spaCy ships "core_web" models for these languages and
        # "core_news" models for the rest.
        web_langs = {
            "en",
            "zh",
        }
        if lang_code in web_langs:
            model = f"{lang_code}_core_web_{size}"
        else:
            model = f"{lang_code}_core_news_{size}"

        segments = split_text(
            text,
            mode="sentence",
            language_model=model,
            apply_corrections=True,
            split_on_colon=True,
            use_spacy=use_spacy,
        )

        # Group segments by sentence: concatenate consecutive segments that
        # share the same sentence id.
        # NOTE(review): assumes phrasplit segments expose ``.text`` and
        # ``.sentence`` attributes — confirm against phrasplit's API.
        sentences = []
        current = ""
        last_sent_id = None

        for seg in segments:
            if last_sent_id is not None and seg.sentence != last_sent_id:
                if current.strip():
                    sentences.append(current)
                current = ""
            current += seg.text
            last_sent_id = seg.sentence

        if current.strip():
            sentences.append(current)

        return sentences if sentences else [text]

    except ImportError:
        # Fallback: simple sentence splitting
        return _simple_sentence_split(text)
361
+
362
+
363
+ def _simple_sentence_split(text: str) -> list[str]:
364
+ """Simple regex-based sentence splitting."""
365
+ # Split on sentence-ending punctuation followed by space or newline
366
+ parts = re.split(r"(?<=[.!?])\s+", text)
367
+ return [p.strip() for p in parts if p.strip()]
368
+
369
+
370
def _parse_segments(  # noqa: C901
    text: str,
    capabilities: "TTSCapabilities | None" = None,
    heading_levels: dict | None = None,
    extensions: dict | None = None,
) -> list[Segment]:
    """Parse text into segments with SSMD features.

    Scans the text once with a combined regex; plain text between markup
    matches becomes plain segments, and each markup token is dispatched to
    ``_handle_markup``.

    NOTE(review): ``capabilities`` is accepted but never read in this
    function — confirm whether capability filtering happens elsewhere.
    """
    # Check for heading (headings short-circuit the whole sentence)
    heading_match = HEADING_PATTERN.match(text)
    if heading_match:
        return _parse_heading(heading_match, heading_levels or DEFAULT_HEADING_LEVELS)

    segments: list[Segment] = []
    position = 0  # index of the first character not yet consumed

    # Build combined pattern for all markup
    # Order matters: longer patterns first
    combined = re.compile(
        r"("
        r"\*\*[^\*]+\*\*"  # **strong**
        r"|\*[^\*]+\*"  # *moderate*
        r"|(?<![_a-zA-Z0-9])_(?!_)[^_]+?(?<!_)_(?![_a-zA-Z0-9])"  # _reduced_
        r"|\[[^\]]*\]\([^\)]+\)"  # [text](annotation)
        r"|\.\.\.(?:\d+(?:s|ms)|[nwcsp])"  # breaks
        r"|@(?!voice[:(])\w+"  # marks
        r"|~~[^~]+~~"  # ~silent~
        r"|--[^-]+--"  # --x-soft--
        r"|\+\+[^+]+\+\+"  # ++x-loud++
        r"|(?<![a-zA-Z0-9+])\+[^+]+\+(?![a-zA-Z0-9+])"  # +loud+
        r"|(?<![a-zA-Z0-9-])-[^-]+-(?![a-zA-Z0-9-])"  # -soft-
        r"|<<[^<>]+<<"  # <<x-slow<<
        r"|(?<![<a-zA-Z0-9])<[^<>]+<(?![<a-zA-Z0-9])"  # <slow<
        r"|>>[^<>]+>>"  # >>x-fast>>
        r"|(?<![>a-zA-Z0-9])>[^<>]+>(?![>a-zA-Z0-9])"  # >fast>
        r"|__[^_]+__"  # __x-low__
        r"|\^\^[^^]+\^\^"  # ^^x-high^^
        r"|(?<![a-zA-Z0-9^])\^[^^]+\^(?![a-zA-Z0-9^])"  # ^high^
        r")"
    )

    # Breaks/marks seen before any segment exists are queued here and
    # attached to the next segment created.
    pending_breaks: list[BreakAttrs] = []
    pending_marks: list[str] = []

    for match in combined.finditer(text):
        # Emit the plain text that precedes this markup token.
        if match.start() > position:
            plain = _normalize_text(text[position : match.start()])
            if plain:
                seg = Segment(text=plain)
                if pending_breaks:
                    seg.breaks_before = pending_breaks
                    pending_breaks = []
                if pending_marks:
                    seg.marks_before = pending_marks
                    pending_marks = []
                segments.append(seg)

        markup = match.group(0)
        pending_breaks, pending_marks, markup_seg = _handle_markup(
            markup,
            segments,
            pending_breaks,
            pending_marks,
            extensions,
        )
        if markup_seg:
            segments.append(markup_seg)

        position = match.end()

    # Add remaining text after the last markup token
    if position < len(text):
        plain = _normalize_text(text[position:])
        if plain:
            seg = Segment(text=plain)
            _apply_pending(seg, pending_breaks, pending_marks)
            segments.append(seg)

    # If no segments created but we have text, create a plain segment
    if not segments and text.strip():
        seg = Segment(text=text.strip())
        _apply_pending(seg, pending_breaks, pending_marks)
        segments.append(seg)

    return segments
454
+
455
+
456
def _handle_markup(
    markup: str,
    segments: list[Segment],
    pending_breaks: list[BreakAttrs],
    pending_marks: list[str],
    extensions: dict | None,
) -> tuple[list[BreakAttrs], list[str], Segment | None]:
    """Route one markup token: attach a break/mark, or build a new segment.

    Returns the (possibly cleared) pending break/mark lists and the segment
    produced by this token, if any.
    """
    if markup.startswith("..."):
        parsed = _parse_break(markup[3:])
        # A break binds to the previous segment when one exists; otherwise it
        # is queued until the next segment is created.
        target = segments[-1].breaks_after if segments else pending_breaks
        target.append(parsed)
        return pending_breaks, pending_marks, None

    if markup.startswith("@"):
        name = markup[1:]
        # Same attach-or-queue rule as breaks.
        target = segments[-1].marks_after if segments else pending_marks
        target.append(name)
        return pending_breaks, pending_marks, None

    built = _segment_from_markup(markup, extensions)
    if built is None:
        return pending_breaks, pending_marks, None

    # The new segment consumes everything that was queued.
    _apply_pending(built, pending_breaks, pending_marks)
    return [], [], built
486
+
487
+
488
def _segment_from_markup(markup: str, extensions: dict | None) -> Segment | None:
    """Build a segment from emphasis, annotation, or prosody markup."""
    if markup.startswith("**"):
        m = STRONG_EMPHASIS_PATTERN.match(markup)
        return Segment(text=m.group(1), emphasis="strong") if m else None

    if markup.startswith("*"):
        m = MODERATE_EMPHASIS_PATTERN.match(markup)
        return Segment(text=m.group(1), emphasis=True) if m else None

    # Single underscore is reduced emphasis; double underscore is pitch
    # shorthand and falls through to the prosody parser below.
    if markup.startswith("_") and not markup.startswith("__"):
        m = REDUCED_EMPHASIS_PATTERN.match(markup)
        return Segment(text=m.group(1), emphasis="reduced") if m else None

    if markup.startswith("["):
        return _parse_annotation(markup, extensions)

    return _parse_prosody_shorthand(markup)
512
+
513
+
514
def _apply_pending(
    seg: Segment,
    pending_breaks: list[BreakAttrs],
    pending_marks: list[str],
) -> None:
    """Copy queued breaks and marks onto *seg* as leading attachments."""
    if pending_breaks:
        seg.breaks_before = list(pending_breaks)
    if pending_marks:
        seg.marks_before = list(pending_marks)
524
+
525
+
526
def _parse_heading(
    match: re.Match,
    heading_levels: dict,
) -> list[Segment]:
    """Convert a matched heading into one segment with the configured effects."""
    level = len(match.group(1))
    heading_text = match.group(2).strip()

    # Unconfigured levels are spoken as plain text.
    if level not in heading_levels:
        return [Segment(text=heading_text)]

    seg = Segment(text=heading_text)
    for effect_type, value in heading_levels[level]:
        if effect_type == "emphasis":
            seg.emphasis = value
        elif effect_type == "pause":
            seg.breaks_after.append(BreakAttrs(time=value))
        elif effect_type == "pause_before":
            seg.breaks_before.append(BreakAttrs(time=value))
        elif effect_type == "prosody" and isinstance(value, dict):
            seg.prosody = ProsodyAttrs(
                volume=value.get("volume"),
                rate=value.get("rate"),
                pitch=value.get("pitch"),
            )

    return [seg]
555
+
556
+
557
def _parse_break(modifier: str) -> BreakAttrs:
    """Map a break modifier (strength letter or duration) to BreakAttrs."""
    if modifier in SSMD_BREAK_MARKER_TO_STRENGTH:
        return BreakAttrs(strength=SSMD_BREAK_MARKER_TO_STRENGTH[modifier])
    if modifier.endswith(("s", "ms")):
        return BreakAttrs(time=modifier)
    # Bare number: interpret as milliseconds.
    return BreakAttrs(time=f"{modifier}ms")
565
+
566
+
567
def _parse_annotation(markup: str, extensions: dict | None = None) -> Segment | None:
    """Parse ``[text](annotation)`` markup into a Segment.

    The annotation type is detected by probing the parameter string in a
    fixed order: audio, extension, voice, say-as, phoneme, substitution,
    emphasis, prosody, language, then combined comma-separated annotations.
    Returns None when the markup does not match ANNOTATION_PATTERN.
    """
    match = ANNOTATION_PATTERN.match(markup)
    if not match:
        return None

    text = match.group(1)
    params = match.group(2).strip()

    seg = Segment(text=text)

    # Try to identify annotation type
    # Audio (URL or file extension)
    if _is_audio_annotation(params):
        seg.audio = _parse_audio_params(params)
        return seg

    # Extension: ext: name
    ext_match = re.match(r"^ext:\s*(\w+)$", params)
    if ext_match:
        seg.extension = ext_match.group(1)
        return seg

    # Voice: voice: name or voice: lang, gender: X
    if params.startswith("voice:"):
        seg.voice = _parse_voice_annotation(params[6:].strip())
        return seg

    # Say-as: as: type or say-as: type, with optional format: and detail:
    sayas_match = re.match(
        r"^(?:say-as|as):\s*(\w+)"
        r'(?:\s*,\s*format:\s*["\']?([^"\']+)["\']?)?'
        r"(?:\s*,\s*detail:\s*(\d+))?$",
        params,
    )
    if sayas_match:
        seg.say_as = SayAsAttrs(
            interpret_as=sayas_match.group(1),
            format=sayas_match.group(2),
            detail=sayas_match.group(3),
        )
        return seg

    # Phoneme: ph: or ipa: or sampa:
    # Stop at comma to allow combined annotations like "ph: value, alphabet: ipa"
    ph_match = re.match(r"^(ph|ipa|sampa):\s*([^,]+)", params)
    if ph_match:
        alphabet_type = ph_match.group(1)
        phonemes = ph_match.group(2).strip()

        # Map shorthand alphabet names
        if alphabet_type == "sampa":
            alphabet_type = "x-sampa"
        elif alphabet_type == "ph":
            # Default to ipa when using generic "ph:"
            alphabet_type = "ipa"

        # Check for explicit alphabet specification in remaining params
        remaining = params[ph_match.end() :].strip()
        if remaining.startswith(","):
            remaining = remaining[1:].strip()
            alph_match = re.match(r"^alphabet:\s*([^,]+)", remaining)
            if alph_match:
                specified_alphabet = alph_match.group(1).strip().lower()
                if specified_alphabet in ("ipa", "x-sampa", "sampa"):
                    # Normalize sampa to x-sampa
                    alphabet_type = (
                        "x-sampa"
                        if specified_alphabet == "sampa"
                        else specified_alphabet
                    )

        # Store phonemes as-is - conversion to IPA happens at SSML render time
        seg.phoneme = PhonemeAttrs(ph=phonemes, alphabet=alphabet_type)
        return seg

    # Substitution: sub: alias
    sub_match = re.match(r"^sub:\s*(.+)$", params)
    if sub_match:
        seg.substitution = sub_match.group(1).strip()
        return seg

    # Emphasis: emphasis: level
    emph_match = re.match(
        r"^emphasis:\s*(none|reduced|moderate|strong)$", params, re.IGNORECASE
    )
    if emph_match:
        level = emph_match.group(1).lower()
        # "moderate" is represented as True on Segment
        seg.emphasis = level if level != "moderate" else True
        return seg

    # Prosody: vrp:, v:, r:, p:, volume:, rate:, pitch:
    if _is_prosody_annotation(params):
        seg.prosody = _parse_prosody_annotation(params)
        return seg

    # Language code: en, en-US, fr-FR, etc., optionally prefixed with "lang:"
    lang_match = re.match(r"^(?:lang:\s*)?([a-z]{2}(?:-[A-Z]{2})?)$", params)
    if lang_match:
        seg.language = lang_match.group(1)
        return seg

    # Combined annotations (comma-separated)
    if "," in params:
        _parse_combined_annotations(seg, params, extensions)

    return seg
674
+
675
+
676
+ def _is_audio_annotation(params: str) -> bool:
677
+ """Check if params represent an audio annotation."""
678
+ audio_extensions = (".mp3", ".ogg", ".wav", ".m4a", ".aac", ".flac")
679
+ first_part = params.split()[0] if params else ""
680
+ return first_part.startswith(("http://", "https://", "file://")) or any(
681
+ first_part.lower().endswith(ext) for ext in audio_extensions
682
+ )
683
+
684
+
685
def _parse_audio_params(params: str) -> AudioAttrs:
    """Parse audio annotation parameters into AudioAttrs.

    The first whitespace-separated token is the source URL; the rest may
    carry ``clip:``, ``speed:``, ``repeat:`` and ``level:`` options.  Any
    text left over after removing those options becomes the alt text.
    """
    parts = params.split()
    url = parts[0]

    audio = AudioAttrs(src=url)

    remaining = " ".join(parts[1:]) if len(parts) > 1 else ""

    # Parse clip: start-end
    # NOTE(review): the "[ms]+" unit class also accepts "m" or "sm";
    # confirm the intended unit set is just "s" and "ms".
    clip_match = re.search(
        r"clip:\s*(\d+(?:\.\d+)?[ms]+)-(\d+(?:\.\d+)?[ms]+)", remaining
    )
    if clip_match:
        audio.clip_begin = clip_match.group(1)
        audio.clip_end = clip_match.group(2)
        remaining = remaining[: clip_match.start()] + remaining[clip_match.end() :]

    # Parse speed: percent
    speed_match = re.search(r"speed:\s*(\d+(?:\.\d+)?%)", remaining)
    if speed_match:
        audio.speed = speed_match.group(1)
        remaining = remaining[: speed_match.start()] + remaining[speed_match.end() :]

    # Parse repeat: count
    repeat_match = re.search(r"repeat:\s*(\d+)", remaining)
    if repeat_match:
        audio.repeat_count = int(repeat_match.group(1))
        remaining = remaining[: repeat_match.start()] + remaining[repeat_match.end() :]

    # Parse level: dB (optionally signed)
    level_match = re.search(r"level:\s*([+-]?\d+(?:\.\d+)?dB)", remaining)
    if level_match:
        audio.sound_level = level_match.group(1)
        remaining = remaining[: level_match.start()] + remaining[level_match.end() :]

    # Remaining text is alt text (commas and whitespace runs collapsed)
    remaining = re.sub(r"[,\s]+", " ", remaining).strip()
    if remaining:
        audio.alt_text = remaining

    return audio
727
+
728
+
729
def _parse_voice_annotation(params: str) -> VoiceAttrs:
    """Parse voice annotation parameters into VoiceAttrs.

    A bare value is a voice name or language code ("xx" / "xx-YY"); a
    comma-separated list additionally accepts ``gender:`` and ``variant:``
    pairs.

    NOTE(review): unlike the ``@voice`` directive parser, an explicit
    ``language:`` pair here is silently ignored — confirm whether that is
    intentional.
    """
    voice = VoiceAttrs()

    # Check for complex params (with gender/variant)
    if "," in params:
        parts = [p.strip() for p in params.split(",")]
        first = parts[0]

        # First part is name or language
        if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", first):
            voice.language = first
        else:
            voice.name = first

        # Parse remaining parts (unrecognized keys are skipped)
        for part in parts[1:]:
            if part.startswith("gender:"):
                voice.gender = part[7:].strip().lower()  # type: ignore
            elif part.startswith("variant:"):
                voice.variant = int(part[8:].strip())
    else:
        # Simple name or language
        if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", params):
            voice.language = params
        else:
            voice.name = params

    return voice
758
+
759
+
760
+ def _is_prosody_annotation(params: str) -> bool:
761
+ """Check if params represent a prosody annotation."""
762
+ return bool(re.match(r"^(?:vrp:|[vrp]:|volume:|rate:|pitch:)", params))
763
+
764
+
765
def _parse_prosody_annotation(params: str) -> ProsodyAttrs:
    """Parse prosody annotation parameters into ProsodyAttrs.

    Supports the ``vrp: NNN`` digit shorthand (one digit each for volume,
    rate, pitch) and comma-separated key:value pairs with v/volume,
    r/rate, p/pitch keys.
    """
    prosody = ProsodyAttrs()

    volume_map = PROSODY_VOLUME_MAP
    rate_map = PROSODY_RATE_MAP
    pitch_map = PROSODY_PITCH_MAP

    # VRP shorthand: vrp: 555 (each digit indexes into the respective map)
    vrp_match = re.match(r"^vrp:\s*(\d{1,3})$", params)
    if vrp_match:
        vrp = vrp_match.group(1)
        if len(vrp) >= 1:
            prosody.volume = volume_map.get(vrp[0])
        if len(vrp) >= 2:
            prosody.rate = rate_map.get(vrp[1])
        if len(vrp) >= 3:
            prosody.pitch = pitch_map.get(vrp[2])
        return prosody

    # Individual parameters
    for part in params.split(","):
        part = part.strip()
        if ":" not in part:
            continue

        key, value = part.split(":", 1)
        key = key.strip().lower()
        value = value.strip()

        # Normalize key names.  Explicit signed / dB / % values pass through
        # untouched; named levels are looked up in the conversion maps (and
        # fall back to the raw value when unmapped).
        if key in ("v", "volume"):
            if value.startswith(("+", "-")) or value.endswith(("dB", "%")):
                prosody.volume = value
            else:
                prosody.volume = volume_map.get(value, value)
        elif key in ("r", "rate"):
            if value.endswith("%"):
                prosody.rate = value
            else:
                prosody.rate = rate_map.get(value, value)
        elif key in ("p", "pitch"):
            if value.startswith(("+", "-")) or value.endswith("%"):
                prosody.pitch = value
            else:
                prosody.pitch = pitch_map.get(value, value)

    return prosody
813
+
814
+
815
def _parse_prosody_shorthand(markup: str) -> Segment | None:
    """Parse prosody shorthand markup like ``++loud++`` or ``<<slow<<``.

    Also handles nested emphasis inside prosody, e.g. ``+**WARNING**+``.

    Returns None when the markup matches no shorthand marker.
    """
    # (marker, ProsodyAttrs field, value) triples.  The original code had
    # three near-identical loops (volume, rate, pitch); a single ordered
    # table removes the duplication.  Order matters twice over:
    #   * volume markers are tried before rate before pitch (same as the
    #     original loop order), and
    #   * two-character markers precede their one-character prefixes so
    #     e.g. "++x-loud++" is never consumed by the "+" rule.
    shorthand_table: list[tuple[str, str, str]] = [
        # Volume: ~~silent~~, --x-soft--, -soft-, +loud+, ++x-loud++
        ("++", "volume", "x-loud"),
        ("~~", "volume", "silent"),
        ("--", "volume", "x-soft"),
        ("+", "volume", "loud"),
        ("-", "volume", "soft"),
        # Rate: <<x-slow<<, <slow<, >fast>, >>x-fast>>
        ("<<", "rate", "x-slow"),
        (">>", "rate", "x-fast"),
        ("<", "rate", "slow"),
        (">", "rate", "fast"),
        # Pitch: __x-low__, ^high^, ^^x-high^^
        # (single "_" is reduced emphasis, handled elsewhere)
        ("^^", "pitch", "x-high"),
        ("__", "pitch", "x-low"),
        ("^", "pitch", "high"),
    ]

    for marker, field, value in shorthand_table:
        escaped = re.escape(marker)
        match = re.match(rf"^{escaped}(.+?){escaped}$", markup)
        if not match:
            continue

        inner_text = match.group(1)
        prosody = ProsodyAttrs(**{field: value})

        # Nested emphasis keeps its level and the surrounding prosody.
        emphasis = _check_inner_emphasis(inner_text)
        if emphasis:
            return Segment(text=emphasis[0], emphasis=emphasis[1], prosody=prosody)
        return Segment(text=inner_text, prosody=prosody)

    return None
889
+
890
+
891
def _check_inner_emphasis(text: str) -> tuple[str, str | bool] | None:
    """Return (inner_text, emphasis_level) when *text* is fully emphasis-wrapped.

    Tries **strong**, then *moderate* (level ``True``), then _reduced_.
    Returns None when no emphasis wrapper matches the whole string.
    """
    checks: tuple[tuple[re.Pattern, str | bool], ...] = (
        (STRONG_EMPHASIS_PATTERN, "strong"),
        (MODERATE_EMPHASIS_PATTERN, True),
        (REDUCED_EMPHASIS_PATTERN, "reduced"),
    )
    for pattern, level in checks:
        wrapped = pattern.fullmatch(text)
        if wrapped:
            return (wrapped.group(1), level)
    return None
912
+
913
+
914
def _parse_combined_annotations(
    seg: Segment,
    params: str,
    extensions: dict | None = None,
) -> None:
    """Parse combined comma-separated annotations onto an existing segment.

    Only language codes and prosody parts are recognized here; any other
    part is silently skipped.

    NOTE(review): ``extensions`` is accepted but never read in this
    function — confirm whether extension parts were meant to be handled.
    """
    # Split by comma, but be careful with quoted values
    parts = _smart_split(params, ",")

    for part in parts:
        part = part.strip()
        if not part:
            continue

        # Language code ("xx" or "xx-YY"); only the first one wins
        if re.match(r"^[a-z]{2}(-[A-Z]{2})?$", part):
            if not seg.language:
                seg.language = part
            continue

        # Prosody
        if re.match(r"^[vrp]:\s*", part) or re.match(r"^(volume|rate|pitch):", part):
            prosody = _parse_prosody_annotation(part)
            if seg.prosody:
                # Merge without overwriting values set by earlier parts
                if prosody.volume and not seg.prosody.volume:
                    seg.prosody.volume = prosody.volume
                if prosody.rate and not seg.prosody.rate:
                    seg.prosody.rate = prosody.rate
                if prosody.pitch and not seg.prosody.pitch:
                    seg.prosody.pitch = prosody.pitch
            else:
                seg.prosody = prosody
947
+
948
+
949
+ def _smart_split(s: str, delimiter: str) -> list[str]:
950
+ """Split string by delimiter, respecting quoted strings."""
951
+ parts = []
952
+ current = ""
953
+ in_quotes = False
954
+ quote_char = None
955
+
956
+ for char in s:
957
+ if char in ('"', "'") and not in_quotes:
958
+ in_quotes = True
959
+ quote_char = char
960
+ current += char
961
+ elif char == quote_char and in_quotes:
962
+ in_quotes = False
963
+ quote_char = None
964
+ current += char
965
+ elif char == delimiter and not in_quotes:
966
+ parts.append(current)
967
+ current = ""
968
+ else:
969
+ current += char
970
+
971
+ if current:
972
+ parts.append(current)
973
+
974
+ return parts
975
+
976
+
977
# ═══════════════════════════════════════════════════════════════════════════════
# BACKWARD COMPATIBILITY
# ═══════════════════════════════════════════════════════════════════════════════

# Re-export old names for compatibility: imports of SSMDSegment/SSMDSentence
# from older releases keep working.
SSMDSegment = Segment
SSMDSentence = Sentence
984
+
985
+
986
def parse_sentences(
    ssmd_text: str,
    *,
    capabilities: "TTSCapabilities | str | None" = None,
    include_default_voice: bool = True,
    sentence_detection: bool = True,
    language: str = "en",
    model_size: str | None = None,
    spacy_model: str | None = None,
    use_spacy: bool | None = None,
    heading_levels: dict | None = None,
    extensions: dict | None = None,
) -> list[Sentence]:
    """Parse SSMD text into sentences (backward compatible API).

    Thin wrapper around :func:`parse_ssmd` using the old parameter names.

    Args:
        ssmd_text: SSMD formatted text to parse.
        capabilities: TTS capabilities or preset name.
        include_default_voice: If False, drop sentences without voice context.
        sentence_detection: Enable/disable sentence splitting.
        language: Language code for sentence detection.
        model_size: Size of spacy model (sm/md/lg).
        spacy_model: Full spacy model name (deprecated, use model_size).
        use_spacy: Force use of spacy for sentence detection.
        heading_levels: Custom heading configurations.
        extensions: Custom extension handlers.

    Returns:
        List of Sentence objects.
    """
    # Deprecated spacy_model is reduced to its trailing size token
    # (e.g. "en_core_web_sm" -> "sm") when model_size is not given.
    if model_size:
        size = model_size
    elif spacy_model:
        size = spacy_model.split("_")[-1]
    else:
        size = None

    parsed = parse_ssmd(
        ssmd_text,
        capabilities=capabilities,
        sentence_detection=sentence_detection,
        language=language,
        model_size=size,
        use_spacy=use_spacy,
        heading_levels=heading_levels,
        extensions=extensions,
    )

    if include_default_voice:
        return parsed
    return [s for s in parsed if s.voice is not None]
1034
+
1035
+
1036
def parse_segments(
    ssmd_text: str,
    *,
    capabilities: "TTSCapabilities | str | None" = None,
    voice_context: VoiceAttrs | None = None,
) -> list[Segment]:
    """Parse SSMD text into segments (backward compatible API).

    NOTE(review): ``voice_context`` is accepted for compatibility but is
    not forwarded to the parser — confirm callers do not rely on it.
    """
    caps = _resolve_capabilities(capabilities)
    return _parse_segments(ssmd_text, capabilities=caps)
1045
+
1046
+
1047
def parse_voice_blocks(ssmd_text: str) -> list[tuple[VoiceAttrs | None, str]]:
    """Parse SSMD text into voice blocks (backward compatible API).

    Thin alias for the private ``_split_voice_blocks`` helper.
    """
    return _split_voice_blocks(ssmd_text)