PyPI - ssmd - Versions diffs - 0.5.3__py3-none-any.whl - Mend

ssmd 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

ssmd/__init__.py +189 -0
ssmd/_version.py +34 -0
ssmd/capabilities.py +277 -0
ssmd/document.py +918 -0
ssmd/formatter.py +244 -0
ssmd/parser.py +1049 -0
ssmd/parser_types.py +41 -0
ssmd/py.typed +0 -0
ssmd/segment.py +720 -0
ssmd/sentence.py +270 -0
ssmd/ssml_conversions.py +124 -0
ssmd/ssml_parser.py +599 -0
ssmd/types.py +122 -0
ssmd/utils.py +333 -0
ssmd/xsampa_to_ipa.txt +174 -0
ssmd-0.5.3.dist-info/METADATA +1210 -0
ssmd-0.5.3.dist-info/RECORD +20 -0
ssmd-0.5.3.dist-info/WHEEL +5 -0
ssmd-0.5.3.dist-info/licenses/LICENSE +21 -0
ssmd-0.5.3.dist-info/top_level.txt +1 -0

ssmd/types.py ADDED Viewed

@@ -0,0 +1,122 @@
+"""Data types for SSMD.
+This module defines the core data structures used throughout the SSMD library.
+"""
+from dataclasses import dataclass
+from typing import Literal
+@dataclass
+class VoiceAttrs:
+    """Voice attributes for TTS voice selection.
+    Attributes:
+        name: Voice name (e.g., "Joanna", "en-US-Wavenet-A")
+        language: BCP-47 language code (e.g., "en-US", "fr-FR")
+        gender: Voice gender
+        variant: Variant number for disambiguation
+    """
+    name: str | None = None
+    language: str | None = None
+    gender: Literal["male", "female", "neutral"] | None = None
+    variant: int | None = None
+@dataclass
+class ProsodyAttrs:
+    """Prosody attributes for volume, rate, and pitch control.
+    Attributes:
+        volume: Volume level ('silent', 'x-soft', 'soft', 'medium', 'loud',
+                'x-loud', or relative like '+10dB')
+        rate: Speech rate ('x-slow', 'slow', 'medium', 'fast', 'x-fast',
+              or relative like '+20%')
+        pitch: Pitch level ('x-low', 'low', 'medium', 'high', 'x-high',
+               or relative like '-5%')
+    """
+    volume: str | None = None
+    rate: str | None = None
+    pitch: str | None = None
+@dataclass
+class BreakAttrs:
+    """Break/pause attributes.
+    Attributes:
+        time: Time duration (e.g., '500ms', '2s')
+        strength: Break strength ('none', 'x-weak', 'medium', 'strong', 'x-strong')
+    """
+    time: str | None = None
+    strength: str | None = None
+@dataclass
+class SayAsAttrs:
+    """Say-as attributes for text interpretation.
+    Attributes:
+        interpret_as: Interpretation type ('telephone', 'date', 'cardinal',
+                      'ordinal', 'characters', 'expletive', etc.)
+        format: Optional format string (e.g., 'dd.mm.yyyy' for dates)
+        detail: Optional detail level (e.g., '2' for verbosity)
+    """
+    interpret_as: str
+    format: str | None = None
+    detail: str | None = None
+@dataclass
+class AudioAttrs:
+    """Audio file attributes.
+    Attributes:
+        src: Audio file URL or path
+        alt_text: Fallback text if audio cannot be played
+        clip_begin: Start time for playback (e.g., "0s", "500ms")
+        clip_end: End time for playback (e.g., "10s", "5000ms")
+        speed: Playback speed as percentage (e.g., "150%", "80%")
+        repeat_count: Number of times to repeat audio
+        repeat_dur: Total duration for repetitions (e.g., "10s")
+        sound_level: Volume adjustment in dB (e.g., "+6dB", "-3dB")
+    """
+    src: str
+    alt_text: str | None = None
+    clip_begin: str | None = None
+    clip_end: str | None = None
+    speed: str | None = None
+    repeat_count: int | None = None
+    repeat_dur: str | None = None
+    sound_level: str | None = None
+@dataclass
+class PhonemeAttrs:
+    """Phoneme pronunciation attributes.
+    Attributes:
+        ph: Phonetic pronunciation string
+        alphabet: Phonetic alphabet (ipa or x-sampa)
+    """
+    ph: str
+    alphabet: str = "ipa"
+# Heading configuration type
+HeadingEffect = tuple[str, str | dict[str, str]]  # e.g., ('emphasis', 'strong')
+HeadingConfig = dict[int, list[HeadingEffect]]
+# Default heading configurations
+DEFAULT_HEADING_LEVELS: HeadingConfig = {
+    1: [("pause_before", "300ms"), ("emphasis", "strong"), ("pause", "300ms")],
+    2: [("pause_before", "75ms"), ("emphasis", "moderate"), ("pause", "75ms")],
+    3: [("pause_before", "50ms"), ("pause", "50ms")],
+}

ssmd/utils.py ADDED Viewed

@@ -0,0 +1,333 @@
+"""Utility functions for SSMD processing."""
+import html
+import re
+def escape_xml(text: str) -> str:
+    """Escape XML special characters.
+    Args:
+        text: Input text to escape
+    Returns:
+        Text with XML entities escaped
+    """
+    return html.escape(text, quote=True)
+def unescape_xml(text: str) -> str:
+    """Unescape XML entities.
+    Args:
+        text: Text with XML entities
+    Returns:
+        Unescaped text
+    """
+    return html.unescape(text)
+def format_xml(xml_text: str, pretty: bool = True) -> str:
+    """Format XML with optional pretty printing.
+    Args:
+        xml_text: XML string to format
+        pretty: Enable pretty printing
+    Returns:
+        Formatted XML string
+    """
+    if not pretty:
+        return xml_text
+    try:
+        from xml.dom import minidom
+        dom = minidom.parseString(xml_text)
+        return dom.toprettyxml(indent="  ", encoding=None)
+    except Exception:
+        # Fallback: return as-is if parsing fails
+        return xml_text
+def extract_sentences(ssml: str) -> list[str]:
+    """Extract sentences from SSML.
+    Looks for <s> tags or splits by sentence boundaries.
+    Args:
+        ssml: SSML string
+    Returns:
+        List of SSML sentence strings
+    """
+    # First try to extract <s> tags
+    s_tag_pattern = re.compile(r"<s>(.*?)</s>", re.DOTALL)
+    sentences = s_tag_pattern.findall(ssml)
+    if sentences:
+        return sentences
+    # Fallback: extract <p> tags
+    p_tag_pattern = re.compile(r"<p>(.*?)</p>", re.DOTALL)
+    paragraphs = p_tag_pattern.findall(ssml)
+    if paragraphs:
+        return paragraphs
+    # Last resort: remove <speak> wrapper and return as single sentence
+    clean = re.sub(r"</?speak>", "", ssml).strip()
+    return [clean] if clean else []
+# Unicode private use area characters for placeholders
+# Using \uf000+ range which is not transformed by phrasplit/spaCy
+# (The \ue000-\ue00f range gets converted to dots/ellipses by some NLP tools)
+_PLACEHOLDER_MAP = {
+    "*": "\uf000",  # ASTERISK
+    "_": "\uf001",  # UNDERSCORE
+    "[": "\uf002",  # LEFT BRACKET
+    "]": "\uf003",  # RIGHT BRACKET
+    ".": "\uf004",  # DOT
+    "@": "\uf005",  # AT SIGN
+    "#": "\uf006",  # HASH
+    "~": "\uf007",  # TILDE
+    "+": "\uf008",  # PLUS
+    "-": "\uf009",  # HYPHEN
+    "<": "\uf00a",  # LESS THAN
+    ">": "\uf00b",  # GREATER THAN
+    "^": "\uf00c",  # CARET
+}
+# Reverse map for unescaping
+_REVERSE_PLACEHOLDER_MAP = {v: k for k, v in _PLACEHOLDER_MAP.items()}
+def escape_ssmd_syntax(
+    text: str,
+    patterns: list[str] | None = None,
+) -> str:
+    """Escape SSMD syntax patterns to prevent interpretation as markup.
+    This is useful when processing plain text or markdown that may contain
+    characters that coincidentally match SSMD syntax patterns. Uses placeholder
+    replacement which is reversed after SSML processing.
+    Args:
+        text: Input text that may contain SSMD-like patterns
+        patterns: List of pattern types to escape. If None, escapes all.
+            Valid values: 'emphasis', 'annotations', 'breaks', 'marks',
+            'headings', 'voice_directives', 'prosody_shorthand'
+    Returns:
+        Text with SSMD patterns replaced with placeholders
+    Example:
+        >>> text = "This *word* should not be emphasized"
+        >>> escape_ssmd_syntax(text)
+        'This \ue000word\ue000 should not be emphasized'
+        >>> text = "Visit [our site](https://example.com)"
+        >>> escaped = escape_ssmd_syntax(text)
+        # Placeholders prevent SSMD interpretation
+        >>> # Selective escaping
+        >>> escape_ssmd_syntax(text, patterns=['emphasis', 'breaks'])
+    """
+    if patterns is None:
+        # Escape all patterns by default
+        patterns = [
+            "emphasis",
+            "annotations",
+            "breaks",
+            "marks",
+            "headings",
+            "voice_directives",
+            "prosody_shorthand",
+        ]
+    result = text
+    # Process patterns in specific order (most specific first)
+    # Replace special characters with placeholders
+    if "voice_directives" in patterns:
+        # Voice directives at line start: @voice: or @voice(
+        result = re.sub(
+            r"^(@)voice([:(])",
+            lambda m: _PLACEHOLDER_MAP["@"] + "voice" + m.group(2),
+            result,
+            flags=re.MULTILINE,
+        )
+    if "headings" in patterns:
+        # Headings at line start: #, ##, ###
+        result = re.sub(
+            r"^(#{1,6})(\s)",
+            lambda m: _PLACEHOLDER_MAP["#"] * len(m.group(1)) + m.group(2),
+            result,
+            flags=re.MULTILINE,
+        )
+    if "emphasis" in patterns:
+        # Strong emphasis: **text**
+        result = re.sub(
+            r"\*\*([^*]+)\*\*",
+            lambda m: _PLACEHOLDER_MAP["*"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["*"] * 2,
+            result,
+        )
+        # Moderate emphasis: *text*
+        result = re.sub(
+            r"\*([^*\n]+)\*",
+            lambda m: _PLACEHOLDER_MAP["*"] + m.group(1) + _PLACEHOLDER_MAP["*"],
+            result,
+        )
+        # Reduced emphasis/pitch: _text_ (but not in middle of words)
+        result = re.sub(
+            r"(?<!\w)_([^_\n]+)_(?!\w)",
+            lambda m: _PLACEHOLDER_MAP["_"] + m.group(1) + _PLACEHOLDER_MAP["_"],
+            result,
+        )
+    if "annotations" in patterns:
+        # Annotations: [text](params) - replace the brackets
+        result = re.sub(
+            r"\[([^\]]+)\]\(([^)]+)\)",
+            lambda m: _PLACEHOLDER_MAP["["]
+            + m.group(1)
+            + _PLACEHOLDER_MAP["]"]
+            + "("
+            + m.group(2)
+            + ")",
+            result,
+        )
+    if "breaks" in patterns:
+        # Breaks: ...n, ...w, ...c, ...s, ...p, ...500ms, ...5s
+        result = re.sub(
+            r"\.\.\.((?:[nwcsp]|\d+(?:ms|s))(?:\s|$))",
+            lambda m: _PLACEHOLDER_MAP["."] * 3 + m.group(1),
+            result,
+        )
+    if "marks" in patterns:
+        # Marks: @word (but not @voice which is handled above)
+        # Use word boundary to avoid matching @domain in emails
+        result = re.sub(
+            r"(?<!\w)@(?!voice)(\w+)",
+            lambda m: _PLACEHOLDER_MAP["@"] + m.group(1),
+            result,
+        )
+    if "prosody_shorthand" in patterns:
+        # Prosody shorthand - paired characters around text
+        # Double character versions first
+        result = re.sub(
+            r"~~([^~\n]+)~~",
+            lambda m: _PLACEHOLDER_MAP["~"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["~"] * 2,
+            result,
+        )
+        result = re.sub(
+            r"\+\+([^+\n]+)\+\+",
+            lambda m: _PLACEHOLDER_MAP["+"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["+"] * 2,
+            result,
+        )
+        result = re.sub(
+            r"--([^-\n]+)--",
+            lambda m: _PLACEHOLDER_MAP["-"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["-"] * 2,
+            result,
+        )
+        result = re.sub(
+            r"<<([^<\n]+)<<",
+            lambda m: _PLACEHOLDER_MAP["<"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["<"] * 2,
+            result,
+        )
+        result = re.sub(
+            r">>([^>\n]+)>>",
+            lambda m: _PLACEHOLDER_MAP[">"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP[">"] * 2,
+            result,
+        )
+        result = re.sub(
+            r"\^\^([^^|\n]+)\^\^",
+            lambda m: _PLACEHOLDER_MAP["^"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["^"] * 2,
+            result,
+        )
+        result = re.sub(
+            r"__([^_\n]+)__",
+            lambda m: _PLACEHOLDER_MAP["_"] * 2
+            + m.group(1)
+            + _PLACEHOLDER_MAP["_"] * 2,
+            result,
+        )
+        # Single character versions
+        result = re.sub(
+            r"~([^~\n]+)~",
+            lambda m: _PLACEHOLDER_MAP["~"] + m.group(1) + _PLACEHOLDER_MAP["~"],
+            result,
+        )
+        result = re.sub(
+            r"\+([^+\n]+)\+",
+            lambda m: _PLACEHOLDER_MAP["+"] + m.group(1) + _PLACEHOLDER_MAP["+"],
+            result,
+        )
+        result = re.sub(
+            r"-([^-\n]+)-",
+            lambda m: _PLACEHOLDER_MAP["-"] + m.group(1) + _PLACEHOLDER_MAP["-"],
+            result,
+        )
+        result = re.sub(
+            r"<([^<\n]+)<",
+            lambda m: _PLACEHOLDER_MAP["<"] + m.group(1) + _PLACEHOLDER_MAP["<"],
+            result,
+        )
+        result = re.sub(
+            r">([^>\n]+)>",
+            lambda m: _PLACEHOLDER_MAP[">"] + m.group(1) + _PLACEHOLDER_MAP[">"],
+            result,
+        )
+        result = re.sub(
+            r"\^([^^\n]+)\^",
+            lambda m: _PLACEHOLDER_MAP["^"] + m.group(1) + _PLACEHOLDER_MAP["^"],
+            result,
+        )
+    return result
+def unescape_ssmd_syntax(text: str) -> str:
+    """Remove placeholder escaping from SSMD syntax.
+    This is used internally to replace placeholders with original characters
+    after TTS processing.
+    Args:
+        text: Text with placeholder-escaped SSMD syntax
+    Returns:
+        Text with placeholders replaced by original characters
+    Example:
+        >>> unescape_ssmd_syntax("This \ue000word\ue000 is escaped")
+        'This *word* is escaped'
+    """
+    result = text
+    # Replace all placeholders with their original characters
+    for placeholder, original in _REVERSE_PLACEHOLDER_MAP.items():
+        result = result.replace(placeholder, original)
+    return result

ssmd/xsampa_to_ipa.txt ADDED Viewed

@@ -0,0 +1,174 @@
+a a
+b b
+b_< ɓ
+c c
+d d
+d` ɖ
+d_< ɗ
+e e
+f f
+g ɡ
+g_< ɠ
+h h
+h\ ɦ
+i i
+j j
+j\ ʝ
+k k
+l l
+l` ɭ
+l\ ɺ
+m m
+n n
+n` ɳ
+o o
+p p
+p\ ɸ
+q q
+r r
+r` ɽ
+r\ ɹ
+r\` ɻ
+s s
+s` ʂ
+s\ ɕ
+t t
+t` ʈ
+u u
+v v
+v\ ʋ
+P ʋ
+w w
+x x
+x\ ɧ
+y y
+z z
+z` ʐ
+z\ ʑ
+A ɑ
+B β
+B\ ʙ
+C ç
+D ð
+E ɛ
+F ɱ
+G ɣ
+G\ ɢ
+G\_< ʛ
+H ɥ
+H\ ʜ
+I ɪ
+I\ ɪ̈
+I\ ɨ̞
+J ɲ
+J\ ɟ
+J\_< ʄ
+K ɬ
+K\ ɮ
+L ʎ
+L\ ʟ
+M ɯ
+M\ ɰ
+N ŋ
+N\ ɴ
+O ɔ
+O\ ʘ
+P ʋ
+v\ ʋ
+Q ɒ
+R ʁ
+R\ ʀ
+S ʃ
+T θ
+U ʊ
+U\ ʊ̈
+U\ ʉ̞
+V ʌ
+W ʍ
+X χ
+X\ ħ
+Y ʏ
+Z ʒ
+. .
+" ˈ
+% ˌ
+' ʲ
+_j ʲ
+: ː
+:\ ˑ
+@ ə
+@\ ɘ
+{ æ
+} ʉ
+1 ɨ
+2 ø
+3 ɜ
+3\ ɞ
+4 ɾ
+5 ɫ
+6 ɐ
+7 ɤ
+8 ɵ
+9 œ
+& ɶ
+? ʔ
+?\ ʕ
+<\ ʢ
+>\ ʡ
+^ ꜛ
+! ꜜ
+!\ ǃ
+| |
+|\ ǀ
+|| ‖
+|\|\ ǁ
+=\ ǂ
+-\ ‿
+_" ̈
+_+ ̟
+_- ̠
+_/ ̌
+_0 ̥
+= ̩
+_= ̩
+_> ʼ
+_?\ ˤ
+_\ ̂
+_^ ̯
+_} ̚
+` ˞
+~ ̃
+_~ ̃
+_A ̘
+_a ̺
+_B ̏
+_B_L ᷅
+_c ̜
+_d ̪
+_e ̴
+_F ̂
+_G ˠ
+_H ́
+_H_T ᷄
+_h ʰ
+_j ʲ
+' ʲ
+_k ̰
+_L ̀
+_l ˡ
+_M ̄
+_m ̻
+_N ̼
+_n ⁿ
+_O ̹
+_o ̞
+_q ̙
+_R ̌
+_R_F ᷈
+_r ̝
+_T ̋
+_t ̤
+_v ̬
+_w ʷ
+_X ̆
+_x ̽