ssmd 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssmd/__init__.py +189 -0
- ssmd/_version.py +34 -0
- ssmd/capabilities.py +277 -0
- ssmd/document.py +918 -0
- ssmd/formatter.py +244 -0
- ssmd/parser.py +1049 -0
- ssmd/parser_types.py +41 -0
- ssmd/py.typed +0 -0
- ssmd/segment.py +720 -0
- ssmd/sentence.py +270 -0
- ssmd/ssml_conversions.py +124 -0
- ssmd/ssml_parser.py +599 -0
- ssmd/types.py +122 -0
- ssmd/utils.py +333 -0
- ssmd/xsampa_to_ipa.txt +174 -0
- ssmd-0.5.3.dist-info/METADATA +1210 -0
- ssmd-0.5.3.dist-info/RECORD +20 -0
- ssmd-0.5.3.dist-info/WHEEL +5 -0
- ssmd-0.5.3.dist-info/licenses/LICENSE +21 -0
- ssmd-0.5.3.dist-info/top_level.txt +1 -0
ssmd/formatter.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""SSMD formatting utilities for properly formatted output.
|
|
2
|
+
|
|
3
|
+
This module provides utilities to format parsed SSMD sentences with proper
|
|
4
|
+
line breaks, paragraph spacing, and structural elements according to SSMD
|
|
5
|
+
formatting conventions.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from ssmd.segment import Segment
|
|
9
|
+
from ssmd.sentence import Sentence
|
|
10
|
+
from ssmd.ssml_conversions import SSMD_BREAK_STRENGTH_MAP
|
|
11
|
+
from ssmd.types import BreakAttrs, VoiceAttrs
|
|
12
|
+
|
|
13
|
+
# Legacy names kept so that code importing the old SSMD-prefixed classes
# from this module keeps working; each is the very same class object.
SSMDSegment = Segment
SSMDSentence = Sentence
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _format_segment(segment: Segment) -> str:
|
|
19
|
+
"""Format a single segment to SSMD (backward compatibility wrapper).
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
segment: Segment object to format
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
Formatted SSMD string for this segment
|
|
26
|
+
"""
|
|
27
|
+
return segment.to_ssmd().strip()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _format_sentence(sentence: Sentence) -> str:
    """Render a sentence's content (backward compatibility wrapper).

    Delegates directly to ``_format_sentence_content``; kept under the old
    name for callers that still use it.

    Args:
        sentence: Sentence object to render

    Returns:
        Whatever ``_format_sentence_content`` returns for ``sentence``
    """
    formatted = _format_sentence_content(sentence)
    return formatted
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def format_ssmd(sentences: list[Sentence]) -> str:
    """Format parsed SSMD sentences with proper line breaks.

    Conventions applied by this function:

    - Each sentence is written on its own line.
    - Breaks attached to the first segment of a sentence (``breaks_before``)
      are appended to the line of the previous sentence, even when a
      paragraph break or a voice directive separates the two sentences.
    - A sentence flagged ``is_paragraph_end`` is followed by a blank line.
    - A voice directive is emitted on its own line, surrounded by blank
      lines, whenever a sentence's voice differs from the last one emitted.
    - Runs of blank lines are collapsed to a single blank line.

    Inline (mid-sentence) breaks and sentence-level trailing breaks are
    produced by ``_format_sentence_content``.

    Args:
        sentences: List of parsed Sentence objects

    Returns:
        Formatted SSMD string ending in exactly one newline, or "" when
        there is nothing to output

    Example:
        >>> from ssmd.parser import parse_sentences
        >>> sentences = parse_sentences("Hello. ...s How are you?")
        >>> formatted = format_ssmd(sentences)
        >>> print(formatted)
        Hello. ...s
        How are you?
    """
    if not sentences:
        return ""

    output_lines: list[str] = []
    previous_voice = None
    # Index in output_lines of the most recent sentence line (-1 until one
    # exists). Boundary breaks must attach to that line, never to a blank
    # separator or a voice directive.
    last_sentence_index = -1

    for i, sentence in enumerate(sentences):
        # Resolve boundary breaks BEFORE emitting a voice directive, so the
        # marker lands on the previous sentence rather than on the blank
        # line that follows a directive or a paragraph end.
        leading_marker = ""
        if i > 0 and sentence.segments and sentence.segments[0].breaks_before:
            break_marker = _format_breaks(sentence.segments[0].breaks_before)
            if last_sentence_index >= 0:
                output_lines[last_sentence_index] += " " + break_marker
            else:
                # No earlier sentence line exists; keep the break in front
                # of this sentence instead of losing or misplacing it.
                leading_marker = break_marker

        # Voice changed: output the directive on its own line.
        if sentence.voice != previous_voice and sentence.voice is not None:
            voice_directive = _format_voice_directive(sentence.voice)
            if voice_directive:
                # Blank line before the directive unless one is already there
                if output_lines and output_lines[-1] != "":
                    output_lines.append("")
                output_lines.append(voice_directive)
                output_lines.append("")  # Blank line after voice directive
            previous_voice = sentence.voice

        # Sentence content only; voice directives are handled above.
        sentence_text = _format_sentence_content(sentence)
        if leading_marker:
            sentence_text = f"{leading_marker} {sentence_text}".strip()

        if sentence_text:
            output_lines.append(sentence_text)
            last_sentence_index = len(output_lines) - 1

        if sentence.is_paragraph_end:
            output_lines.append("")  # Extra blank line for paragraph

    result = "\n".join(output_lines)

    # Collapse multiple consecutive blank lines (max 1 blank line)
    while "\n\n\n" in result:
        result = result.replace("\n\n\n", "\n\n")

    # Strip before testing for emptiness so whitespace-only output becomes
    # "" rather than a lone newline.
    result = result.rstrip()
    return result + "\n" if result else ""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _format_sentence_content(sentence: Sentence) -> str:
|
|
120
|
+
"""Format a single sentence's content (segments only, no voice directive).
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
sentence: Sentence object to format
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
Formatted sentence text with inline and trailing breaks
|
|
127
|
+
"""
|
|
128
|
+
if not sentence.segments:
|
|
129
|
+
return ""
|
|
130
|
+
|
|
131
|
+
# Build segments using their to_ssmd() method
|
|
132
|
+
result_parts: list[str] = []
|
|
133
|
+
|
|
134
|
+
for _i, segment in enumerate(sentence.segments):
|
|
135
|
+
# Format the segment using its to_ssmd() method
|
|
136
|
+
segment_text = segment.to_ssmd()
|
|
137
|
+
|
|
138
|
+
# Preserve the trailing space if segment has breaks_after
|
|
139
|
+
if segment.breaks_after:
|
|
140
|
+
segment_text = segment_text.rstrip() + " "
|
|
141
|
+
else:
|
|
142
|
+
segment_text = segment_text.strip()
|
|
143
|
+
|
|
144
|
+
if not segment_text.strip():
|
|
145
|
+
continue
|
|
146
|
+
|
|
147
|
+
# Add this segment
|
|
148
|
+
result_parts.append(segment_text)
|
|
149
|
+
|
|
150
|
+
# Join segments intelligently
|
|
151
|
+
sentence_text = ""
|
|
152
|
+
for i, part in enumerate(result_parts):
|
|
153
|
+
if i == 0:
|
|
154
|
+
sentence_text = part
|
|
155
|
+
elif part.startswith("..."):
|
|
156
|
+
# This is a break marker - append without extra space
|
|
157
|
+
sentence_text += part
|
|
158
|
+
elif i > 0 and _ends_with_break_marker(result_parts[i - 1]):
|
|
159
|
+
# Previous part ends with break marker, already has space
|
|
160
|
+
sentence_text += part
|
|
161
|
+
elif i > 0 and result_parts[i - 1].endswith((" ", "\n")):
|
|
162
|
+
# Previous part ends with whitespace
|
|
163
|
+
sentence_text += part
|
|
164
|
+
else:
|
|
165
|
+
# Normal text segment - add space
|
|
166
|
+
sentence_text += " " + part
|
|
167
|
+
|
|
168
|
+
# Add sentence-level breaks at end of line
|
|
169
|
+
if sentence.breaks_after:
|
|
170
|
+
break_marker = _format_breaks(sentence.breaks_after)
|
|
171
|
+
sentence_text += " " + break_marker
|
|
172
|
+
|
|
173
|
+
return sentence_text.strip()
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _ends_with_break_marker(text: str) -> bool:
|
|
177
|
+
"""Check if text ends with a break marker like ...s, ...500ms, etc."""
|
|
178
|
+
import re
|
|
179
|
+
|
|
180
|
+
# Break marker pattern: ... followed by strength letter or time
|
|
181
|
+
return bool(re.search(r"\.\.\.[swcpn]$|\.\.\.\d+(ms|s)$", text.rstrip()))
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _format_breaks(breaks: list[BreakAttrs]) -> str:
|
|
185
|
+
"""Convert break attributes to SSMD break markers.
|
|
186
|
+
|
|
187
|
+
Args:
|
|
188
|
+
breaks: List of BreakAttrs objects
|
|
189
|
+
|
|
190
|
+
Returns:
|
|
191
|
+
SSMD break marker string (e.g., "...s", "...500ms")
|
|
192
|
+
"""
|
|
193
|
+
if not breaks:
|
|
194
|
+
return ""
|
|
195
|
+
|
|
196
|
+
# Format each break
|
|
197
|
+
break_markers = []
|
|
198
|
+
for brk in breaks:
|
|
199
|
+
if brk.time:
|
|
200
|
+
# Time-based break: ...500ms or ...2s
|
|
201
|
+
break_markers.append(f"...{brk.time}")
|
|
202
|
+
elif brk.strength:
|
|
203
|
+
# Strength-based break
|
|
204
|
+
marker = SSMD_BREAK_STRENGTH_MAP.get(brk.strength, "...s")
|
|
205
|
+
break_markers.append(marker)
|
|
206
|
+
else:
|
|
207
|
+
# Default to strong break
|
|
208
|
+
break_markers.append("...s")
|
|
209
|
+
|
|
210
|
+
return " ".join(break_markers)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _format_voice_directive(voice: VoiceAttrs) -> str:
|
|
214
|
+
"""Format a voice directive.
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
voice: VoiceAttrs object
|
|
218
|
+
|
|
219
|
+
Returns:
|
|
220
|
+
Voice directive string
|
|
221
|
+
(e.g., "@voice: sarah" or "@voice: fr-FR, gender: female")
|
|
222
|
+
"""
|
|
223
|
+
if not voice:
|
|
224
|
+
return ""
|
|
225
|
+
|
|
226
|
+
# Build parts for the directive
|
|
227
|
+
parts = []
|
|
228
|
+
|
|
229
|
+
# Add name or language as first part
|
|
230
|
+
if voice.name:
|
|
231
|
+
parts.append(voice.name)
|
|
232
|
+
elif voice.language:
|
|
233
|
+
parts.append(voice.language)
|
|
234
|
+
|
|
235
|
+
# Add optional attributes
|
|
236
|
+
if voice.gender:
|
|
237
|
+
parts.append(f"gender: {voice.gender}")
|
|
238
|
+
if voice.variant:
|
|
239
|
+
parts.append(f"variant: {voice.variant}")
|
|
240
|
+
|
|
241
|
+
if parts:
|
|
242
|
+
return f"@voice: {', '.join(parts)}"
|
|
243
|
+
|
|
244
|
+
return ""
|