ssmd 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssmd/__init__.py +189 -0
- ssmd/_version.py +34 -0
- ssmd/capabilities.py +277 -0
- ssmd/document.py +918 -0
- ssmd/formatter.py +244 -0
- ssmd/parser.py +1049 -0
- ssmd/parser_types.py +41 -0
- ssmd/py.typed +0 -0
- ssmd/segment.py +720 -0
- ssmd/sentence.py +270 -0
- ssmd/ssml_conversions.py +124 -0
- ssmd/ssml_parser.py +599 -0
- ssmd/types.py +122 -0
- ssmd/utils.py +333 -0
- ssmd/xsampa_to_ipa.txt +174 -0
- ssmd-0.5.3.dist-info/METADATA +1210 -0
- ssmd-0.5.3.dist-info/RECORD +20 -0
- ssmd-0.5.3.dist-info/WHEEL +5 -0
- ssmd-0.5.3.dist-info/licenses/LICENSE +21 -0
- ssmd-0.5.3.dist-info/top_level.txt +1 -0
ssmd/document.py
ADDED
|
@@ -0,0 +1,918 @@
|
|
|
1
|
+
"""SSMD Document - Main document container with rich TTS features."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from typing import TYPE_CHECKING, Any, overload
|
|
5
|
+
|
|
6
|
+
from ssmd.formatter import format_ssmd
|
|
7
|
+
from ssmd.parser import parse_sentences
|
|
8
|
+
from ssmd.utils import extract_sentences, format_xml
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from ssmd.capabilities import TTSCapabilities
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Document:
|
|
15
|
+
"""Main SSMD document container with incremental building and editing.
|
|
16
|
+
|
|
17
|
+
This is the primary interface for working with SSMD documents. It provides
|
|
18
|
+
a clean, document-centric API for creating, editing, and exporting TTS content.
|
|
19
|
+
|
|
20
|
+
The Document stores content as fragments (pieces of text) with separators
|
|
21
|
+
between them, allowing efficient incremental building and editing while
|
|
22
|
+
preserving the document structure.
|
|
23
|
+
|
|
24
|
+
Example:
|
|
25
|
+
Basic usage::
|
|
26
|
+
|
|
27
|
+
import ssmd
|
|
28
|
+
|
|
29
|
+
# Create and build a document
|
|
30
|
+
doc = ssmd.Document()
|
|
31
|
+
doc.add_sentence("Hello world!")
|
|
32
|
+
doc.add_sentence("This is SSMD.")
|
|
33
|
+
|
|
34
|
+
# Export to different formats
|
|
35
|
+
ssml = doc.to_ssml()
|
|
36
|
+
text = doc.to_text()
|
|
37
|
+
|
|
38
|
+
# Iterate for streaming TTS
|
|
39
|
+
for sentence in doc.sentences():
|
|
40
|
+
tts_engine.speak(sentence)
|
|
41
|
+
|
|
42
|
+
Advanced usage::
|
|
43
|
+
|
|
44
|
+
# Load from SSML
|
|
45
|
+
doc = ssmd.Document.from_ssml("<speak>Hello</speak>")
|
|
46
|
+
|
|
47
|
+
# Edit the document
|
|
48
|
+
doc[0] = "Modified content"
|
|
49
|
+
doc.add_paragraph("New paragraph")
|
|
50
|
+
|
|
51
|
+
# Access raw content
|
|
52
|
+
print(doc.ssmd) # Raw SSMD markdown
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
def __init__(
|
|
56
|
+
self,
|
|
57
|
+
content: str = "",
|
|
58
|
+
config: dict[str, Any] | None = None,
|
|
59
|
+
capabilities: "TTSCapabilities | str | None" = None,
|
|
60
|
+
escape_syntax: bool = False,
|
|
61
|
+
escape_patterns: list[str] | None = None,
|
|
62
|
+
) -> None:
|
|
63
|
+
"""Initialize a new SSMD document.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
content: Optional initial SSMD content
|
|
67
|
+
config: Configuration dictionary with options:
|
|
68
|
+
- skip (list): Processor names to skip
|
|
69
|
+
- output_speak_tag (bool): Wrap in <speak> tags (default: True)
|
|
70
|
+
- pretty_print (bool): Format XML output (default: False)
|
|
71
|
+
- auto_sentence_tags (bool): Auto-wrap sentences (default: False)
|
|
72
|
+
- heading_levels (dict): Custom heading configurations
|
|
73
|
+
- extensions (dict): Registered extension handlers
|
|
74
|
+
- sentence_model_size (str): spaCy model size for sentence
|
|
75
|
+
detection ("sm", "md", "lg", "trf"). Default: "sm"
|
|
76
|
+
- sentence_spacy_model (str): Custom spaCy model name
|
|
77
|
+
(overrides sentence_model_size)
|
|
78
|
+
- sentence_use_spacy (bool): If False, use fast regex splitting
|
|
79
|
+
instead of spaCy. Default: True
|
|
80
|
+
capabilities: TTS capabilities (TTSCapabilities instance or
|
|
81
|
+
preset name). Presets: 'espeak', 'pyttsx3', 'google',
|
|
82
|
+
'polly', 'azure', 'minimal', 'full'
|
|
83
|
+
escape_syntax: If True, escape SSMD-like syntax in content to
|
|
84
|
+
prevent interpretation as markup. Useful for plain text or
|
|
85
|
+
markdown that may coincidentally contain SSMD patterns.
|
|
86
|
+
escape_patterns: List of specific pattern types to escape when
|
|
87
|
+
escape_syntax=True. If None, escapes all patterns.
|
|
88
|
+
Valid values: 'emphasis', 'annotations', 'breaks', 'marks',
|
|
89
|
+
'headings', 'voice_directives', 'prosody_shorthand'
|
|
90
|
+
|
|
91
|
+
Example:
|
|
92
|
+
>>> doc = ssmd.Document("Hello *world*!")
|
|
93
|
+
>>> doc = ssmd.Document(capabilities='pyttsx3')
|
|
94
|
+
>>> doc = ssmd.Document("Text", config={'auto_sentence_tags': True})
|
|
95
|
+
>>> # Fast sentence detection (no spaCy required)
|
|
96
|
+
>>> doc = ssmd.Document(config={'sentence_use_spacy': False})
|
|
97
|
+
>>> # High quality sentence detection
|
|
98
|
+
>>> doc = ssmd.Document(config={'sentence_model_size': 'lg'})
|
|
99
|
+
>>> # Escape SSMD syntax for plain text/markdown
|
|
100
|
+
>>> doc = ssmd.Document(markdown, escape_syntax=True)
|
|
101
|
+
>>> # Selective escaping
|
|
102
|
+
>>> doc = ssmd.Document(
|
|
103
|
+
... text,
|
|
104
|
+
... escape_syntax=True,
|
|
105
|
+
... escape_patterns=['emphasis', 'annotations']
|
|
106
|
+
... )
|
|
107
|
+
"""
|
|
108
|
+
self._fragments: list[str] = []
|
|
109
|
+
self._separators: list[str] = []
|
|
110
|
+
self._config = config or {}
|
|
111
|
+
self._capabilities = capabilities
|
|
112
|
+
self._capabilities_obj: TTSCapabilities | None = None # Resolved capabilities
|
|
113
|
+
self._cached_ssml: str | None = None
|
|
114
|
+
self._cached_sentences: list[str] | None = None
|
|
115
|
+
self._escape_syntax = escape_syntax
|
|
116
|
+
self._escape_patterns = escape_patterns
|
|
117
|
+
|
|
118
|
+
# Add initial content if provided
|
|
119
|
+
if content:
|
|
120
|
+
if escape_syntax:
|
|
121
|
+
from ssmd.utils import escape_ssmd_syntax
|
|
122
|
+
|
|
123
|
+
content = escape_ssmd_syntax(content, patterns=escape_patterns)
|
|
124
|
+
self._fragments.append(content)
|
|
125
|
+
|
|
126
|
+
@classmethod
def from_ssml(
    cls,
    ssml: str,
    config: dict[str, Any] | None = None,
    capabilities: "TTSCapabilities | str | None" = None,
) -> "Document":
    """Build a Document by converting an SSML string to SSMD.

    Args:
        ssml: SSML XML string to convert.
        config: Optional configuration; also handed to the SSML parser
            (an empty dict is used for the parser when omitted).
        capabilities: Optional TTS capabilities for the new document.

    Returns:
        New instance of ``cls`` holding the converted SSMD content.

    Example:
        >>> ssml = '<speak><emphasis>Hello</emphasis> world</speak>'
        >>> doc = ssmd.Document.from_ssml(ssml)
        >>> doc.ssmd
        '*Hello* world'
    """
    # Imported here rather than at module top, as in the original.
    from ssmd.ssml_parser import SSMLParser

    parser_config = config if config else {}
    converted = SSMLParser(parser_config).to_ssmd(ssml)
    return cls(converted, config, capabilities)
|
|
154
|
+
|
|
155
|
+
@classmethod
|
|
156
|
+
def from_text(
|
|
157
|
+
cls,
|
|
158
|
+
text: str,
|
|
159
|
+
config: dict[str, Any] | None = None,
|
|
160
|
+
capabilities: "TTSCapabilities | str | None" = None,
|
|
161
|
+
) -> "Document":
|
|
162
|
+
"""Create a Document from plain text.
|
|
163
|
+
|
|
164
|
+
This is essentially the same as Document(text), but provides
|
|
165
|
+
a symmetric API with from_ssml().
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
text: Plain text or SSMD content
|
|
169
|
+
config: Optional configuration parameters
|
|
170
|
+
capabilities: Optional TTS capabilities
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
New Document instance
|
|
174
|
+
|
|
175
|
+
Example:
|
|
176
|
+
>>> doc = ssmd.Document.from_text("Hello world")
|
|
177
|
+
>>> doc.ssmd
|
|
178
|
+
'Hello world'
|
|
179
|
+
"""
|
|
180
|
+
return cls(text, config, capabilities)
|
|
181
|
+
|
|
182
|
+
# ═══════════════════════════════════════════════════════════
|
|
183
|
+
# BUILDING METHODS
|
|
184
|
+
# ═══════════════════════════════════════════════════════════
|
|
185
|
+
|
|
186
|
+
def add(self, text: str) -> "Document":
    """Append text directly after the existing content.

    The new fragment is joined to the previous one with an empty
    separator, so no whitespace is inserted. Empty text is ignored and
    leaves the caches untouched.

    Args:
        text: SSMD text to append.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc = ssmd.Document("Hello")
        >>> doc.add(" world")
        >>> doc.ssmd
        'Hello world'
    """
    if text:
        self._invalidate_cache()
        # The very first fragment has nothing before it to separate from.
        if self._fragments:
            self._separators.append("")
        self._fragments.append(text)
    return self
|
|
216
|
+
|
|
217
|
+
def add_sentence(self, text: str) -> "Document":
    """Append text on a new line.

    The new fragment is joined to the previous one with a single
    newline. Empty text is ignored and leaves the caches untouched.

    Args:
        text: SSMD text to append.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc = ssmd.Document("First sentence.")
        >>> doc.add_sentence("Second sentence.")
        >>> doc.ssmd
        'First sentence.\\nSecond sentence.'
    """
    if text:
        self._invalidate_cache()
        # The very first fragment has nothing before it to separate from.
        if self._fragments:
            self._separators.append("\n")
        self._fragments.append(text)
    return self
|
|
246
|
+
|
|
247
|
+
def add_paragraph(self, text: str) -> "Document":
    """Append text as a new paragraph.

    The new fragment is joined to the previous one with a blank line
    (two newlines). Empty text is ignored and leaves the caches
    untouched.

    Args:
        text: SSMD text to append.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc = ssmd.Document("First paragraph.")
        >>> doc.add_paragraph("Second paragraph.")
        >>> doc.ssmd
        'First paragraph.\\n\\nSecond paragraph.'
    """
    if text:
        self._invalidate_cache()
        # The very first fragment has nothing before it to separate from.
        if self._fragments:
            self._separators.append("\n\n")
        self._fragments.append(text)
    return self
|
|
276
|
+
|
|
277
|
+
# ═══════════════════════════════════════════════════════════
|
|
278
|
+
# EXPORT METHODS
|
|
279
|
+
# ═══════════════════════════════════════════════════════════
|
|
280
|
+
|
|
281
|
+
def to_ssml(self) -> str:
    """Render the document as SSML.

    The result is cached until the document, its config or its
    capabilities change.

    Returns:
        SSML XML string.

    Example:
        >>> doc = ssmd.Document("Hello *world*!")
        >>> doc.to_ssml()
        '<speak>Hello <emphasis>world</emphasis>!</speak>'
    """
    if self._cached_ssml is not None:
        return self._cached_ssml

    source = self.ssmd
    caps = self._get_capabilities()

    settings = self._config
    wrap_in_speak = settings.get("output_speak_tag", True)
    wrap_sentences = settings.get("auto_sentence_tags", False)
    prettify = settings.get("pretty_print", False)
    extension_handlers = settings.get("extensions")
    heading_cfg = settings.get("heading_levels")

    # Sentence-detection options are forwarded as-is (None when unset).
    parsed = parse_sentences(
        source,
        capabilities=caps,
        model_size=settings.get("sentence_model_size"),
        spacy_model=settings.get("sentence_spacy_model"),
        use_spacy=settings.get("sentence_use_spacy"),
        heading_levels=heading_cfg,
        extensions=extension_handlers,
    )

    rendered = "".join(
        item.to_ssml(
            capabilities=caps,
            extensions=extension_handlers,
            wrap_sentence=wrap_sentences,
        )
        for item in parsed
    )

    if wrap_in_speak:
        rendered = f"<speak>{rendered}</speak>"

    # Content escaped at construction time is unescaped only after the
    # SSML has been generated, so the output shows the original characters.
    if self._escape_syntax:
        from ssmd.utils import unescape_ssmd_syntax

        rendered = unescape_ssmd_syntax(rendered)

    if prettify:
        rendered = format_xml(rendered, pretty=True)

    self._cached_ssml = rendered
    return rendered
|
|
351
|
+
|
|
352
|
+
def to_ssmd(self) -> str:
    """Export the document as formatted SSMD.

    Non-blank content is parsed into sentences and re-emitted through
    ``format_ssmd`` with trailing newlines removed. Blank content is
    returned unchanged.

    Returns:
        SSMD markdown string.

    Example:
        >>> doc = ssmd.Document.from_ssml('<speak><emphasis>Hi</emphasis></speak>')
        >>> doc.to_ssmd()
        '*Hi*'
    """
    source = self.ssmd
    if source.strip():
        return format_ssmd(parse_sentences(source)).rstrip("\n")
    # Nothing but whitespace (or nothing at all): hand it back as-is.
    return source
|
|
372
|
+
|
|
373
|
+
def to_text(self) -> str:
    """Export the document as plain text.

    The content is parsed into sentences, each sentence is converted
    with its ``to_text()`` method, and the results are joined with a
    single space.

    Returns:
        Plain text string.

    Example:
        >>> doc = ssmd.Document("Hello *world* @marker!")
        >>> doc.to_text()
        'Hello world!'
    """
    # NOTE(review): unlike to_ssml(), this path never calls
    # unescape_ssmd_syntax when escape_syntax=True - confirm whether
    # escaped content needs restoring here as well.
    return " ".join(item.to_text() for item in parse_sentences(self.ssmd))
|
|
390
|
+
|
|
391
|
+
# ═══════════════════════════════════════════════════════════
|
|
392
|
+
# PROPERTIES
|
|
393
|
+
# ═══════════════════════════════════════════════════════════
|
|
394
|
+
|
|
395
|
+
@property
def ssmd(self) -> str:
    """Raw SSMD content of the document.

    Built by interleaving the stored fragments with the separators
    recorded between them.

    Returns:
        SSMD markdown string ("" for an empty document).
    """
    fragment_count = len(self._fragments)
    if fragment_count == 0:
        return ""
    if fragment_count == 1:
        return self._fragments[0]

    # Separator k sits between fragment k and fragment k + 1.
    pieces = [self._fragments[0]]
    for position, joiner in enumerate(self._separators, start=1):
        pieces.append(joiner)
        pieces.append(self._fragments[position])
    return "".join(pieces)
|
|
415
|
+
|
|
416
|
+
@property
def config(self) -> dict[str, Any]:
    """Configuration dictionary used by this document.

    Returns:
        The stored configuration dict (the same object, not a copy).
    """
    return self._config

@config.setter
def config(self, value: dict[str, Any]) -> None:
    """Replace the configuration dictionary.

    Also drops the resolved capabilities object and the export caches.

    Args:
        value: New configuration dict.
    """
    self._capabilities_obj = None  # force re-resolution on next export
    self._config = value
    self._invalidate_cache()
|
|
435
|
+
|
|
436
|
+
@property
def capabilities(self) -> "TTSCapabilities | str | None":
    """TTS capabilities as originally supplied.

    Returns:
        TTSCapabilities instance, preset name, or None.
    """
    return self._capabilities

@capabilities.setter
def capabilities(self, value: "TTSCapabilities | str | None") -> None:
    """Replace the TTS capabilities.

    Also drops the resolved capabilities object and the export caches.

    Args:
        value: TTSCapabilities instance, preset name, or None.
    """
    self._capabilities_obj = None  # force re-resolution on next export
    self._capabilities = value
    self._invalidate_cache()
|
|
455
|
+
|
|
456
|
+
# ═══════════════════════════════════════════════════════════
|
|
457
|
+
# ITERATION
|
|
458
|
+
# ═══════════════════════════════════════════════════════════
|
|
459
|
+
|
|
460
|
+
def sentences(self, as_documents: bool = False) -> "Iterator[str | Document]":
    """Iterate over the document one sentence at a time.

    Sentences are extracted from the SSML export and cached, which
    suits streaming TTS use. This is a generator, so no work happens
    until the first item is requested.

    Args:
        as_documents: If True, yield a Document per sentence (built with
            from_ssml and this document's config and capabilities)
            instead of the SSML string.

    Yields:
        SSML sentence strings, or Document objects if as_documents=True.

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> for sentence in doc.sentences():
        ...     tts_engine.speak(sentence)

        >>> for sentence_doc in doc.sentences(as_documents=True):
        ...     ssml = sentence_doc.to_ssml()
        ...     ssmd = sentence_doc.to_ssmd()
    """
    if self._cached_sentences is None:
        self._cached_sentences = extract_sentences(self.to_ssml())

    if not as_documents:
        yield from self._cached_sentences
        return

    for sentence_ssml in self._cached_sentences:
        yield Document.from_ssml(
            sentence_ssml,
            config=self._config,
            capabilities=self._capabilities,
        )
|
|
496
|
+
|
|
497
|
+
# ═══════════════════════════════════════════════════════════
|
|
498
|
+
# LIST-LIKE INTERFACE (operates on SSML sentences)
|
|
499
|
+
# ═══════════════════════════════════════════════════════════
|
|
500
|
+
|
|
501
|
+
def __len__(self) -> int:
|
|
502
|
+
"""Return number of sentences in the document.
|
|
503
|
+
|
|
504
|
+
Returns:
|
|
505
|
+
Number of sentences
|
|
506
|
+
|
|
507
|
+
Example:
|
|
508
|
+
>>> doc = ssmd.Document("First. Second. Third.")
|
|
509
|
+
>>> len(doc)
|
|
510
|
+
3
|
|
511
|
+
"""
|
|
512
|
+
if self._cached_sentences is None:
|
|
513
|
+
ssml = self.to_ssml()
|
|
514
|
+
self._cached_sentences = extract_sentences(ssml)
|
|
515
|
+
return len(self._cached_sentences)
|
|
516
|
+
|
|
517
|
+
@overload
|
|
518
|
+
def __getitem__(self, index: int) -> str: ...
|
|
519
|
+
|
|
520
|
+
@overload
|
|
521
|
+
def __getitem__(self, index: slice) -> list[str]: ...
|
|
522
|
+
|
|
523
|
+
def __getitem__(self, index: int | slice) -> str | list[str]:
|
|
524
|
+
"""Get sentence(s) by index.
|
|
525
|
+
|
|
526
|
+
Args:
|
|
527
|
+
index: Sentence index or slice
|
|
528
|
+
|
|
529
|
+
Returns:
|
|
530
|
+
SSML sentence string or list of strings
|
|
531
|
+
|
|
532
|
+
Raises:
|
|
533
|
+
IndexError: If index is out of range
|
|
534
|
+
|
|
535
|
+
Example:
|
|
536
|
+
>>> doc = ssmd.Document("First. Second. Third.")
|
|
537
|
+
>>> doc[0] # First sentence SSML
|
|
538
|
+
>>> doc[-1] # Last sentence SSML
|
|
539
|
+
>>> doc[0:2] # First two sentences
|
|
540
|
+
"""
|
|
541
|
+
if self._cached_sentences is None:
|
|
542
|
+
ssml = self.to_ssml()
|
|
543
|
+
self._cached_sentences = extract_sentences(ssml)
|
|
544
|
+
return self._cached_sentences[index]
|
|
545
|
+
|
|
546
|
+
def __setitem__(self, index: int, value: str) -> None:
|
|
547
|
+
"""Replace sentence at index.
|
|
548
|
+
|
|
549
|
+
This reconstructs the document with the modified sentence.
|
|
550
|
+
|
|
551
|
+
Args:
|
|
552
|
+
index: Sentence index
|
|
553
|
+
value: New SSMD content for this sentence
|
|
554
|
+
|
|
555
|
+
Raises:
|
|
556
|
+
IndexError: If index is out of range
|
|
557
|
+
|
|
558
|
+
Example:
|
|
559
|
+
>>> doc = ssmd.Document("First. Second. Third.")
|
|
560
|
+
>>> doc[0] = "Modified first sentence."
|
|
561
|
+
"""
|
|
562
|
+
if self._cached_sentences is None:
|
|
563
|
+
ssml = self.to_ssml()
|
|
564
|
+
self._cached_sentences = extract_sentences(ssml)
|
|
565
|
+
|
|
566
|
+
self._rebuild_from_sentence_ssml(
|
|
567
|
+
self._cached_sentences,
|
|
568
|
+
replacement_index=index,
|
|
569
|
+
replacement_ssmd=value,
|
|
570
|
+
)
|
|
571
|
+
|
|
572
|
+
def __delitem__(self, index: int) -> None:
|
|
573
|
+
"""Delete sentence at index.
|
|
574
|
+
|
|
575
|
+
Args:
|
|
576
|
+
index: Sentence index
|
|
577
|
+
|
|
578
|
+
Raises:
|
|
579
|
+
IndexError: If index is out of range
|
|
580
|
+
|
|
581
|
+
Example:
|
|
582
|
+
>>> doc = ssmd.Document("First. Second. Third.")
|
|
583
|
+
>>> del doc[1] # Remove second sentence
|
|
584
|
+
"""
|
|
585
|
+
if self._cached_sentences is None:
|
|
586
|
+
ssml = self.to_ssml()
|
|
587
|
+
self._cached_sentences = extract_sentences(ssml)
|
|
588
|
+
|
|
589
|
+
remaining_sentences = [
|
|
590
|
+
sentence_ssml
|
|
591
|
+
for i, sentence_ssml in enumerate(self._cached_sentences)
|
|
592
|
+
if i != index
|
|
593
|
+
]
|
|
594
|
+
self._rebuild_from_sentence_ssml(remaining_sentences)
|
|
595
|
+
|
|
596
|
+
def __iter__(self) -> "Iterator[str | Document]":
|
|
597
|
+
"""Iterate through sentences.
|
|
598
|
+
|
|
599
|
+
Yields:
|
|
600
|
+
SSML sentence strings
|
|
601
|
+
|
|
602
|
+
Example:
|
|
603
|
+
>>> doc = ssmd.Document("First. Second.")
|
|
604
|
+
>>> for sentence in doc:
|
|
605
|
+
... print(sentence)
|
|
606
|
+
"""
|
|
607
|
+
return self.sentences(as_documents=False)
|
|
608
|
+
|
|
609
|
+
def __iadd__(self, other: "str | Document") -> "Document":
|
|
610
|
+
"""Support += operator for appending content.
|
|
611
|
+
|
|
612
|
+
Args:
|
|
613
|
+
other: String or Document to append
|
|
614
|
+
|
|
615
|
+
Returns:
|
|
616
|
+
Self for chaining
|
|
617
|
+
|
|
618
|
+
Example:
|
|
619
|
+
>>> doc = ssmd.Document("Hello")
|
|
620
|
+
>>> doc += " world"
|
|
621
|
+
>>> other = ssmd.Document("More")
|
|
622
|
+
>>> doc += other
|
|
623
|
+
"""
|
|
624
|
+
if isinstance(other, Document):
|
|
625
|
+
# Append another document's content
|
|
626
|
+
return self.add(other.ssmd)
|
|
627
|
+
else:
|
|
628
|
+
# Append string
|
|
629
|
+
return self.add(other)
|
|
630
|
+
|
|
631
|
+
# ═══════════════════════════════════════════════════════════
|
|
632
|
+
# EDITING METHODS
|
|
633
|
+
# ═══════════════════════════════════════════════════════════
|
|
634
|
+
|
|
635
|
+
def insert(self, index: int, text: str, separator: str = "") -> "Document":
    """Insert text at a fragment index.

    When the new fragment is placed before an existing one, the
    separator goes between the new fragment and the fragment that
    follows it. When it is appended (index at or past the end), the
    separator goes between the previous last fragment and the new one.

    Negative indexes are normalised the way ``list.insert`` treats them
    (counted from the end, clamped to 0), so they now place the
    separator on the same side as the equivalent non-negative index.
    Previously a negative index put the separator *before* the new
    fragment instead of after it.

    Args:
        index: Fragment position to insert at (0 = beginning).
        text: SSMD text to insert. Empty text is ignored.
        separator: Separator to use ("", "\\n", or "\\n\\n"). Unused when
            the document is empty.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc = ssmd.Document("Hello world")
        >>> doc.insert(0, "Start: ", "")
        >>> doc.ssmd
        'Start: Hello world'
    """
    if not text:
        return self

    self._invalidate_cache()

    fragment_count = len(self._fragments)
    if fragment_count == 0:
        # First fragment: there is nothing to separate it from.
        self._fragments.append(text)
        return self

    if index < 0:
        index = max(index + fragment_count, 0)

    if index >= fragment_count:
        # Append at end: separator precedes the new fragment.
        self._separators.append(separator)
        self._fragments.append(text)
    else:
        # Insert before fragment `index`: separator follows the new one.
        self._fragments.insert(index, text)
        self._separators.insert(index, separator)

    return self
|
|
674
|
+
|
|
675
|
+
def remove(self, index: int) -> "Document":
    """Delete the item at ``index`` and return self.

    Chainable form of ``del doc[index]``. The deletion is delegated to
    ``__delitem__``, which works on the document's sentence list.

    Args:
        index: Index passed to ``del self[index]``.

    Returns:
        Self, for method chaining.

    Raises:
        IndexError: If index is out of range (as documented by
            ``__delitem__``).

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> doc.remove(1)
    """
    del self[index]
    return self
|
|
695
|
+
|
|
696
|
+
def clear(self) -> "Document":
    """Empty the document.

    Both the fragment and separator lists are cleared in place and the
    export caches are dropped.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc = ssmd.Document("Hello world")
        >>> doc.clear()
        >>> doc.ssmd
        ''
    """
    for store in (self._fragments, self._separators):
        store.clear()
    self._invalidate_cache()
    return self
|
|
712
|
+
|
|
713
|
+
def replace(self, old: str, new: str, count: int = -1) -> "Document":
    """Replace text across all fragments.

    Fragments are processed in order and ``count`` is a budget shared
    by the whole document. Separators are never modified, and matches
    that would span a fragment boundary are not found.

    The number of replacements made in a fragment is derived from the
    occurrences of ``old`` before replacing. The earlier approach
    compared occurrences of ``new`` before and after, which miscounts
    when ``new`` is empty or overlaps existing text and could exceed
    the ``count`` limit.

    Args:
        old: Text to find.
        new: Text to replace with.
        count: Maximum replacements. Any negative value means "all",
            matching ``str.replace``; 0 replaces nothing.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc = ssmd.Document("Hello world. Hello again.")
        >>> doc.replace("Hello", "Hi")
        >>> doc.ssmd
        'Hi world. Hi again.'
    """
    self._invalidate_cache()

    if count < 0:
        # Unlimited: replace everywhere, keeping the same list object.
        self._fragments[:] = [
            fragment.replace(old, new) for fragment in self._fragments
        ]
        return self

    remaining = count
    for i, fragment in enumerate(self._fragments):
        if remaining <= 0:
            break
        # str.count and str.replace both use non-overlapping matches,
        # so this is exactly how many replacements will happen here.
        hits = min(fragment.count(old), remaining)
        if hits:
            self._fragments[i] = fragment.replace(old, new, hits)
            remaining -= hits

    return self
|
|
744
|
+
|
|
745
|
+
# ═══════════════════════════════════════════════════════════
|
|
746
|
+
# ADVANCED METHODS
|
|
747
|
+
# ═══════════════════════════════════════════════════════════
|
|
748
|
+
|
|
749
|
+
def merge(self, other: "Document", separator: str = "\n\n") -> "Document":
    """Merge another document's fragments into this one.

    The other document's fragments and separators are copied, so later
    edits to either document do not affect the other. ``separator`` is
    placed between this document's last fragment and the first merged
    fragment; it is unused when this document is empty.

    The other document's lists are snapshotted before this document is
    modified, so ``doc.merge(doc)`` duplicates the content correctly.
    Previously self-merging left one separator too many, which made
    ``doc.ssmd`` raise IndexError.

    Args:
        other: Document to merge. Merging an empty document is a no-op.
        separator: Separator to use between the two documents.

    Returns:
        Self, for method chaining.

    Example:
        >>> doc1 = ssmd.Document("First document.")
        >>> doc2 = ssmd.Document("Second document.")
        >>> doc1.merge(doc2)
        >>> doc1.ssmd
        'First document.\\n\\nSecond document.'
    """
    # Snapshot first: `other` may be `self`.
    incoming_fragments = list(other._fragments)
    if not incoming_fragments:
        return self
    incoming_separators = list(other._separators)

    self._invalidate_cache()

    if self._fragments:
        self._separators.append(separator)
        self._fragments.extend(incoming_fragments)
        self._separators.extend(incoming_separators)
    else:
        self._fragments = incoming_fragments
        self._separators = incoming_separators

    return self
|
|
780
|
+
|
|
781
|
+
def split(self) -> list["Document"]:
    """Split the document into one Document per sentence.

    Each sentence's SSML is converted with from_ssml(), reusing this
    document's config and capabilities.

    Returns:
        List of Document objects, one per sentence.

    Example:
        >>> doc = ssmd.Document("First. Second. Third.")
        >>> sentences = doc.split()
        >>> len(sentences)
        3
        >>> sentences[0].ssmd
        'First.'
    """
    pieces = []
    for sentence_ssml in self.sentences(as_documents=False):
        pieces.append(
            Document.from_ssml(
                str(sentence_ssml),  # sentences() is typed str | Document
                config=self._config,
                capabilities=self._capabilities,
            )
        )
    return pieces
|
|
803
|
+
|
|
804
|
+
def get_fragment(self, index: int) -> str:
    """Return a raw stored fragment by index.

    Fragments are the pieces of text as they were added; they do not
    necessarily line up with sentence boundaries.

    Args:
        index: Fragment index (list indexing rules apply).

    Returns:
        Raw SSMD fragment string.

    Raises:
        IndexError: If index is out of range.

    Example:
        >>> doc = ssmd.Document()
        >>> doc.add("First")
        >>> doc.add_sentence("Second")
        >>> doc.get_fragment(0)
        'First'
        >>> doc.get_fragment(1)
        'Second'
    """
    stored = self._fragments
    return stored[index]
|
|
829
|
+
|
|
830
|
+
# ═══════════════════════════════════════════════════════════
|
|
831
|
+
# INTERNAL HELPERS
|
|
832
|
+
# ═══════════════════════════════════════════════════════════
|
|
833
|
+
|
|
834
|
+
def _rebuild_from_sentence_ssml(
    self,
    sentences: list[str],
    *,
    replacement_index: int | None = None,
    replacement_ssmd: str | None = None,
) -> None:
    """Replace the stored fragments with one fragment per SSML sentence.

    Each sentence is converted back to SSMD with SSMLParser, except the
    one at ``replacement_index``, which takes ``replacement_ssmd``
    verbatim when both are given. All fragments are joined by a single
    newline, so the previous separators are not preserved.

    Args:
        sentences: List of SSML sentence strings.
        replacement_index: Optional index to replace with SSMD content.
        replacement_ssmd: SSMD content to use at replacement_index.
    """
    from ssmd.ssml_parser import SSMLParser

    converter = SSMLParser(self._config)

    rebuilt: list[str] = []
    for position, sentence_ssml in enumerate(sentences):
        substitute = (
            replacement_index is not None
            and position == replacement_index
            and replacement_ssmd is not None
        )
        rebuilt.append(
            replacement_ssmd if substitute else converter.to_ssmd(sentence_ssml)
        )

    self._fragments = rebuilt
    # One newline between each pair of consecutive fragments.
    self._separators = ["\n"] * max(len(sentences) - 1, 0)
    self._invalidate_cache()
|
|
869
|
+
|
|
870
|
+
def _get_capabilities(self) -> "TTSCapabilities | None":
|
|
871
|
+
"""Get resolved TTSCapabilities object.
|
|
872
|
+
|
|
873
|
+
Returns:
|
|
874
|
+
TTSCapabilities instance or None
|
|
875
|
+
"""
|
|
876
|
+
if self._capabilities_obj is None and self._capabilities is not None:
|
|
877
|
+
from ssmd.capabilities import TTSCapabilities, get_preset
|
|
878
|
+
|
|
879
|
+
if isinstance(self._capabilities, str):
|
|
880
|
+
self._capabilities_obj = get_preset(self._capabilities)
|
|
881
|
+
elif isinstance(self._capabilities, TTSCapabilities):
|
|
882
|
+
self._capabilities_obj = self._capabilities
|
|
883
|
+
return self._capabilities_obj
|
|
884
|
+
|
|
885
|
+
def _invalidate_cache(self) -> None:
|
|
886
|
+
"""Invalidate cached SSML and sentences."""
|
|
887
|
+
self._cached_ssml = None
|
|
888
|
+
self._cached_sentences = None
|
|
889
|
+
|
|
890
|
+
def __repr__(self) -> str:
|
|
891
|
+
"""String representation of document.
|
|
892
|
+
|
|
893
|
+
Returns:
|
|
894
|
+
Representation string
|
|
895
|
+
|
|
896
|
+
Example:
|
|
897
|
+
>>> doc = ssmd.Document("Hello. World.")
|
|
898
|
+
>>> repr(doc)
|
|
899
|
+
'Document(2 sentences, 13 chars)'
|
|
900
|
+
"""
|
|
901
|
+
try:
|
|
902
|
+
num_sentences = len(self)
|
|
903
|
+
return f"Document({num_sentences} sentences, {len(self.ssmd)} chars)"
|
|
904
|
+
except Exception:
|
|
905
|
+
return f"Document({len(self.ssmd)} chars)"
|
|
906
|
+
|
|
907
|
+
def __str__(self) -> str:
|
|
908
|
+
"""String conversion returns SSMD content.
|
|
909
|
+
|
|
910
|
+
Returns:
|
|
911
|
+
SSMD string
|
|
912
|
+
|
|
913
|
+
Example:
|
|
914
|
+
>>> doc = ssmd.Document("Hello *world*")
|
|
915
|
+
>>> str(doc)
|
|
916
|
+
'Hello *world*'
|
|
917
|
+
"""
|
|
918
|
+
return self.ssmd
|