ttsforge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,422 @@
1
+ """SSMD (Speech Synthesis Markdown) generator for ttsforge.
2
+
3
+ This module converts chapter text to SSMD format with markup for:
4
+ - Emphasis (*text* for moderate, **text** for strong)
5
+ - Language switches ([text](lang_code))
6
+ - Phoneme substitutions ([word](ph: /phoneme/))
7
+
8
+ Note: Structural breaks (paragraphs, sentences, clauses) are NOT automatically
9
+ added. The SSMD parser in pykokoro handles sentence detection automatically.
10
+ Users can manually add breaks in the SSMD file if desired:
11
+ - Paragraph breaks (...p)
12
+ - Sentence breaks (...s)
13
+ - Clause breaks (...c)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ import re
20
+ from html.parser import HTMLParser
21
+ from pathlib import Path
22
+
23
+
24
class SSMDGenerationError(Exception):
    """Exception raised when SSMD generation fails."""
28
+
29
+
30
+ def _hash_content(content: str) -> str:
31
+ """Generate a hash of content for change detection.
32
+
33
+ Args:
34
+ content: Text content to hash
35
+
36
+ Returns:
37
+ 12-character hex hash
38
+ """
39
+ return hashlib.md5(content.encode("utf-8")).hexdigest()[:12]
40
+
41
+
42
+ class _EmphasisHTMLParser(HTMLParser):
43
+ def __init__(self) -> None:
44
+ super().__init__(convert_charrefs=True)
45
+ self._stack: list[str] = []
46
+ self.segments: list[tuple[str, str]] = []
47
+ self._last_was_emphasis = False
48
+
49
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
50
+ tag_lower = tag.lower()
51
+ if tag_lower in {"em", "i"}:
52
+ self._stack.append("*")
53
+ elif tag_lower in {"strong", "b"}:
54
+ self._stack.append("**")
55
+
56
+ def handle_endtag(self, tag: str) -> None:
57
+ tag_lower = tag.lower()
58
+ if tag_lower in {"em", "i", "strong", "b"}:
59
+ for idx in range(len(self._stack) - 1, -1, -1):
60
+ if self._stack[idx] in {"*", "**"}:
61
+ self._stack.pop(idx)
62
+ break
63
+
64
+ def handle_data(self, data: str) -> None:
65
+ marker = "**" if "**" in self._stack else ("*" if "*" in self._stack else None)
66
+ if not marker:
67
+ self._last_was_emphasis = False
68
+ return
69
+ if self._last_was_emphasis and self.segments and self.segments[-1][1] == marker:
70
+ prev_text, _ = self.segments[-1]
71
+ self.segments[-1] = (prev_text + data, marker)
72
+ else:
73
+ self.segments.append((data, marker))
74
+ self._last_was_emphasis = True
75
+
76
+
77
def _detect_emphasis_from_html(html_content: str) -> list[tuple[str, str]]:
    """Extract emphasized spans from HTML as ordered (text, marker) pairs.

    Args:
        html_content: HTML content with formatting tags

    Returns:
        List of (text, marker) segments in document order
    """
    collector = _EmphasisHTMLParser()
    collector.feed(html_content)
    return collector.segments
89
+
90
+
91
+ def _apply_emphasis_markers(text: str, emphasis_segments: list[tuple[str, str]]) -> str:
92
+ """Apply emphasis markers to text based on ordered emphasis segments.
93
+
94
+ Args:
95
+ text: Plain text
96
+ emphasis_segments: List of (text, marker) in document order
97
+
98
+ Returns:
99
+ Text with emphasis markers applied
100
+ """
101
+ if not emphasis_segments:
102
+ return text
103
+
104
+ matches: list[tuple[int, int, str]] = []
105
+ cursor = 0
106
+ base_text = text
107
+
108
+ for emphasized_text, marker in emphasis_segments:
109
+ if not emphasized_text.strip():
110
+ continue
111
+ pattern = re.escape(emphasized_text)
112
+ pattern = re.sub(r"\s+", r"\\s+", pattern)
113
+ match = re.search(pattern, base_text[cursor:], flags=re.MULTILINE)
114
+ if not match:
115
+ continue
116
+ start = cursor + match.start()
117
+ end = cursor + match.end()
118
+ matches.append((start, end, marker))
119
+ cursor = end
120
+
121
+ for start, end, marker in reversed(matches):
122
+ base_text = (
123
+ base_text[:start] + marker + base_text[start:end] + marker + base_text[end:]
124
+ )
125
+
126
+ return base_text
127
+
128
+
129
+ def _inject_phoneme_substitutions(
130
+ text: str, phoneme_dict: dict[str, str], case_sensitive: bool = False
131
+ ) -> str:
132
+ """Inject phoneme substitutions into text using SSMD [word](ph: /phoneme/) syntax.
133
+
134
+ Args:
135
+ text: Text to process
136
+ phoneme_dict: Dictionary mapping words to IPA phonemes
137
+ case_sensitive: Whether to match case-sensitively
138
+
139
+ Returns:
140
+ Text with phoneme substitutions injected
141
+ """
142
+ if not phoneme_dict:
143
+ return text
144
+
145
+ link_pattern = re.compile(r"\[[^\]]+\]\([^\)]+\)")
146
+
147
+ words = [word for word in phoneme_dict.keys() if word]
148
+ if not words:
149
+ return text
150
+
151
+ words = sorted(words, key=len, reverse=True)
152
+ alternation = "|".join(re.escape(word) for word in words)
153
+ boundary_pattern = rf"(?<!\w)({alternation})(?!\w)"
154
+ flags = 0 if case_sensitive else re.IGNORECASE
155
+ compiled = re.compile(boundary_pattern, flags=flags)
156
+
157
+ if case_sensitive:
158
+ lookup = phoneme_dict
159
+ else:
160
+ lookup = {}
161
+ for word, phoneme in phoneme_dict.items():
162
+ key = word.lower()
163
+ if key not in lookup:
164
+ lookup[key] = phoneme
165
+
166
+ def replace(match: re.Match[str]) -> str:
167
+ matched_word = match.group(1)
168
+ key = matched_word if case_sensitive else matched_word.lower()
169
+ phoneme = lookup.get(key)
170
+ if not phoneme:
171
+ return matched_word
172
+ clean_phoneme = phoneme.strip("/")
173
+ return f"[{matched_word}](ph: /{clean_phoneme}/)"
174
+
175
+ segments: list[str] = []
176
+ last_index = 0
177
+ for match in link_pattern.finditer(text):
178
+ if match.start() > last_index:
179
+ segment = text[last_index : match.start()]
180
+ segments.append(compiled.sub(replace, segment))
181
+ segments.append(match.group(0))
182
+ last_index = match.end()
183
+
184
+ if last_index < len(text):
185
+ segments.append(compiled.sub(replace, text[last_index:]))
186
+
187
+ return "".join(segments)
188
+
189
+
190
+ def _add_language_markers(text: str, mixed_language_config: dict | None = None) -> str:
191
+ """Add language markers for mixed-language segments.
192
+
193
+ Note: This is a placeholder for now. Full implementation would require
194
+ language detection library (lingua-language-detector).
195
+
196
+ Args:
197
+ text: Text to process
198
+ mixed_language_config: Configuration for mixed-language mode
199
+
200
+ Returns:
201
+ Text with language markers (currently returns text unchanged)
202
+ """
203
+ # TODO: Implement language detection and wrapping
204
+ # For now, return text unchanged
205
+ # Future: Use lingua-language-detector to identify foreign segments
206
+ # and wrap them with [segment](lang_code)
207
+ return text
208
+
209
+
210
+ def _add_structural_breaks(text: str) -> str:
211
+ """Preserve paragraph structure without adding automatic SSMD breaks.
212
+
213
+ The SSMD parser in pykokoro will handle sentence detection automatically.
214
+ This function only preserves existing paragraph breaks as double newlines.
215
+
216
+ Args:
217
+ text: Plain text to process
218
+
219
+ Returns:
220
+ Text with normalized paragraph spacing (no SSMD break markers)
221
+ """
222
+ # Split into paragraphs and normalize spacing
223
+ paragraphs = re.split(r"\n\s*\n+", text)
224
+ result_paragraphs = []
225
+
226
+ for para in paragraphs:
227
+ para = para.strip()
228
+ if para:
229
+ result_paragraphs.append(para)
230
+
231
+ # Join paragraphs with double newlines (standard paragraph separation)
232
+ # No SSMD markers - let pykokoro's parser handle sentence detection
233
+ result = "\n\n".join(result_paragraphs)
234
+
235
+ return result
236
+
237
+
238
+ def _strip_redundant_title(chapter_title: str, chapter_text: str) -> str:
239
+ """Remove a duplicated chapter title from the start of the text."""
240
+ title = chapter_title.strip()
241
+ if not title:
242
+ return chapter_text
243
+
244
+ lines = chapter_text.splitlines()
245
+ first_idx = None
246
+ for idx, line in enumerate(lines):
247
+ if line.strip():
248
+ first_idx = idx
249
+ break
250
+
251
+ if first_idx is None:
252
+ return chapter_text
253
+
254
+ first_line = lines[first_idx]
255
+ title_pattern = re.compile(
256
+ rf"^\s*{re.escape(title)}(?:\b|[\s:;\-\u2013\u2014])",
257
+ re.IGNORECASE,
258
+ )
259
+ if not title_pattern.search(first_line):
260
+ return chapter_text
261
+
262
+ trimmed_line = title_pattern.sub("", first_line, count=1).lstrip(
263
+ " \t:;\-\u2013\u2014"
264
+ )
265
+ if trimmed_line:
266
+ lines[first_idx] = trimmed_line
267
+ return "\n".join(lines[first_idx:]).lstrip()
268
+
269
+ remaining = lines[first_idx + 1 :]
270
+ while remaining and not remaining[0].strip():
271
+ remaining = remaining[1:]
272
+ return "\n".join(remaining).lstrip()
273
+
274
+
275
def chapter_to_ssmd(
    chapter_title: str,
    chapter_text: str,
    phoneme_dict: dict[str, str] | None = None,
    phoneme_dict_case_sensitive: bool = False,
    mixed_language_config: dict | None = None,
    html_content: str | None = None,
    include_title: bool = True,
) -> str:
    """Convert a chapter to SSMD format.

    Pipeline: strip a duplicated leading title, apply emphasis detected in
    the HTML source, inject phoneme substitutions, optionally add language
    markers, normalize paragraph spacing, then prepend the title heading.

    Args:
        chapter_title: Title of the chapter
        chapter_text: Plain text content of the chapter
        phoneme_dict: Optional dictionary mapping words to IPA phonemes
        phoneme_dict_case_sensitive: Whether phoneme matching is case-sensitive
        mixed_language_config: Optional config for mixed-language mode
        html_content: Optional HTML content for emphasis detection
        include_title: Whether to include chapter title in SSMD

    Returns:
        SSMD formatted text

    Raises:
        SSMDGenerationError: If generation fails
    """
    try:
        ssmd = chapter_text

        # Avoid reading the title twice when the body repeats it.
        if include_title and chapter_title:
            ssmd = _strip_redundant_title(chapter_title, ssmd)

        # Emphasis markup comes from the HTML source, when available.
        segments = _detect_emphasis_from_html(html_content) if html_content else []
        if segments:
            ssmd = _apply_emphasis_markers(ssmd, segments)

        if phoneme_dict:
            ssmd = _inject_phoneme_substitutions(
                ssmd, phoneme_dict, phoneme_dict_case_sensitive
            )

        if mixed_language_config and mixed_language_config.get("use_mixed_language"):
            ssmd = _add_language_markers(ssmd, mixed_language_config)

        # Paragraph normalization only; sentence breaks are left to pykokoro.
        ssmd = _add_structural_breaks(ssmd)

        # Prepend the title as a heading, separated by a blank line.
        if include_title and chapter_title:
            ssmd = f"# {chapter_title.strip()}\n\n{ssmd}"

        return ssmd

    except Exception as e:
        raise SSMDGenerationError(
            f"Failed to generate SSMD for chapter '{chapter_title}': {str(e)}"
        ) from e
340
+
341
+
342
def save_ssmd_file(ssmd_content: str, output_path: Path) -> str:
    """Save SSMD content to a file and return its hash.

    Parent directories are created as needed; the file is written as UTF-8.

    Args:
        ssmd_content: SSMD formatted text
        output_path: Path to save the SSMD file

    Returns:
        Hash of the saved content

    Raises:
        SSMDGenerationError: If file save fails
    """
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(ssmd_content, encoding="utf-8")
        return _hash_content(ssmd_content)
    except Exception as e:
        raise SSMDGenerationError(
            f"Failed to save SSMD file to {output_path}: {str(e)}"
        ) from e
364
+
365
+
366
def load_ssmd_file(ssmd_path: Path) -> tuple[str, str]:
    """Load SSMD file and return content with hash.

    Args:
        ssmd_path: Path to the SSMD file

    Returns:
        Tuple of (content, hash)

    Raises:
        SSMDGenerationError: If file load fails or doesn't exist
    """
    try:
        if not ssmd_path.exists():
            raise SSMDGenerationError(f"SSMD file not found: {ssmd_path}")
        content = ssmd_path.read_text(encoding="utf-8")
        return content, _hash_content(content)
    except SSMDGenerationError:
        # Already our own error type; re-raise untouched.
        raise
    except Exception as e:
        raise SSMDGenerationError(
            f"Failed to load SSMD file from {ssmd_path}: {str(e)}"
        ) from e
392
+
393
+
394
def validate_ssmd(ssmd_content: str) -> list[str]:
    """Validate SSMD content and return warnings.

    Lightweight sanity checks only: balanced brackets/parentheses and
    even counts of emphasis markers. No attempt is made to fully parse SSMD.

    Args:
        ssmd_content: SSMD formatted text

    Returns:
        List of warning strings. Empty list means no issues found.
    """
    checks = [
        (
            ssmd_content.count("[") != ssmd_content.count("]"),
            "Unbalanced '[' and ']' brackets",
        ),
        (
            ssmd_content.count("(") != ssmd_content.count(")"),
            "Unbalanced '(' and ')' parentheses",
        ),
        (
            ssmd_content.count("**") % 2 != 0,
            "Unbalanced strong emphasis markers '**'",
        ),
        (
            # Note: "**" pairs contribute an even number of "*", so this
            # parity check effectively reflects single-emphasis balance.
            ssmd_content.count("*") % 2 != 0,
            "Unbalanced emphasis markers '*'",
        ),
    ]
    return [message for failed, message in checks if failed]