revoxx 1.0.0.dev22__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
revoxx/__init__.py CHANGED
@@ -1,6 +1,14 @@
1
1
  """Revoxx Recorder - A tool for recording emotional speech."""
2
2
 
3
- __version__ = "1.0.0"
3
+ try:
4
+ # Try to use versioningit for dynamic version detection
5
+ from versioningit import get_version
6
+
7
+ __version__ = get_version(root="../", config={})
8
+ except (ImportError, Exception):
9
+ # Fallback if versioningit is not installed or fails
10
+ __version__ = "1.0.0+dev"
11
+
4
12
  __author__ = "Grammatek"
5
13
 
6
14
  # Only import main entry point to avoid circular imports
revoxx/app.py CHANGED
@@ -688,6 +688,12 @@ class Revoxx:
688
688
  # Tkinter might have changed it during setup
689
689
  self.cleanup_manager.refresh_sigint_handler()
690
690
 
691
+ # Show user guide dialog if configured
692
+ if self.settings_manager.get_setting("show_user_guide_at_startup", True):
693
+ from .ui.dialogs.user_guide_dialog import UserGuideDialog
694
+
695
+ UserGuideDialog(self.window.window, self.settings_manager)
696
+
691
697
  self.window.focus_window()
692
698
  self.window.window.mainloop()
693
699
 
@@ -220,6 +220,36 @@ class DisplayController:
220
220
  """Reset the level meter display."""
221
221
  self.reset_level_meters()
222
222
 
223
+ def format_take_status(self, label: str) -> str:
224
+ """Format the take status display string for a given label.
225
+
226
+ This returns current take information in the status bar.
227
+
228
+ Args:
229
+ label: The utterance label (e.g., "utterance_001")
230
+
231
+ Returns:
232
+ - Empty string if label is None or empty
233
+ - Just the label if no active_recordings exist
234
+ - Just the label if no takes exist for this utterance
235
+ - "label - Take X/Y" if takes exist, where X is the position of the
236
+ current take in the list and Y is the total number of takes
237
+ """
238
+ if not label:
239
+ return ""
240
+
241
+ if not self.app.active_recordings:
242
+ return label
243
+
244
+ current_take = self.app.state.recording.get_current_take(label)
245
+ existing_takes = self.app.active_recordings.get_existing_takes(label)
246
+
247
+ if existing_takes and current_take in existing_takes:
248
+ position = existing_takes.index(current_take) + 1
249
+ return f"{label} - Take {position}/{len(existing_takes)}"
250
+
251
+ return label
252
+
223
253
  def set_status(self, status: str, msg_type: MsgType = MsgType.TEMPORARY) -> None:
224
254
  """Set the status bar text.
225
255
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  from typing import TYPE_CHECKING
4
4
 
5
- from ..constants import FileConstants
5
+ from ..constants import FileConstants, MsgType
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  from ..app import Revoxx
@@ -134,10 +134,6 @@ class NavigationController:
134
134
  # Update info overlay if visible
135
135
  if self.app.window.info_panel_visible:
136
136
  self.app.display_controller.update_info_panel()
137
- else:
138
- # No more takes in that direction
139
- direction_text = "forward" if direction > 0 else "backward"
140
- self.app.display_controller.set_status(f"No more takes {direction_text}")
141
137
 
142
138
  def find_utterance(self, index: int) -> None:
143
139
  """Navigate directly to a specific utterance by index.
@@ -252,15 +248,8 @@ class NavigationController:
252
248
  if not current_label:
253
249
  return
254
250
 
255
- current_take = self.app.state.recording.get_current_take(current_label)
256
- if not self.app.active_recordings:
257
- existing_takes = []
258
- else:
259
- existing_takes = self.app.active_recordings.get_existing_takes(
260
- current_label
261
- )
262
-
263
251
  # Update label with filename if we have a recording
252
+ current_take = self.app.state.recording.get_current_take(current_label)
264
253
  if current_take > 0:
265
254
  filename = f"take_{current_take:03d}{FileConstants.AUDIO_FILE_EXTENSION}"
266
255
  self.app.window.update_label_with_filename(current_label, filename)
@@ -277,18 +266,8 @@ class NavigationController:
277
266
  if second:
278
267
  second.update_label_with_filename(current_label)
279
268
 
280
- if existing_takes and current_take in existing_takes:
281
- # Find position in the list
282
- position = existing_takes.index(current_take) + 1
283
- total = len(existing_takes)
284
- self.app.display_controller.set_status(
285
- f"{current_label} - Take {position}/{total}"
286
- )
287
- elif not existing_takes:
288
- # Show label even without recordings
289
- self.app.display_controller.set_status(f"{current_label}")
290
- else:
291
- self.app.display_controller.set_status(f"{current_label}")
269
+ status_text = self.app.display_controller.format_take_status(current_label)
270
+ self.app.display_controller.set_status(status_text, MsgType.DEFAULT)
292
271
 
293
272
  def after_recording_saved(self, label: str) -> None:
294
273
  """Called after a recording has been saved to disk.
@@ -77,6 +77,9 @@ class ProcessManager:
77
77
  self.set_audio_queue_active(False)
78
78
  self.set_save_path(None)
79
79
 
80
+ # Check for VAD availability
81
+ self._check_vad_availability()
82
+
80
83
  def start_processes(self) -> None:
81
84
  """Start background recording and playback processes."""
82
85
  if self.app.debug:
@@ -322,3 +325,34 @@ class ProcessManager:
322
325
  and self.playback_process is not None
323
326
  and self.playback_process.is_alive()
324
327
  )
328
+
329
+ def _check_vad_availability(self) -> None:
330
+ """Check if VAD support is available and store in manager_dict."""
331
+ try:
332
+ # Try to import the VAD module from scripts_module
333
+ from scripts_module import vadiate # noqa: F401
334
+ from silero_vad import load_silero_vad # noqa: F401
335
+
336
+ vad_available = True
337
+ if self.app.debug:
338
+ print("[ProcessManager] VAD support is available")
339
+ except ImportError:
340
+ vad_available = False
341
+ if self.app.debug:
342
+ print("[ProcessManager] VAD support is not available")
343
+
344
+ if self.manager_dict is not None:
345
+ self.manager_dict["vad_available"] = vad_available
346
+
347
+ def is_vad_available(self) -> bool:
348
+ """Check if VAD support is available.
349
+
350
+ Returns:
351
+ True if VAD is available
352
+ """
353
+ if self.manager_dict:
354
+ try:
355
+ return self.manager_dict.get("vad_available", False)
356
+ except (AttributeError, KeyError):
357
+ return False
358
+ return False
@@ -147,10 +147,7 @@ class SessionController:
147
147
  self.reload_script_and_recordings()
148
148
 
149
149
  # Then apply saved sort settings from session (after data is loaded)
150
- if session:
151
- self.app.active_recordings.set_sort(
152
- session.sort_column, session.sort_reverse
153
- )
150
+ self.app.active_recordings.set_sort(session.sort_column, session.sort_reverse)
154
151
 
155
152
  self.app.window.window.title(f"Revoxx - {session.name}")
156
153
  self.app.menu.update_recent_sessions()
@@ -1,6 +1,7 @@
1
1
  """Dataset exporter for converting Revoxx sessions to Talrómur 3 format."""
2
2
 
3
3
  import shutil
4
+ import json
4
5
  from pathlib import Path
5
6
  from typing import List, Dict, Tuple, Optional, Any
6
7
  from collections import Counter
@@ -30,6 +31,7 @@ class DatasetExporter:
30
31
  audio_format: str = "flac",
31
32
  zero_intensity_emotions: List[str] = None,
32
33
  include_intensity: bool = True,
34
+ include_vad: bool = False,
33
35
  ):
34
36
  """Initialize dataset exporter.
35
37
 
@@ -38,11 +40,13 @@ class DatasetExporter:
38
40
  audio_format: Output audio format ('wav' or 'flac')
39
41
  zero_intensity_emotions: List of emotions to set intensity to 0
40
42
  include_intensity: Whether to include intensity column in index.tsv
43
+ include_vad: Whether to run VAD analysis on the exported dataset
41
44
  """
42
45
  self.output_dir = Path(output_dir)
43
46
  self.format = audio_format.lower()
44
47
  self.zero_intensity_emotions = zero_intensity_emotions or ["neutral"]
45
48
  self.include_intensity = include_intensity
49
+ self.include_vad = include_vad
46
50
 
47
51
  def _group_sessions_by_speaker(self, session_paths: List[Path]) -> Dict:
48
52
  """Group sessions by speaker name.
@@ -172,6 +176,11 @@ class DatasetExporter:
172
176
  }
173
177
  )
174
178
 
179
+ # Run VAD processing if requested
180
+ if self.include_vad:
181
+ vad_stats = self._run_vad_processing(all_datasets, progress_callback)
182
+ total_statistics["vad_statistics"] = vad_stats
183
+
175
184
  return all_datasets, total_statistics
176
185
 
177
186
  def _process_emotion_group(
@@ -387,3 +396,115 @@ class DatasetExporter:
387
396
  readme_path = dataset_dir / "README.txt"
388
397
  with open(readme_path, "w", encoding="utf-8") as f:
389
398
  f.write(readme_content)
399
+
400
+ def _run_vad_processing(
401
+ self, dataset_paths: List[Path], progress_callback=None
402
+ ) -> Dict:
403
+ """Run VAD processing on exported datasets using multiprocessing.
404
+
405
+ Args:
406
+ dataset_paths: List of dataset directories to process
407
+ progress_callback: Optional progress callback (count, message)
408
+
409
+ Returns:
410
+ Dictionary with total files processed and warnings
411
+ """
412
+ try:
413
+ from scripts_module.vadiate import get_audio_files
414
+ import multiprocessing as mp
415
+ from concurrent.futures import ProcessPoolExecutor, as_completed
416
+ except ImportError:
417
+ return {} # VAD not available
418
+
419
+ # Count total files for progress
420
+ total_files = sum(len(get_audio_files(str(d))) for d in dataset_paths)
421
+ if total_files == 0:
422
+ return {}
423
+
424
+ processed = 0
425
+ vad_statistics = {"total_files": total_files, "warnings": []}
426
+
427
+ # Use process pool for parallel processing
428
+ # Each process handles VAD analysis for one complete dataset (speaker)
429
+ # This means if we export 3 speakers, we use up to 3 processes
430
+ # Each process analyzes all audio files within its assigned speaker's dataset
431
+ num_workers = min(mp.cpu_count(), len(dataset_paths))
432
+
433
+ with ProcessPoolExecutor(max_workers=num_workers) as executor:
434
+ # Submit one VAD processing task per dataset (per speaker)
435
+ # Each task processes all audio files in that speaker's dataset directory
436
+ future_to_dataset = {
437
+ executor.submit(self._process_dataset_vad, dataset_path): dataset_path
438
+ for dataset_path in dataset_paths
439
+ }
440
+
441
+ # Process completed tasks
442
+ for future in as_completed(future_to_dataset):
443
+ dataset_path = future_to_dataset[future]
444
+ try:
445
+ result = future.result()
446
+ processed += result["files_processed"]
447
+ vad_statistics["warnings"].extend(result["warnings"])
448
+ if progress_callback:
449
+ progress_callback(
450
+ processed, f"VAD analysis: {processed}/{total_files}"
451
+ )
452
+ except Exception as e:
453
+ vad_statistics["warnings"].append(
454
+ f"VAD processing error for {dataset_path}: {e}"
455
+ )
456
+
457
+ return vad_statistics
458
+
459
+ @staticmethod
460
+ def _process_dataset_vad(dataset_path: Path) -> Dict:
461
+ """Process VAD for a single dataset (one speaker's complete dataset).
462
+
463
+ This method runs in a separate process and handles all audio files
464
+ for one speaker. If multiple speakers were exported, each speaker's
465
+ dataset is processed by a different process in parallel.
466
+
467
+ Args:
468
+ dataset_path: Path to the dataset directory for one speaker
469
+
470
+ Returns:
471
+ Dictionary with files processed and warnings
472
+ """
473
+ from scripts_module.vadiate import (
474
+ get_audio_files,
475
+ process_audio,
476
+ load_silero_vad,
477
+ )
478
+
479
+ vad_output = dataset_path / "vad.json"
480
+ audio_files = get_audio_files(str(dataset_path))
481
+
482
+ result_info = {"files_processed": 0, "warnings": []}
483
+
484
+ if not audio_files:
485
+ return result_info
486
+
487
+ # Load model for this process
488
+ model = load_silero_vad()
489
+ results = {}
490
+
491
+ for file_path in audio_files:
492
+ try:
493
+ rel_path, result, warnings = process_audio(
494
+ file_path,
495
+ model,
496
+ str(dataset_path),
497
+ use_dynamic_threshold=True,
498
+ collect_warnings=True,
499
+ )
500
+ results[rel_path] = result
501
+ result_info["warnings"].extend(warnings)
502
+ result_info["files_processed"] += 1
503
+ except Exception as e:
504
+ result_info["warnings"].append(f"VAD error for {file_path}: {e}")
505
+
506
+ # Save results
507
+ with open(vad_output, "w") as f:
508
+ json.dump(results, f, indent=2)
509
+
510
+ return result_info
@@ -0,0 +1,272 @@
1
+ # Revoxx User Guide
2
+
3
+ ## Quick Start Guide
4
+
5
+ This guide will walk you through using Revoxx for speech recording.
6
+
7
+ ## 1. Prepare Your Recording Script
8
+
9
+ Before you can start recording, you need to prepare a script containing all the utterances you want to record. Revoxx uses the Festival-style script format for organizing utterances.
10
+
11
+ ### Using the Import Tool
12
+
13
+ The easiest way to create a script is through the built-in import tool:
14
+
15
+ 1. Navigate to **Tools → Import Text to Script** in the menu bar.
16
+ 2. Select your input text and output script file paths.
17
+ 3. Configure the text splitting options:
18
+ - **Split by**: Choose how to divide your text into utterances:
19
+ - **Lines**: Each line in the input file becomes one utterance
20
+ - **Sentences**: Text is split at sentence boundaries (periods, question marks, exclamation points)
21
+ - **Paragraphs**: Each paragraph (separated by blank lines) becomes one utterance
22
+ - **Maximum characters**: Set a character limit for each utterance. The tool displays statistics for your input file showing the character count distribution to help you choose an appropriate limit.
23
+ 4. Configure additional import settings:
24
+ - **Emotion levels**: If you're recording emotional speech with different intensities, you can choose between two methods:
25
+ - **Fixed levels**: Define specific emotion levels (e.g., 1-5) that will be assigned sequentially or randomly
26
+ - **Normal distribution**: Set parameters for a Gaussian distribution:
27
+ - **Mean**: The center of the distribution (average emotion level)
28
+ - **Standard deviation**: How spread out the emotion levels should be
29
+ - **Minimum/Maximum**: The bounds for the emotion levels
30
+ - The tool displays a graph showing the actual distribution of emotion levels that will be assigned to your utterances based on these parameters
31
+ - **Output location**: Select where the generated script file will be saved.
32
+ 5. Click **Import** to generate your script file in the correct format.
33
+
34
+ ### Script Format Examples
35
+
36
+ **Emotional speech:**
37
+ ```
38
+ ( spk1_happy_001 "happy-3: I love sunny days!" )
39
+ ( spk1_sad_002 "sad-4: The rain makes me melancholy." )
40
+ ```
41
+
42
+ **Neutral speech:**
43
+ ```
44
+ ( spk1_001 "The weather forecast predicts sunshine." )
45
+ ( spk1_002 "Tomorrow will be partly cloudy." )
46
+ ```
47
+
48
+ ## 2. Create a New Recording Session
49
+
50
+ Once you have prepared your script, you can create a new recording session:
51
+
52
+ 1. Select **File → New Session...** from the menu to open the session configuration dialog.
53
+ 2. Configure the various session settings:
54
+
55
+ ### Speaker Information
56
+ - **Speaker Name**: Enter the full name of the voice talent who will be recording.
57
+ - **Gender**: Select the appropriate gender option (M/F/X).
58
+ - **Emotion**: Choose either a specific emotion (happy, sad, angry, etc.) or select "neutral" for non-emotional recordings.
59
+
60
+ ### Script Selection
61
+ - Click **Browse** to select your prepared script file.
62
+ - The script file determines which utterances will appear during the recording session.
63
+
64
+ ### Session Storage
65
+ - **Base Directory**: Choose the parent folder where all your recording sessions will be stored.
66
+ - **Session Directory**: You can enter a descriptive name for this session; otherwise, a subfolder named after the speaker name and emotion will be created automatically.
67
+
68
+ ### Audio Configuration
69
+ - **Input Device**: Select your audio interface from the dropdown list.
70
+ - **Sample Rate**: Choose at least 48000 Hz for professional quality recordings (44100 Hz is also acceptable).
71
+ - **Bit Depth**: Select 24-bit for optimal dynamic range.
72
+ - **Recording Format**: FLAC format is recommended as it provides losslessly compressed audio.
73
+
74
+ 3. After configuring all settings, click **OK** to start your recording session.
75
+
76
+ ## 3. Main Recording Interface
77
+
78
+ ### Selecting Recording Standards
79
+
80
+ Before calibrating, select the appropriate recording standard for your project. Navigate to **Settings → Level Meter Preset** and choose from these industry standards:
81
+
82
+ - **EBU R128 broadcast** (default): For broadcast content with -23 LUFS integrated loudness
83
+ - **ACX/Audible audiobook**: For audiobook recording with RMS levels between -23 and -18 dB
84
+ - **Podcast standard**: For podcast production targeting -16 to -14 LUFS
85
+ - **Film dialog recording**: For film/video dialog with peaks between -27 and -20 dBFS
86
+ - **Music vocal recording**: For music production with peaks between -18 and -12 dBFS
87
+
88
+ Each preset automatically configures the level meter with appropriate target ranges, warning thresholds, and measurement windows specific to that recording standard.
89
+
90
+ ### Before Recording - Calibration
91
+
92
+ It's important to calibrate your input levels before starting the recording session:
93
+
94
+ 1. Press **M** to enter Monitor Mode. In this mode, you can see the input levels without recording.
95
+ 2. Ask the speaker to test their voice at the volume they will use during recording.
96
+ 3. Adjust the input gain on your audio interface until the levels fall within the target range for your selected preset (shown in dotted lines on the meter).
97
+ 4. Press **M** again to exit Monitor Mode and return to normal recording mode.
98
+
99
+ ### Recording Controls
100
+
101
+ | Key | Action |
102
+ |--------------|-----------------------------------------------------------------|
103
+ | **SPACE** | Start/Stop recording |
104
+ | **P** | Play the current take |
105
+ | **⌘/Ctrl+D** | Delete the current take (it moves to trash folder subdirectory) |
106
+ | **↑/↓** | Navigate between utterances in the list |
107
+ | **←/→** | Navigate between different takes of the current utterance |
108
+ | **⌘/Ctrl+U** | Change the utterance ordering method |
109
+ | **F1** | Show all keyboard shortcuts |
110
+
111
+ **Important:** When multiple takes exist for an utterance, the system always uses the most recent take for export. You can navigate through all takes using the left/right arrow keys to review them, but only the last recorded take will be included in the final dataset.
112
+
113
+ ### Recording Process
114
+
115
+ When recording each utterance, follow these steps:
116
+
117
+ 1. Speaker takes a deep breath before recording starts
118
+ 2. Press **SPACE** to start recording
119
+ 3. Wait 1-2 seconds (speaker holds breath)
120
+ 4. Speaker reads the utterance
121
+ 5. Speaker holds breath at the end
122
+ 6. Wait 1-2 seconds before pressing **SPACE** to stop
123
+ 7. Review the spectrogram or press **P** to play
124
+
125
+ **Breathing**: There should be no breath sounds at the beginning or end of a recording. The speaker should inhale before recording starts and hold their breath when finished.
126
+
127
+ **Why silence matters**: The pauses at the beginning and end of each recording serve two purposes:
128
+
129
+ - They provide clean boundaries for audio processing algorithms
130
+ - When VAD (Voice Activity Detection) is enabled during export, these silence regions help the system accurately detect speech boundaries and help trim recordings to consistent silence lengths
131
+
132
+ **VAD Processing**: During dataset export, if VAD analysis is enabled, the system will:
133
+
134
+ - Detect the exact start and end of speech in each recording
135
+ - Generate precise timing information for downstream TTS training
136
+
137
+ ### Display Options
138
+
139
+ | Key | Action |
140
+ |---------------|-------------------------------|
141
+ | **M** | Toggle meter display |
142
+ | **I** | Toggle info display at bottom |
143
+ | **F10** | Fullscreen main window |
144
+ | **Shift+F10** | Fullscreen speaker window |
145
+
146
+ ### Spectrogram Navigation
147
+
148
+ The spectrograms provide visual feedback of your recordings. You can interact with them using:
149
+
150
+ - **Mouse wheel**: Scroll up to zoom in for more detail, scroll down to zoom out.
151
+ - **Right-click + drag**: When zoomed in, hold the right mouse button and drag to pan left or right through the spectrogram.
152
+
153
+ ## 4. Multi-Screen Setup
154
+
155
+ Revoxx supports dual-screen setups, allowing you to have separate displays for the engineer and the speaker.
156
+
157
+ ### Enabling Speaker Display
158
+
159
+ 1. Navigate to **Settings → 2nd Window → Enable 2nd Window** in the menu.
160
+ 2. A second window will appear that can mirror the main recording interface or be configured for minimal display.
161
+ 3. Drag this window to your second monitor, external display, or iPad.
162
+ 4. Configure what appears on the second window through the settings:
163
+ - **Full Interface**: Shows everything from the main window including spectrograms and meters.
164
+ - **Minimal Mode**: Shows only the utterance text and status bar to maximize screen space for the text.
165
+ - You can toggle individual elements like spectrograms, meters, and info displays.
166
+
167
+ The minimal mode is recommended for speakers as it removes technical distractions and makes the text as large as possible.
168
+
169
+ ### Using iPad with Sidecar (macOS)
170
+
171
+ For a portable dual-screen setup on macOS:
172
+
173
+ 1. Connect your iPad to your Mac using the Sidecar feature.
174
+ 2. Once connected, simply drag the speaker window to the iPad display.
175
+ 3. The speaker can now comfortably read from the iPad while you control the recording from your main screen.
176
+
177
+ ## 5. Session Management
178
+
179
+ ### Utterance Ordering
180
+
181
+ The order in which utterances appear during recording can be customized to suit your workflow:
182
+
183
+ - By default, utterances appear in the order they are listed in the script file.
184
+ - Press **⌘/Ctrl+U** to open the utterance ordering dialog where you can sort by:
185
+ - **Label**: Alphabetical order by utterance ID
186
+ - **Emotion level**: Groups utterances by emotion and intensity
187
+ - **Text content**: Alphabetical order by the actual text
188
+ - **Text length**: Shortest to longest utterances
189
+ - **Number of takes**: Prioritize utterances with fewer recordings
190
+
191
+ For each sorting option, you can also choose the sort direction.
192
+
193
+ ### Finding Utterances
194
+
195
+ When you need to locate specific utterances quickly:
196
+
197
+ - Use **Edit → Find Utterance** (⌘/Ctrl+F) to open the search dialog.
198
+ - You can search by typing any part of the utterance text.
199
+ - The search results can be sorted using the same criteria as the ordering options.
200
+ - Double-click any result to jump directly to that utterance.
201
+
202
+ ### Session Progress
203
+
204
+ Revoxx automatically manages your recording progress:
205
+
206
+ - Your progress is saved automatically after each recording.
207
+ - Use **File → Recent Sessions** to see a list of your recent work and quickly resume where you left off.
208
+ - All session settings, including audio configuration and display preferences, are preserved between sessions.
209
+
210
+ ## 6. Best Practices
211
+
212
+ ### Recording Environment
213
+
214
+ Creating the right recording environment is essential for high-quality results:
215
+
216
+ 1. **Acoustic Treatment**: Use a professionally treated room or vocal booth to minimize reflections and external noise.
217
+ 2. **Microphone Position**: Maintain a consistent distance of 6-12 inches from the microphone. Use the monitoring mode to verify that levels remain stable as the speaker moves.
218
+ 3. **Pop Filter**: Always use a pop filter positioned between the speaker and microphone to prevent plosive sounds from ruining takes.
219
+ 4. **Headphones**: Provide closed-back headphones for the voice talent to prevent audio bleed into the microphone.
220
+
221
+ ### Recording Workflow
222
+
223
+ Follow these guidelines for efficient and consistent recording sessions:
224
+
225
+ 1. **Warm-up**: Always have the speaker perform vocal exercises before starting to ensure their voice is ready.
226
+ 2. **Consistency**: Help the speaker maintain the same sitting position and energy level throughout the session.
227
+ 3. **Breathing technique**: For longer utterances with natural pauses, breathing between phrases is acceptable and natural. However, ensure no breath sounds at the beginning or end of the recording.
228
+ 4. **Breaks**: Schedule regular breaks every 30-45 minutes to prevent vocal fatigue and maintain quality.
229
+ 5. **Multiple Takes**: For difficult or important utterances, record 2-3 takes to have options during dataset creation.
230
+ 6. **Review**: Periodically stop to review recent recordings and ensure quality standards are being maintained.
231
+
232
+ ## 7. Export and Dataset Creation
233
+
234
+ Once you have completed recording, you can export your sessions into datasets suitable for TTS training:
235
+
236
+ 1. Select **File → Export Dataset** from the menu to open the export dialog.
237
+ 2. Choose which recording sessions you want to include in the dataset. You can select multiple sessions from any speaker. Datasets are combined by speaker name automatically.
238
+ 3. Configure the export options according to your needs:
239
+ - The T3 (Talrómur 3) format is chosen automatically.
240
+ - **VAD analysis**: If VAD support is installed, you can generate voice activity timestamps for each audio file.
241
+ 4. Click **Export** to create an organized dataset structure that's ready for TTS model training. The export process will handle file naming, metadata generation, and directory organization automatically.
242
+
243
+ ## 8. Troubleshooting
244
+
245
+ ### Audio Issues
246
+
247
+ If you encounter audio problems during recording:
248
+
249
+ - **No input signal**: Check that your audio interface is properly connected and that the correct drivers are installed. Verify that the correct input device is selected and that you have selected the correct channel.
250
+ - **Clipping or distortion**: Reduce the input gain on your audio interface. The peaks should never exceed -3 dB during normal speech.
251
+ - **Audio dropouts**: This may indicate buffer size issues. Close unnecessary applications and ensure your computer meets the performance requirements.
252
+
253
+ ### Display Issues
254
+
255
+ For problems with the visual interface:
256
+
257
+ - **Text too small/large**: The text size adjusts automatically based on window size. Simply resize the window to change the text size.
258
+ - **Speaker window position lost**: If the second window appears off-screen, go to Settings → 2nd Window and disable/re-enable it.
259
+ - **Spectrogram not updating**: Press 'M' to toggle the meter display off and on again. This will refresh the visualization.
260
+
261
+ ### Performance
262
+
263
+ To maintain optimal performance:
264
+
265
+ - **Slow response**: Close other CPU-intensive applications while recording. Audio processing requires significant resources. We have had the best results with Apple Silicon (M-series) Macs.
266
+ - **Disk space**: Ensure you have adequate free space on your recording drive. Each hour of 48kHz/24-bit recording requires approximately 500MB.
267
+ - **Memory usage**: If the application becomes sluggish after extended use, save your session and restart Revoxx to clear the memory.
268
+
269
+ ## Need Help?
270
+
271
+ - Press **F1** for keyboard shortcuts
272
+ - Report issues at: https://github.com/icelandic-lt/revoxx/issues