revoxx 1.0.0.dev22__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- revoxx/__init__.py +9 -1
- revoxx/app.py +6 -0
- revoxx/controllers/display_controller.py +30 -0
- revoxx/controllers/navigation_controller.py +4 -25
- revoxx/controllers/process_manager.py +34 -0
- revoxx/controllers/session_controller.py +1 -4
- revoxx/dataset/exporter.py +121 -0
- revoxx/doc/USER_GUIDE.md +272 -0
- revoxx/ui/dialogs/dataset_dialog.py +108 -5
- revoxx/ui/dialogs/open_session_dialog.py +54 -6
- revoxx/ui/dialogs/session_settings_dialog.py +262 -88
- revoxx/ui/dialogs/user_guide_dialog.py +249 -0
- revoxx/ui/dialogs/utterance_list_base.py +50 -13
- revoxx/ui/icon.py +1 -44
- revoxx/ui/menus/application_menu.py +13 -1
- revoxx/ui/window_base.py +8 -3
- revoxx/ui/window_factory.py +23 -30
- revoxx/utils/device_manager.py +1 -1
- revoxx/utils/process_cleanup.py +12 -4
- revoxx/utils/settings_manager.py +3 -0
- {revoxx-1.0.0.dev22.dist-info → revoxx-1.0.2.dist-info}/METADATA +65 -10
- {revoxx-1.0.0.dev22.dist-info → revoxx-1.0.2.dist-info}/RECORD +27 -25
- scripts_module/vadiate.py +19 -7
- {revoxx-1.0.0.dev22.dist-info → revoxx-1.0.2.dist-info}/WHEEL +0 -0
- {revoxx-1.0.0.dev22.dist-info → revoxx-1.0.2.dist-info}/entry_points.txt +0 -0
- {revoxx-1.0.0.dev22.dist-info → revoxx-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {revoxx-1.0.0.dev22.dist-info → revoxx-1.0.2.dist-info}/top_level.txt +0 -0
revoxx/__init__.py
CHANGED
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
"""Revoxx Recorder - A tool for recording emotional speech."""
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
try:
    # Prefer versioningit so a development checkout reports its live VCS version.
    from pathlib import Path as _Path

    from versioningit import get_version

    # versioningit's get_version() takes ``project_dir`` (there is no ``root``
    # keyword). Anchor it at the directory containing this package instead of
    # the current working directory, so the result does not depend on where
    # the application was started from.
    __version__ = get_version(
        project_dir=_Path(__file__).resolve().parent.parent, config={}
    )
except Exception:
    # versioningit is missing or could not determine a version (e.g. when
    # running from an installed wheel): ask the installed distribution metadata.
    try:
        from importlib.metadata import version as _installed_version

        __version__ = _installed_version("revoxx")
    except Exception:
        # Last resort when neither source is available
        __version__ = "1.0.0+dev"
|
|
11
|
+
|
|
4
12
|
__author__ = "Grammatek"
|
|
5
13
|
|
|
6
14
|
# Only import main entry point to avoid circular imports
|
revoxx/app.py
CHANGED
|
@@ -688,6 +688,12 @@ class Revoxx:
|
|
|
688
688
|
# Tkinter might have changed it during setup
|
|
689
689
|
self.cleanup_manager.refresh_sigint_handler()
|
|
690
690
|
|
|
691
|
+
# Show user guide dialog if configured
|
|
692
|
+
if self.settings_manager.get_setting("show_user_guide_at_startup", True):
|
|
693
|
+
from .ui.dialogs.user_guide_dialog import UserGuideDialog
|
|
694
|
+
|
|
695
|
+
UserGuideDialog(self.window.window, self.settings_manager)
|
|
696
|
+
|
|
691
697
|
self.window.focus_window()
|
|
692
698
|
self.window.window.mainloop()
|
|
693
699
|
|
|
@@ -220,6 +220,36 @@ class DisplayController:
|
|
|
220
220
|
"""Reset the level meter display."""
|
|
221
221
|
self.reset_level_meters()
|
|
222
222
|
|
|
223
|
+
def format_take_status(self, label: str) -> str:
    """Build the status-bar text describing the current take of an utterance.

    Args:
        label: The utterance label (e.g., "utterance_001")

    Returns:
        - "" when the label is None or empty
        - The bare label when there are no active recordings
        - The bare label when the utterance has no takes, or the current
          take is not among the existing takes
        - "label - Take X/Y" otherwise, X being the 1-based position of the
          current take in the list of existing takes and Y their count
    """
    if not label:
        return ""

    recordings = self.app.active_recordings
    if not recordings:
        return label

    take = self.app.state.recording.get_current_take(label)
    takes = recordings.get_existing_takes(label)

    # Nothing to number if there are no takes or the current one is unknown
    if not takes or take not in takes:
        return label

    return f"{label} - Take {takes.index(take) + 1}/{len(takes)}"
|
|
252
|
+
|
|
223
253
|
def set_status(self, status: str, msg_type: MsgType = MsgType.TEMPORARY) -> None:
|
|
224
254
|
"""Set the status bar text.
|
|
225
255
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from typing import TYPE_CHECKING
|
|
4
4
|
|
|
5
|
-
from ..constants import FileConstants
|
|
5
|
+
from ..constants import FileConstants, MsgType
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from ..app import Revoxx
|
|
@@ -134,10 +134,6 @@ class NavigationController:
|
|
|
134
134
|
# Update info overlay if visible
|
|
135
135
|
if self.app.window.info_panel_visible:
|
|
136
136
|
self.app.display_controller.update_info_panel()
|
|
137
|
-
else:
|
|
138
|
-
# No more takes in that direction
|
|
139
|
-
direction_text = "forward" if direction > 0 else "backward"
|
|
140
|
-
self.app.display_controller.set_status(f"No more takes {direction_text}")
|
|
141
137
|
|
|
142
138
|
def find_utterance(self, index: int) -> None:
|
|
143
139
|
"""Navigate directly to a specific utterance by index.
|
|
@@ -252,15 +248,8 @@ class NavigationController:
|
|
|
252
248
|
if not current_label:
|
|
253
249
|
return
|
|
254
250
|
|
|
255
|
-
current_take = self.app.state.recording.get_current_take(current_label)
|
|
256
|
-
if not self.app.active_recordings:
|
|
257
|
-
existing_takes = []
|
|
258
|
-
else:
|
|
259
|
-
existing_takes = self.app.active_recordings.get_existing_takes(
|
|
260
|
-
current_label
|
|
261
|
-
)
|
|
262
|
-
|
|
263
251
|
# Update label with filename if we have a recording
|
|
252
|
+
current_take = self.app.state.recording.get_current_take(current_label)
|
|
264
253
|
if current_take > 0:
|
|
265
254
|
filename = f"take_{current_take:03d}{FileConstants.AUDIO_FILE_EXTENSION}"
|
|
266
255
|
self.app.window.update_label_with_filename(current_label, filename)
|
|
@@ -277,18 +266,8 @@ class NavigationController:
|
|
|
277
266
|
if second:
|
|
278
267
|
second.update_label_with_filename(current_label)
|
|
279
268
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
position = existing_takes.index(current_take) + 1
|
|
283
|
-
total = len(existing_takes)
|
|
284
|
-
self.app.display_controller.set_status(
|
|
285
|
-
f"{current_label} - Take {position}/{total}"
|
|
286
|
-
)
|
|
287
|
-
elif not existing_takes:
|
|
288
|
-
# Show label even without recordings
|
|
289
|
-
self.app.display_controller.set_status(f"{current_label}")
|
|
290
|
-
else:
|
|
291
|
-
self.app.display_controller.set_status(f"{current_label}")
|
|
269
|
+
status_text = self.app.display_controller.format_take_status(current_label)
|
|
270
|
+
self.app.display_controller.set_status(status_text, MsgType.DEFAULT)
|
|
292
271
|
|
|
293
272
|
def after_recording_saved(self, label: str) -> None:
|
|
294
273
|
"""Called after a recording has been saved to disk.
|
|
@@ -77,6 +77,9 @@ class ProcessManager:
|
|
|
77
77
|
self.set_audio_queue_active(False)
|
|
78
78
|
self.set_save_path(None)
|
|
79
79
|
|
|
80
|
+
# Check for VAD availability
|
|
81
|
+
self._check_vad_availability()
|
|
82
|
+
|
|
80
83
|
def start_processes(self) -> None:
|
|
81
84
|
"""Start background recording and playback processes."""
|
|
82
85
|
if self.app.debug:
|
|
@@ -322,3 +325,34 @@ class ProcessManager:
|
|
|
322
325
|
and self.playback_process is not None
|
|
323
326
|
and self.playback_process.is_alive()
|
|
324
327
|
)
|
|
328
|
+
|
|
329
|
+
def _check_vad_availability(self) -> None:
    """Probe for optional VAD support and store the result in manager_dict.

    VAD counts as available only when both ``scripts_module.vadiate`` and
    ``silero_vad`` can be imported. The outcome is written to
    ``manager_dict["vad_available"]`` when a manager dict exists. Because
    VAD is an optional feature, no import failure is allowed to propagate
    out of this probe.
    """
    try:
        # Keep the guarded region to the imports themselves
        from scripts_module import vadiate  # noqa: F401
        from silero_vad import load_silero_vad  # noqa: F401
    except ImportError:
        vad_available = False
    except Exception as exc:
        # NOTE(review): presumably covers import-time failures other than a
        # missing package (e.g. a dependency failing to initialise); such a
        # failure is treated the same as "not installed".
        vad_available = False
        if self.app.debug:
            print(f"[ProcessManager] VAD import failed: {exc!r}")
    else:
        vad_available = True

    if self.app.debug:
        if vad_available:
            print("[ProcessManager] VAD support is available")
        else:
            print("[ProcessManager] VAD support is not available")

    if self.manager_dict is not None:
        self.manager_dict["vad_available"] = vad_available
|
|
346
|
+
|
|
347
|
+
def is_vad_available(self) -> bool:
    """Check if VAD support is available.

    Reads the ``"vad_available"`` entry of ``manager_dict``.

    Returns:
        True if VAD is available; False when there is no manager dict, the
        entry is missing, or the lookup fails
    """
    # Identity check (as used when the flag is written) instead of a
    # truthiness test: evaluating truthiness would query the dict itself,
    # outside the protected lookup below.
    if self.manager_dict is None:
        return False

    try:
        # Coerce so the declared bool return type always holds
        return bool(self.manager_dict.get("vad_available", False))
    except (AttributeError, KeyError):
        return False
|
|
@@ -147,10 +147,7 @@ class SessionController:
|
|
|
147
147
|
self.reload_script_and_recordings()
|
|
148
148
|
|
|
149
149
|
# Then apply saved sort settings from session (after data is loaded)
|
|
150
|
-
|
|
151
|
-
self.app.active_recordings.set_sort(
|
|
152
|
-
session.sort_column, session.sort_reverse
|
|
153
|
-
)
|
|
150
|
+
self.app.active_recordings.set_sort(session.sort_column, session.sort_reverse)
|
|
154
151
|
|
|
155
152
|
self.app.window.window.title(f"Revoxx - {session.name}")
|
|
156
153
|
self.app.menu.update_recent_sessions()
|
revoxx/dataset/exporter.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Dataset exporter for converting Revoxx sessions to Talrómur 3 format."""
|
|
2
2
|
|
|
3
3
|
import shutil
|
|
4
|
+
import json
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import List, Dict, Tuple, Optional, Any
|
|
6
7
|
from collections import Counter
|
|
@@ -30,6 +31,7 @@ class DatasetExporter:
|
|
|
30
31
|
audio_format: str = "flac",
|
|
31
32
|
zero_intensity_emotions: List[str] = None,
|
|
32
33
|
include_intensity: bool = True,
|
|
34
|
+
include_vad: bool = False,
|
|
33
35
|
):
|
|
34
36
|
"""Initialize dataset exporter.
|
|
35
37
|
|
|
@@ -38,11 +40,13 @@ class DatasetExporter:
|
|
|
38
40
|
audio_format: Output audio format ('wav' or 'flac')
|
|
39
41
|
zero_intensity_emotions: List of emotions to set intensity to 0
|
|
40
42
|
include_intensity: Whether to include intensity column in index.tsv
|
|
43
|
+
include_vad: Whether to run VAD analysis on the exported dataset
|
|
41
44
|
"""
|
|
42
45
|
self.output_dir = Path(output_dir)
|
|
43
46
|
self.format = audio_format.lower()
|
|
44
47
|
self.zero_intensity_emotions = zero_intensity_emotions or ["neutral"]
|
|
45
48
|
self.include_intensity = include_intensity
|
|
49
|
+
self.include_vad = include_vad
|
|
46
50
|
|
|
47
51
|
def _group_sessions_by_speaker(self, session_paths: List[Path]) -> Dict:
|
|
48
52
|
"""Group sessions by speaker name.
|
|
@@ -172,6 +176,11 @@ class DatasetExporter:
|
|
|
172
176
|
}
|
|
173
177
|
)
|
|
174
178
|
|
|
179
|
+
# Run VAD processing if requested
|
|
180
|
+
if self.include_vad:
|
|
181
|
+
vad_stats = self._run_vad_processing(all_datasets, progress_callback)
|
|
182
|
+
total_statistics["vad_statistics"] = vad_stats
|
|
183
|
+
|
|
175
184
|
return all_datasets, total_statistics
|
|
176
185
|
|
|
177
186
|
def _process_emotion_group(
|
|
@@ -387,3 +396,115 @@ class DatasetExporter:
|
|
|
387
396
|
readme_path = dataset_dir / "README.txt"
|
|
388
397
|
with open(readme_path, "w", encoding="utf-8") as f:
|
|
389
398
|
f.write(readme_content)
|
|
399
|
+
|
|
400
|
+
def _run_vad_processing(
    self, dataset_paths: List[Path], progress_callback=None
) -> Dict:
    """Run VAD processing on exported datasets using multiprocessing.

    One task is submitted per dataset directory; each task analyzes every
    audio file inside that directory (see ``_process_dataset_vad``).

    Args:
        dataset_paths: List of dataset directories to process
        progress_callback: Optional progress callback (count, message),
            invoked after each dataset finishes

    Returns:
        Empty dict when VAD support cannot be imported or no audio files
        were found. Otherwise a dictionary with:
        - "total_files": number of audio files found before processing
        - "files_processed": number of files analyzed successfully
        - "warnings": list of warning/error messages
    """
    import os
    from concurrent.futures import ProcessPoolExecutor, as_completed

    # Only the optional VAD module can legitimately be missing, so it is the
    # only import that is guarded.
    try:
        from scripts_module.vadiate import get_audio_files
    except ImportError:
        return {}  # VAD not available

    # Count total files for progress
    total_files = sum(len(get_audio_files(str(d))) for d in dataset_paths)
    if total_files == 0:
        return {}

    vad_statistics = {
        "total_files": total_files,
        "files_processed": 0,
        "warnings": [],
    }

    # One worker per dataset, capped by the CPU count. os.cpu_count() may
    # return None when the count cannot be determined; fall back to 1.
    num_workers = min(os.cpu_count() or 1, len(dataset_paths))

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Submit one VAD processing task per dataset directory
        future_to_dataset = {
            executor.submit(self._process_dataset_vad, dataset_path): dataset_path
            for dataset_path in dataset_paths
        }

        # Collect results in completion order
        for future in as_completed(future_to_dataset):
            dataset_path = future_to_dataset[future]
            try:
                result = future.result()
                vad_statistics["files_processed"] += result["files_processed"]
                vad_statistics["warnings"].extend(result["warnings"])
                if progress_callback:
                    processed = vad_statistics["files_processed"]
                    progress_callback(
                        processed, f"VAD analysis: {processed}/{total_files}"
                    )
            except Exception as e:
                # Best effort: one failing dataset must not abort the others
                vad_statistics["warnings"].append(
                    f"VAD processing error for {dataset_path}: {e}"
                )

    return vad_statistics
|
|
458
|
+
|
|
459
|
+
@staticmethod
def _process_dataset_vad(dataset_path: Path) -> Dict:
    """Process VAD for a single dataset directory.

    Submitted to a process pool by ``_run_vad_processing``, one call per
    dataset. It loads its own VAD model, analyzes every audio file found
    under ``dataset_path`` and writes the collected results to ``vad.json``
    inside that directory.

    Args:
        dataset_path: Path to the dataset directory to analyze

    Returns:
        Dictionary with "files_processed" (number of files analyzed
        successfully) and "warnings" (list of messages)
    """
    from scripts_module.vadiate import (
        get_audio_files,
        process_audio,
        load_silero_vad,
    )

    dataset_path = Path(dataset_path)  # tolerate a plain string path
    vad_output = dataset_path / "vad.json"
    audio_files = get_audio_files(str(dataset_path))

    result_info = {"files_processed": 0, "warnings": []}

    # Nothing to analyze: skip model loading and do not write vad.json
    if not audio_files:
        return result_info

    # Load model for this process
    model = load_silero_vad()
    results = {}

    for file_path in audio_files:
        try:
            rel_path, result, warnings = process_audio(
                file_path,
                model,
                str(dataset_path),
                use_dynamic_threshold=True,
                collect_warnings=True,
            )
            results[rel_path] = result
            result_info["warnings"].extend(warnings)
            result_info["files_processed"] += 1
        except Exception as e:
            # A single unreadable file must not abort the whole dataset
            result_info["warnings"].append(f"VAD error for {file_path}: {e}")

    # Save results; explicit encoding matches how README.txt is written
    with open(vad_output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    return result_info
|
revoxx/doc/USER_GUIDE.md
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
# Revoxx User Guide
|
|
2
|
+
|
|
3
|
+
## Quick Start Guide
|
|
4
|
+
|
|
5
|
+
This guide will walk you through using Revoxx for speech recording.
|
|
6
|
+
|
|
7
|
+
## 1. Prepare Your Recording Script
|
|
8
|
+
|
|
9
|
+
Before you can start recording, you need to prepare a script containing all the utterances you want to record. Revoxx uses the Festival-style script format for organizing utterances.
|
|
10
|
+
|
|
11
|
+
### Using the Import Tool
|
|
12
|
+
|
|
13
|
+
The easiest way to create a script is through the built-in import tool:
|
|
14
|
+
|
|
15
|
+
1. Navigate to **Tools → Import Text to Script** in the menu bar.
|
|
16
|
+
2. Select your input text and output script file paths.
|
|
17
|
+
3. Configure the text splitting options:
|
|
18
|
+
- **Split by**: Choose how to divide your text into utterances:
|
|
19
|
+
- **Lines**: Each line in the input file becomes one utterance
|
|
20
|
+
- **Sentences**: Text is split at sentence boundaries (periods, question marks, exclamation points)
|
|
21
|
+
- **Paragraphs**: Each paragraph (separated by blank lines) becomes one utterance
|
|
22
|
+
- **Maximum characters**: Set a character limit for each utterance. The tool displays statistics for your input file showing the character count distribution to help you choose an appropriate limit.
|
|
23
|
+
4. Configure additional import settings:
|
|
24
|
+
- **Emotion levels**: If you're recording emotional speech with different intensities, you can choose between two methods:
|
|
25
|
+
- **Fixed levels**: Define specific emotion levels (e.g., 1-5) that will be assigned sequentially or randomly
|
|
26
|
+
- **Normal distribution**: Set parameters for a Gaussian distribution:
|
|
27
|
+
- **Mean**: The center of the distribution (average emotion level)
|
|
28
|
+
- **Standard deviation**: How spread out the emotion levels should be
|
|
29
|
+
- **Minimum/Maximum**: The bounds for the emotion levels
|
|
30
|
+
- The tool displays a graph showing the actual distribution of emotion levels that will be assigned to your utterances based on these parameters
|
|
31
|
+
- **Output location**: Select where the generated script file will be saved.
|
|
32
|
+
5. Click **Import** to generate your script file in the correct format.
|
|
33
|
+
|
|
34
|
+
### Script Format Examples
|
|
35
|
+
|
|
36
|
+
**Emotional speech:**
|
|
37
|
+
```
|
|
38
|
+
( spk1_happy_001 "happy-3: I love sunny days!" )
|
|
39
|
+
( spk1_sad_002 "sad-4: The rain makes me melancholy." )
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**Neutral speech:**
|
|
43
|
+
```
|
|
44
|
+
( spk1_001 "The weather forecast predicts sunshine." )
|
|
45
|
+
( spk1_002 "Tomorrow will be partly cloudy." )
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## 2. Create a New Recording Session
|
|
49
|
+
|
|
50
|
+
Once you have prepared your script, you can create a new recording session:
|
|
51
|
+
|
|
52
|
+
1. Select **File → New Session...** from the menu to open the session configuration dialog.
|
|
53
|
+
2. Configure the various session settings:
|
|
54
|
+
|
|
55
|
+
### Speaker Information
|
|
56
|
+
- **Speaker Name**: Enter the full name of the voice talent who will be recording.
|
|
57
|
+
- **Gender**: Select the appropriate gender option (M/F/X).
|
|
58
|
+
- **Emotion**: Choose either a specific emotion (happy, sad, angry, etc.) or select "neutral" for non-emotional recordings.
|
|
59
|
+
|
|
60
|
+
### Script Selection
|
|
61
|
+
- Click **Browse** to select your prepared script file.
|
|
62
|
+
- The script file determines which utterances will appear during the recording session.
|
|
63
|
+
|
|
64
|
+
### Session Storage
|
|
65
|
+
- **Base Directory**: Choose the parent folder where all your recording sessions will be stored.
|
|
66
|
+
- **Session Directory**: You can enter a descriptive name for this session; otherwise, a subfolder named after the speaker and emotion is created automatically.
|
|
67
|
+
|
|
68
|
+
### Audio Configuration
|
|
69
|
+
- **Input Device**: Select your audio interface from the dropdown list.
|
|
70
|
+
- **Sample Rate**: Choose at least 48000 Hz for professional quality recordings (44100 Hz is also acceptable).
|
|
71
|
+
- **Bit Depth**: Select 24-bit for optimal dynamic range.
|
|
72
|
+
- **Recording Format**: FLAC format is recommended as it provides losslessly compressed audio.
|
|
73
|
+
|
|
74
|
+
3. After configuring all settings, click **OK** to start your recording session.
|
|
75
|
+
|
|
76
|
+
## 3. Main Recording Interface
|
|
77
|
+
|
|
78
|
+
### Selecting Recording Standards
|
|
79
|
+
|
|
80
|
+
Before calibrating, select the appropriate recording standard for your project. Navigate to **Settings → Level Meter Preset** and choose from these industry standards:
|
|
81
|
+
|
|
82
|
+
- **EBU R128 broadcast** (default): For broadcast content with -23 LUFS integrated loudness
|
|
83
|
+
- **ACX/Audible audiobook**: For audiobook recording with RMS levels between -23 and -18 dB
|
|
84
|
+
- **Podcast standard**: For podcast production targeting -16 to -14 LUFS
|
|
85
|
+
- **Film dialog recording**: For film/video dialog with peaks between -27 and -20 dBFS
|
|
86
|
+
- **Music vocal recording**: For music production with peaks between -18 and -12 dBFS
|
|
87
|
+
|
|
88
|
+
Each preset automatically configures the level meter with appropriate target ranges, warning thresholds, and measurement windows specific to that recording standard.
|
|
89
|
+
|
|
90
|
+
### Before Recording - Calibration
|
|
91
|
+
|
|
92
|
+
It's important to calibrate your input levels before starting the recording session:
|
|
93
|
+
|
|
94
|
+
1. Press **M** to enter Monitor Mode. In this mode, you can see the input levels without recording.
|
|
95
|
+
2. Ask the speaker to test their voice at the volume they will use during recording.
|
|
96
|
+
3. Adjust the input gain on your audio interface until the levels fall within the target range for your selected preset (shown in dotted lines on the meter).
|
|
97
|
+
4. Press **M** again to exit Monitor Mode and return to normal recording mode.
|
|
98
|
+
|
|
99
|
+
### Recording Controls
|
|
100
|
+
|
|
101
|
+
| Key | Action |
|
|
102
|
+
|--------------|-----------------------------------------------------------------|
|
|
103
|
+
| **SPACE** | Start/Stop recording |
|
|
104
|
+
| **P** | Play the current take |
|
|
105
|
+
| **⌘/Ctrl+D** | Delete the current take (it moves to trash folder subdirectory) |
|
|
106
|
+
| **↑/↓** | Navigate between utterances in the list |
|
|
107
|
+
| **←/→** | Navigate between different takes of the current utterance |
|
|
108
|
+
| **⌘/Ctrl+U** | Change the utterance ordering method |
|
|
109
|
+
| **F1** | Show all keyboard shortcuts |
|
|
110
|
+
|
|
111
|
+
**Important:** When multiple takes exist for an utterance, the system always uses the most recent take for export. You can navigate through all takes using the left/right arrow keys to review them, but only the last recorded take will be included in the final dataset.
|
|
112
|
+
|
|
113
|
+
### Recording Process
|
|
114
|
+
|
|
115
|
+
When recording each utterance, follow these steps:
|
|
116
|
+
|
|
117
|
+
1. Speaker takes a deep breath before recording starts
|
|
118
|
+
2. Press **SPACE** to start recording
|
|
119
|
+
3. Wait 1-2 seconds (speaker holds breath)
|
|
120
|
+
4. Speaker reads the utterance
|
|
121
|
+
5. Speaker holds breath at the end
|
|
122
|
+
6. Wait 1-2 seconds before pressing **SPACE** to stop
|
|
123
|
+
7. Review the spectrogram or press **P** to play
|
|
124
|
+
|
|
125
|
+
**Breathing**: There should be no breath sounds at the beginning or end of a recording. The speaker should inhale before recording starts and hold their breath when finished.
|
|
126
|
+
|
|
127
|
+
**Why silence matters**: The pauses at the beginning and end of each recording serve two purposes:
|
|
128
|
+
|
|
129
|
+
- They provide clean boundaries for audio processing algorithms
|
|
130
|
+
- When VAD (Voice Activity Detection) is enabled during export, these silence regions help the system accurately detect speech boundaries and help trim recordings to consistent silence lengths
|
|
131
|
+
|
|
132
|
+
**VAD Processing**: During dataset export, if VAD analysis is enabled, the system will:
|
|
133
|
+
|
|
134
|
+
- Detect the exact start and end of speech in each recording
|
|
135
|
+
- Generate precise timing information for downstream TTS training
|
|
136
|
+
|
|
137
|
+
### Display Options
|
|
138
|
+
|
|
139
|
+
| Key | Action |
|
|
140
|
+
|---------------|-------------------------------|
|
|
141
|
+
| **M** | Toggle meter display |
|
|
142
|
+
| **I** | Toggle info display at bottom |
|
|
143
|
+
| **F10** | Fullscreen main window |
|
|
144
|
+
| **Shift+F10** | Fullscreen speaker window |
|
|
145
|
+
|
|
146
|
+
### Spectrogram Navigation
|
|
147
|
+
|
|
148
|
+
The spectrograms provide visual feedback of your recordings. You can interact with them using:
|
|
149
|
+
|
|
150
|
+
- **Mouse wheel**: Scroll up to zoom in for more detail, scroll down to zoom out.
|
|
151
|
+
- **Right-click + drag**: When zoomed in, hold the right mouse button and drag to pan left or right through the spectrogram.
|
|
152
|
+
|
|
153
|
+
## 4. Multi-Screen Setup
|
|
154
|
+
|
|
155
|
+
Revoxx supports dual-screen setups, allowing you to have separate displays for the engineer and the speaker.
|
|
156
|
+
|
|
157
|
+
### Enabling Speaker Display
|
|
158
|
+
|
|
159
|
+
1. Navigate to **Settings → 2nd Window → Enable 2nd Window** in the menu.
|
|
160
|
+
2. A second window will appear that can mirror the main recording interface or be configured for minimal display.
|
|
161
|
+
3. Drag this window to your second monitor, external display, or iPad.
|
|
162
|
+
4. Configure what appears on the second window through the settings:
|
|
163
|
+
- **Full Interface**: Shows everything from the main window including spectrograms and meters.
|
|
164
|
+
- **Minimal Mode**: Shows only the utterance text and status bar to maximize screen space for the text.
|
|
165
|
+
- You can toggle individual elements like spectrograms, meters, and info displays.
|
|
166
|
+
|
|
167
|
+
The minimal mode is recommended for speakers as it removes technical distractions and makes the text as large as possible.
|
|
168
|
+
|
|
169
|
+
### Using iPad with Sidecar (macOS)
|
|
170
|
+
|
|
171
|
+
For a portable dual-screen setup on macOS:
|
|
172
|
+
|
|
173
|
+
1. Connect your iPad to your Mac using the Sidecar feature.
|
|
174
|
+
2. Once connected, simply drag the speaker window to the iPad display.
|
|
175
|
+
3. The speaker can now comfortably read from the iPad while you control the recording from your main screen.
|
|
176
|
+
|
|
177
|
+
## 5. Session Management
|
|
178
|
+
|
|
179
|
+
### Utterance Ordering
|
|
180
|
+
|
|
181
|
+
The order in which utterances appear during recording can be customized to suit your workflow:
|
|
182
|
+
|
|
183
|
+
- By default, utterances appear in the order they are listed in the script file.
|
|
184
|
+
- Press **⌘/Ctrl+U** to open the utterance ordering dialog where you can sort by:
|
|
185
|
+
- **Label**: Alphabetical order by utterance ID
|
|
186
|
+
- **Emotion level**: Groups utterances by emotion and intensity
|
|
187
|
+
- **Text content**: Alphabetical order by the actual text
|
|
188
|
+
- **Text length**: Shortest to longest utterances
|
|
189
|
+
- **Number of takes**: Prioritize utterances with fewer recordings
|
|
190
|
+
|
|
191
|
+
For each sorting option, you can also choose the sort direction.
|
|
192
|
+
|
|
193
|
+
### Finding Utterances
|
|
194
|
+
|
|
195
|
+
When you need to locate specific utterances quickly:
|
|
196
|
+
|
|
197
|
+
- Use **Edit → Find Utterance** (⌘/Ctrl+F) to open the search dialog.
|
|
198
|
+
- You can search by typing any part of the utterance text.
|
|
199
|
+
- The search results can be sorted using the same criteria as the ordering options.
|
|
200
|
+
- Double-click any result to jump directly to that utterance.
|
|
201
|
+
|
|
202
|
+
### Session Progress
|
|
203
|
+
|
|
204
|
+
Revoxx automatically manages your recording progress:
|
|
205
|
+
|
|
206
|
+
- Your progress is saved automatically after each recording.
|
|
207
|
+
- Use **File → Recent Sessions** to see a list of your recent work and quickly resume where you left off.
|
|
208
|
+
- All session settings, including audio configuration and display preferences, are preserved between sessions.
|
|
209
|
+
|
|
210
|
+
## 6. Best Practices
|
|
211
|
+
|
|
212
|
+
### Recording Environment
|
|
213
|
+
|
|
214
|
+
Creating the right recording environment is essential for high-quality results:
|
|
215
|
+
|
|
216
|
+
1. **Acoustic Treatment**: Use a professionally treated room or vocal booth to minimize reflections and external noise.
|
|
217
|
+
2. **Microphone Position**: Maintain a consistent distance of 6-12 inches from the microphone. Use the monitoring mode to verify that levels remain stable as the speaker moves.
|
|
218
|
+
3. **Pop Filter**: Always use a pop filter positioned between the speaker and microphone to prevent plosive sounds from ruining takes.
|
|
219
|
+
4. **Headphones**: Provide closed-back headphones for the voice talent to prevent audio bleed into the microphone.
|
|
220
|
+
|
|
221
|
+
### Recording Workflow
|
|
222
|
+
|
|
223
|
+
Follow these guidelines for efficient and consistent recording sessions:
|
|
224
|
+
|
|
225
|
+
1. **Warm-up**: Always have the speaker perform vocal exercises before starting to ensure their voice is ready.
|
|
226
|
+
2. **Consistency**: Help the speaker maintain the same sitting position and energy level throughout the session.
|
|
227
|
+
3. **Breathing technique**: For longer utterances with natural pauses, breathing between phrases is acceptable and natural. However, ensure no breath sounds at the beginning or end of the recording.
|
|
228
|
+
4. **Breaks**: Schedule regular breaks every 30-45 minutes to prevent vocal fatigue and maintain quality.
|
|
229
|
+
5. **Multiple Takes**: For difficult or important utterances, record 2-3 takes to have options during dataset creation.
|
|
230
|
+
6. **Review**: Periodically stop to review recent recordings and ensure quality standards are being maintained.
|
|
231
|
+
|
|
232
|
+
## 7. Export and Dataset Creation
|
|
233
|
+
|
|
234
|
+
Once you have completed recording, you can export your sessions into datasets suitable for TTS training:
|
|
235
|
+
|
|
236
|
+
1. Select **File → Export Dataset** from the menu to open the export dialog.
|
|
237
|
+
2. Choose which recording sessions you want to include in the dataset. You can select multiple sessions from any speaker. Datasets are combined by speaker name automatically.
|
|
238
|
+
3. Configure the export options according to your needs:
|
|
239
|
+
- The T3 (Talrómur 3) format is chosen automatically.
|
|
240
|
+
- **VAD analysis**: If VAD support is installed, you can generate voice activity timestamps for each audio file.
|
|
241
|
+
4. Click **Export** to create an organized dataset structure that's ready for TTS model training. The export process will handle file naming, metadata generation, and directory organization automatically.
|
|
242
|
+
|
|
243
|
+
## 8. Troubleshooting
|
|
244
|
+
|
|
245
|
+
### Audio Issues
|
|
246
|
+
|
|
247
|
+
If you encounter audio problems during recording:
|
|
248
|
+
|
|
249
|
+
- **No input signal**: Check that your audio interface is properly connected and that the correct drivers are installed. Verify that the input device is selected correctly and you have selected the correct channel.
|
|
250
|
+
- **Clipping or distortion**: Reduce the input gain on your audio interface. The peaks should never exceed -3 dB during normal speech.
|
|
251
|
+
- **Audio dropouts**: This may indicate buffer size issues. Close unnecessary applications and ensure your computer meets the performance requirements.
|
|
252
|
+
|
|
253
|
+
### Display Issues
|
|
254
|
+
|
|
255
|
+
For problems with the visual interface:
|
|
256
|
+
|
|
257
|
+
- **Text too small/large**: The text size adjusts automatically based on window size. Simply resize the window to change the text size.
|
|
258
|
+
- **Speaker window position lost**: If the second window appears off-screen, go to Settings → 2nd Window and disable/re-enable it.
|
|
259
|
+
- **Spectrogram not updating**: Press 'M' to toggle the meter display off and on again. This will refresh the visualization.
|
|
260
|
+
|
|
261
|
+
### Performance
|
|
262
|
+
|
|
263
|
+
To maintain optimal performance:
|
|
264
|
+
|
|
265
|
+
- **Slow response**: Close other CPU-intensive applications while recording. Audio processing requires significant resources. We have had the best results with Apple silicon (M-series) Macs.
|
|
266
|
+
- **Disk space**: Ensure you have adequate free space on your recording drive. Each hour of uncompressed mono 48 kHz/24-bit audio requires approximately 500 MB; FLAC files are smaller.
|
|
267
|
+
- **Memory usage**: If the application becomes sluggish after extended use, save your session and restart Revoxx to clear the memory.
|
|
268
|
+
|
|
269
|
+
## Need Help?
|
|
270
|
+
|
|
271
|
+
- Press **F1** for keyboard shortcuts
|
|
272
|
+
- Report issues at: https://github.com/icelandic-lt/revoxx/issues
|