monkeyplug-enhanced 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2892 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import base64
6
+ import errno
7
+ import importlib
8
+ import importlib.metadata
9
+ import importlib.util
10
+ import json
11
+ import mmguero
12
+ import mutagen
13
+ import os
14
+ import pathlib
15
+ import requests
16
+ import shutil
17
+ import string
18
+ import sys
19
+ import wave
20
+
21
+ from urllib.parse import urlparse
22
+ from itertools import tee
23
+
24
+ ###################################################################################################
25
# placeholder tokens inside the ffmpeg parameter templates; Plugger.__init__ swaps them for real values
CHANNELS_REPLACER = 'CHANNELS'
SAMPLE_RATE_REPLACER = 'SAMPLE'
BIT_RATE_REPLACER = 'BITRATE'
VORBIS_QSCALE_REPLACER = 'QSCALE'

# argument fragments shared by several of the per-format templates below
_BIT_RATE_ARGS = ["-b:a", BIT_RATE_REPLACER]
_RATE_AND_CHANNEL_ARGS = ["-ar", SAMPLE_RATE_REPLACER, "-ac", CHANNELS_REPLACER]

# default ffmpeg audio encoding arguments, keyed by output file format (extension without the dot)
AUDIO_DEFAULT_PARAMS_BY_FORMAT = {
    "flac": ["-c:a", "flac", *_RATE_AND_CHANNEL_ARGS],
    "m4a": ["-c:a", "aac", *_BIT_RATE_ARGS, *_RATE_AND_CHANNEL_ARGS],
    "aac": ["-c:a", "aac", *_BIT_RATE_ARGS, *_RATE_AND_CHANNEL_ARGS],
    "mp3": ["-c:a", "libmp3lame", *_BIT_RATE_ARGS, *_RATE_AND_CHANNEL_ARGS],
    "ogg": ["-c:a", "libvorbis", "-qscale:a", VORBIS_QSCALE_REPLACER, *_RATE_AND_CHANNEL_ARGS],
    "opus": ["-c:a", "libopus", *_BIT_RATE_ARGS, *_RATE_AND_CHANNEL_ARGS],
    "ac3": ["-c:a", "ac3", *_BIT_RATE_ARGS, *_RATE_AND_CHANNEL_ARGS],
    "wav": ["-c:a", "pcm_s16le", *_RATE_AND_CHANNEL_ARGS],
}

# audio codec name (as reported for an input stream) -> output file format
AUDIO_CODEC_TO_FORMAT = dict(
    aac="m4a",
    ac3="ac3",
    flac="flac",
    mp3="mp3",
    opus="opus",
    vorbis="ogg",
    pcm_s16le="wav",
)

# audio encoding defaults
AUDIO_DEFAULT_FORMAT = "mp3"
AUDIO_DEFAULT_CHANNELS = 2
AUDIO_DEFAULT_SAMPLE_RATE = 48000
AUDIO_DEFAULT_BIT_RATE = "256K"
AUDIO_DEFAULT_VORBIS_QSCALE = 5
AUDIO_MATCH_FORMAT = "MATCH"
AUDIO_INTERMEDIATE_PARAMS = ["-c:a", "pcm_s16le", "-ac", "1", "-ar", "16000"]
AUDIO_DEFAULT_WAV_FRAMES_CHUNK = 8000

# beep defaults
BEEP_HERTZ_DEFAULT = 1000
BEEP_MIX_NORMALIZE_DEFAULT = False
BEEP_AUDIO_WEIGHT_DEFAULT = 1
BEEP_SINE_WEIGHT_DEFAULT = 1
BEEP_DROPOUT_TRANSITION_DEFAULT = 0

# profanity list and "already processed" tagging defaults
SWEARS_FILENAME_DEFAULT = 'swears.txt'
MUTAGEN_METADATA_TAGS = ['encodedby', 'comment']
MUTAGEN_METADATA_TAG_VALUE = 'monkeyplug'

# speech recognition modes; the default mode and the model locations can be overridden by environment variables
SPEECH_REC_MODE_VOSK = "vosk"
SPEECH_REC_MODE_WHISPER = "whisper"
SPEECH_REC_MODE_GROQ = "groq"
DEFAULT_SPEECH_REC_MODE = os.getenv("MONKEYPLUG_MODE", SPEECH_REC_MODE_GROQ)
DEFAULT_VOSK_MODEL_DIR = os.getenv("VOSK_MODEL_DIR", os.path.join(os.path.expanduser("~"), '.cache', 'vosk'))
DEFAULT_WHISPER_MODEL_DIR = os.getenv(
    "WHISPER_MODEL_DIR", os.path.join(os.path.expanduser("~"), '.cache', 'whisper')
)
DEFAULT_WHISPER_MODEL_NAME = os.getenv("WHISPER_MODEL_NAME", "small.en")
DEFAULT_TORCH_THREADS = 0
86
+
87
+ ###################################################################################################
88
# Work out where this script lives, whether it was imported as a module or executed directly.
# The script name is fixed either way.
script_name = 'monkeyplug.py'
try:
    # usual case: the module knows its own file
    script_path = os.path.dirname(os.path.realpath(__file__))
except (NameError, TypeError):
    # __file__ is unavailable: fall back to argv[0], then to the current working directory
    if sys.argv and sys.argv[0]:
        script_path = os.path.dirname(os.path.realpath(sys.argv[0]))
    else:
        script_path = os.getcwd()
97
+
98
+
99
+ # thanks https://docs.python.org/3/library/itertools.html#recipes
100
def pairwise(iterable):
    """Return an iterator of overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ...

    An input with fewer than two items yields nothing.
    """
    leading, trailing = tee(iterable, 2)
    # advance the second copy by one item so the two copies are offset by a single position
    next(trailing, None)
    return zip(leading, trailing)
104
+
105
+
106
def scrubword(value):
    """Normalize a word for comparison against the profanity map.

    The value is converted to str, lower-cased, has the typographic apostrophe
    (U+2019) replaced by a plain ASCII apostrophe, and is then stripped of
    leading/trailing punctuation AND whitespace.

    Whitespace is stripped as well as punctuation because str.strip() stops at
    the first character not in its set: with punctuation only, " damn," kept its
    leading space (and "damn, " kept everything), so such tokens never matched.

    Returns the normalized string, which may be empty (e.g. for "...").
    """
    return str(value).lower().replace("\u2019", "'").strip(string.punctuation + string.whitespace)
108
+
109
+
110
+ ###################################################################################################
111
+ # download to file
112
def DownloadToFile(url, local_filename=None, chunk_bytes=4096, debug=False, timeout=60):
    """Download url to a local file and return the local file name.

    url            -- the URL to fetch (redirects are followed)
    local_filename -- destination path; defaults to the base name of the URL's path
    chunk_bytes    -- size of the chunks streamed to disk
    debug          -- print the outcome to stderr
    timeout        -- seconds passed to requests as the connect/read timeout, so a
                      stalled server cannot hang the program forever

    Returns the destination path on success, or None when the server answered
    with an error status or the downloaded file is empty (an empty file is
    removed). Connection errors raised by requests propagate to the caller.
    """
    tmpDownloadedFileSpec = local_filename if local_filename else os.path.basename(urlparse(url).path)

    # the context manager releases the streamed connection even if writing fails
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as r:
        if not r.ok:
            # don't save an HTTP error page as if it were the requested media file
            if debug:
                mmguero.eprint(f"Download of {url} failed (HTTP {r.status_code})")
            return None
        with open(tmpDownloadedFileSpec, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_bytes):
                if chunk:
                    f.write(chunk)

    fExists = os.path.isfile(tmpDownloadedFileSpec)
    fSize = os.path.getsize(tmpDownloadedFileSpec)
    if debug:
        mmguero.eprint(
            f"Download of {url} to {tmpDownloadedFileSpec} {'succeeded' if fExists else 'failed'} ({mmguero.size_human_format(fSize)})"
        )

    if fExists and (fSize > 0):
        return tmpDownloadedFileSpec

    # nothing useful was downloaded: don't leave an empty file behind
    if fExists:
        os.remove(tmpDownloadedFileSpec)
    return None
132
+
133
+
134
+ ###################################################################################################
135
+ # Get tag from file to indicate monkeyplug has already been set
136
def GetMonkeyplugTagged(local_filename, debug=False):
    """Report whether local_filename already carries the monkeyplug marker tag.

    Each tag named in MUTAGEN_METADATA_TAGS is checked for
    MUTAGEN_METADATA_TAG_VALUE. Returns True on the first match; returns False
    when the file does not exist, exposes no tag mapping, or holds no match.
    Errors raised while reading an individual tag are ignored (printed when
    debug is set).
    """
    if not os.path.isfile(local_filename):
        return False

    tagged_file = mutagen.File(local_filename, easy=True)
    if debug:
        mmguero.eprint(f'Tags of {local_filename}: {tagged_file}')

    # mutagen.File may hand back something without a mapping interface
    if not hasattr(tagged_file, 'get'):
        return False

    for tag_name in MUTAGEN_METADATA_TAGS:
        try:
            if MUTAGEN_METADATA_TAG_VALUE in mmguero.get_iterable(tagged_file.get(tag_name, default=())):
                return True
        except Exception as e:
            # this tag could not be read; move on to the next candidate
            if debug:
                mmguero.eprint(e)

    return False
152
+
153
+
154
+ ###################################################################################################
155
+ # Set tag to file to indicate monkeyplug has worked its magic
156
def SetMonkeyplugTag(local_filename, debug=False):
    """Write the monkeyplug marker tag into local_filename.

    The tags in MUTAGEN_METADATA_TAGS are tried in order and the first one that
    accepts MUTAGEN_METADATA_TAG_VALUE is used; the file is then saved.
    Returns True only if a tag was set and the save succeeded. A save error is
    always printed; other diagnostics are printed only when debug is set.
    """
    if not os.path.isfile(local_filename):
        return False

    audio = mutagen.File(local_filename, easy=True)
    if debug:
        mmguero.eprint(f'Tags of {local_filename} before: {audio}')

    # nothing can be written if the object offers no item assignment
    if not hasattr(audio, '__setitem__'):
        return False

    tagged = False
    for tag_name in MUTAGEN_METADATA_TAGS:
        try:
            audio[tag_name] = MUTAGEN_METADATA_TAG_VALUE
        except Exception as e:
            # this tag is not accepted for the file type; try the next one
            if debug:
                mmguero.eprint(e)
        else:
            tagged = True
            break

    if tagged:
        try:
            audio.save(local_filename)
        except Exception as e:
            tagged = False
            mmguero.eprint(e)

    if debug:
        mmguero.eprint(f'Tags of {local_filename} after: {audio}')

    return tagged
181
+
182
+
183
+ ###################################################################################################
184
+ # get stream codecs from an input filename
185
+ # e.g. result: {'video': {'h264'}, 'audio': {'eac3'}, 'subtitle': {'subrip'}}
186
def GetCodecs(local_filename, debug=False):
    """Collect the stream codecs and container format names of a media file via ffprobe.

    Returns a dict mapping each lower-cased codec type to the set of lower-cased
    codec names found for it, plus a 'format' entry holding the container
    format names (the comma-separated ffprobe value split into a list).
    e.g. {'video': {'h264'}, 'audio': {'eac3'}, 'subtitle': {'subrip'}, 'format': [...]}
    Returns an empty dict if local_filename is not an existing file.

    Raises ValueError if ffprobe exits with a non-zero status.
    """
    codecs = {}
    if not os.path.isfile(local_filename):
        return codecs

    probe_cmd = ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', '-show_streams', local_filename]
    exit_code, probe_output = mmguero.run_process(probe_cmd, stdout=True, stderr=False, debug=debug)

    if exit_code != 0:
        # show what was run and what came back before giving up
        mmguero.eprint(' '.join(mmguero.flatten(probe_cmd)))
        mmguero.eprint(exit_code)
        mmguero.eprint(probe_output)
        raise ValueError(f"Could not analyze {local_filename}")

    probe_output = mmguero.load_str_if_json(' '.join(probe_output))
    if 'streams' in probe_output:
        for stream in probe_output['streams']:
            if 'codec_name' in stream and 'codec_type' in stream:
                codecs.setdefault(stream['codec_type'].lower(), set()).add(stream['codec_name'].lower())

    # format_name is a comma-separated string such as "mov,mp4,m4a"; store it as a list
    container = mmguero.deep_get(probe_output, ['format', 'format_name'])
    codecs['format'] = container.split(',') if isinstance(container, str) else container

    return codecs
221
+
222
+
223
+ #################################################################################
224
class Plugger(object):
    # Class-level defaults for the attributes used by the methods below.
    # NOTE: the list and dict defaults are single objects shared by every instance
    # until an instance rebinds the attribute. In-place mutation through self
    # (e.g. self.swearsMap[key] = ... in the swears loaders) changes the shared object.
    debug = False
    inputFileSpec = ""
    inputCodecs = {}
    # (root, extension) pair from os.path.splitext() of the input file
    inputFileParts = None
    outputFileSpec = ""
    outputAudioFileFormat = ""
    outputVideoFileFormat = ""
    outputJson = ""
    # local path of an input that had to be downloaded first
    tmpDownloadedFileSpec = ""
    swearsFileSpec = ""
    # scrubbed profanity word -> replacement text
    swearsMap = {}
    wordList = []
    naughtyWordList = []
    # for beep and mute
    muteTimeList = []
    # for beep only
    sineTimeList = []
    beepDelayList = []
    padSecPre = 0.0
    padSecPost = 0.0
    beep = False
    beepHertz = BEEP_HERTZ_DEFAULT
    beepMixNormalize = BEEP_MIX_NORMALIZE_DEFAULT
    beepAudioWeight = BEEP_AUDIO_WEIGHT_DEFAULT
    beepSineWeight = BEEP_SINE_WEIGHT_DEFAULT
    beepDropTransition = BEEP_DROPOUT_TRANSITION_DEFAULT
    forceDespiteTag = False
    # ffmpeg audio encoding arguments
    aParams = None
    tags = None
    # for instrumental splicing
    instrumentalFileSpec = ""
    instrumentalMode = False
    instrumentalSegments = []  # List of (start, end) tuples for profanity sections
258
+
259
+ ######## init #################################################################
260
def __init__(
    self,
    iFileSpec,
    oFileSpec,
    oAudioFileFormat,
    iSwearsFileSpec,
    outputJson,
    inputTranscript=None,
    saveTranscript=False,
    forceRetranscribe=False,
    aParams=None,
    aChannels=AUDIO_DEFAULT_CHANNELS,
    aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
    aBitRate=AUDIO_DEFAULT_BIT_RATE,
    aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
    padMsecPre=0,
    padMsecPost=0,
    beep=False,
    beepHertz=BEEP_HERTZ_DEFAULT,
    beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
    beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
    beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
    beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
    force=False,
    dbug=False,
    instrumentalFileSpec=None,
):
    """Resolve input/output files, encoding parameters and the profanity list.

    iFileSpec            -- local input file, or an http(s) URL to download
    oFileSpec            -- output file; defaults to "<input root>_clean" plus an extension
    oAudioFileFormat     -- output audio format, or AUDIO_MATCH_FORMAT to follow the input
    iSwearsFileSpec      -- optional custom profanity file (ignored if it does not exist)
    outputJson           -- optional path for the transcript JSON
    inputTranscript      -- optional existing transcript JSON to use
    saveTranscript       -- derive a transcript path from the output name when none is given,
                            and reuse a transcript already present at that path
    forceRetranscribe    -- never reuse an existing transcript automatically
    aParams              -- custom ffmpeg audio arguments as one space-separated string,
                            optionally "base64:"-prefixed and base64 encoded
    aChannels, aSampleRate, aBitRate, aVorbisQscale
                         -- values substituted for the placeholder tokens in the arguments
    padMsecPre, padMsecPost
                         -- padding in milliseconds around each profane word
    beep, beepHertz, beepMixNormalize, beepAudioWeight, beepSineWeight, beepDropTransition
                         -- beep settings, stored as given
    force                -- stored as forceDespiteTag
    dbug                 -- print diagnostics to stderr
    instrumentalFileSpec -- optional instrumental track; enables instrumental mode

    Raises IOError (ENOENT) if the input or instrumental file cannot be found,
    and ValueError if the output format cannot be determined or is unsupported,
    the output path is the input file itself, or the instrumental track is
    shorter than the input.
    """
    self.padSecPre = padMsecPre / 1000.0
    self.padSecPost = padMsecPost / 1000.0
    self.beep = beep
    self.beepHertz = beepHertz
    self.beepMixNormalize = beepMixNormalize
    self.beepAudioWeight = beepAudioWeight
    self.beepSineWeight = beepSineWeight
    self.beepDropTransition = beepDropTransition
    self.forceDespiteTag = force
    self.debug = dbug
    self.outputJson = outputJson
    self.inputTranscript = inputTranscript
    self.saveTranscript = saveTranscript

    # give this instance its own containers: the class-level defaults are shared objects,
    # so mutating them in place would leak state (e.g. custom swears) between instances
    self.swearsMap = {}
    self.wordList = []
    self.naughtyWordList = []
    self.muteTimeList = []
    self.sineTimeList = []
    self.beepDelayList = []
    self.instrumentalSegments = []

    # determine input file name, or download and save file
    if (iFileSpec is not None) and os.path.isfile(iFileSpec):
        self.inputFileSpec = iFileSpec
    elif (iFileSpec is not None) and iFileSpec.lower().startswith("http"):
        self.tmpDownloadedFileSpec = DownloadToFile(iFileSpec, debug=self.debug)
        if (self.tmpDownloadedFileSpec is not None) and os.path.isfile(self.tmpDownloadedFileSpec):
            self.inputFileSpec = self.tmpDownloadedFileSpec
        else:
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), iFileSpec)
    else:
        # also reached for iFileSpec=None, which used to fail with AttributeError
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), iFileSpec)

    # input file should exist locally by now
    if os.path.isfile(self.inputFileSpec):
        self.inputFileParts = os.path.splitext(self.inputFileSpec)
        self.inputCodecs = GetCodecs(self.inputFileSpec, debug=self.debug)
        # first container format name we know how to encode; 'format' may be missing or None
        inputFormat = next(
            (x for x in (self.inputCodecs.get('format', None) or []) if x in AUDIO_DEFAULT_PARAMS_BY_FORMAT), None
        )
    else:
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), self.inputFileSpec)

    # determine output file name (either specified or based on input filename)
    self.outputFileSpec = oFileSpec if oFileSpec else self.inputFileParts[0] + "_clean"
    outParts = os.path.splitext(self.outputFileSpec)
    if (
        ((not oAudioFileFormat) or (str(oAudioFileFormat).upper() == AUDIO_MATCH_FORMAT))
        and oFileSpec
        and (len(outParts) > 1)
        and outParts[1]
    ):
        # an explicit output file name with an extension decides the format
        oAudioFileFormat = outParts[1]

    if str(oAudioFileFormat).upper() == AUDIO_MATCH_FORMAT:
        # output format not specified, base on input filename matching extension (or codec)
        # NOTE(review): inputFileParts[1] still carries its leading dot while the dict keys do
        # not, so this first test looks like it can never succeed -- confirm the intent
        if self.inputFileParts[1] in AUDIO_DEFAULT_PARAMS_BY_FORMAT:
            self.outputFileSpec = self.outputFileSpec + self.inputFileParts[1]
        elif str(inputFormat).lower() in AUDIO_DEFAULT_PARAMS_BY_FORMAT:
            self.outputFileSpec = self.outputFileSpec + '.' + inputFormat.lower()
        else:
            for codec in mmguero.get_iterable(self.inputCodecs.get('audio', [])):
                if codec.lower() in AUDIO_CODEC_TO_FORMAT:
                    self.outputFileSpec = self.outputFileSpec + '.' + AUDIO_CODEC_TO_FORMAT[codec.lower()]
                    break

    elif oAudioFileFormat:
        # output filename not specified, base on input filename with specified format
        newSuffix = '.' + oAudioFileFormat.lower().lstrip('.')
        self.outputFileSpec = mmguero.remove_suffix(self.outputFileSpec, newSuffix) + newSuffix

    else:
        # can't determine what output file audio format should be
        raise ValueError("Output file audio format unspecified")

    # determine output file extension if it's not already obvious
    outParts = os.path.splitext(self.outputFileSpec)
    self.outputAudioFileFormat = outParts[1].lower().lstrip('.')

    if (not self.outputAudioFileFormat) or (
        (not aParams) and (self.outputAudioFileFormat not in AUDIO_DEFAULT_PARAMS_BY_FORMAT)
    ):
        raise ValueError("Output file audio format unspecified or unsupported")
    elif not aParams:
        # we're using ffmpeg encoding params based on output file format
        self.aParams = AUDIO_DEFAULT_PARAMS_BY_FORMAT[self.outputAudioFileFormat]
    else:
        # they specified custom ffmpeg encoding params
        self.aParams = aParams
        if self.aParams.startswith("base64:"):
            self.aParams = base64.b64decode(self.aParams[7:]).decode("utf-8")
        self.aParams = self.aParams.split(' ')

    # swap the placeholder tokens for the requested values (this builds a new list,
    # so the shared default templates are never modified)
    replacements = {
        CHANNELS_REPLACER: str(aChannels),
        SAMPLE_RATE_REPLACER: str(aSampleRate),
        BIT_RATE_REPLACER: str(aBitRate),
        VORBIS_QSCALE_REPLACER: str(aVorbisQscale),
    }
    self.aParams = [replacements.get(aParam, aParam) for aParam in self.aParams]

    # if we're actually just replacing the audio stream(s) inside a video file, the actual output file is still a video file
    self.outputVideoFileFormat = (
        self.inputFileParts[1]
        if (
            (len(mmguero.get_iterable(self.inputCodecs.get('video', []))) > 0)
            and (str(oAudioFileFormat).upper() == AUDIO_MATCH_FORMAT)
        )
        else ''
    )
    if self.outputVideoFileFormat:
        self.outputFileSpec = outParts[0] + self.outputVideoFileFormat

    # create output directory if it doesn't exist
    self._ensure_directory_exists(self.outputFileSpec, "output directory")

    # if output file already exists, remove as we'll be overwriting it anyway
    if os.path.isfile(self.outputFileSpec):
        # ...unless it IS the input file: removing it would destroy the source before processing
        if os.path.samefile(self.outputFileSpec, self.inputFileSpec):
            raise ValueError(f"Output file {self.outputFileSpec} must differ from the input file")
        if self.debug:
            mmguero.eprint(f'Removing existing destination file {self.outputFileSpec}')
        os.remove(self.outputFileSpec)

    # If save-transcript is enabled and no explicit JSON output path, auto-generate one
    if self.saveTranscript and not self.outputJson:
        outputBaseName = os.path.splitext(self.outputFileSpec)[0]
        self.outputJson = outputBaseName + '_transcript.json'
        if self.debug:
            mmguero.eprint(f'Auto-generated transcript output: {self.outputJson}')

    # Auto-detect existing transcript for reuse (unless force flag set or explicit input provided)
    if self.saveTranscript and not self.inputTranscript and self.outputJson and not forceRetranscribe:
        if os.path.exists(self.outputJson):
            self.inputTranscript = self.outputJson
            if self.debug:
                mmguero.eprint(f'Found existing transcript, reusing: {self.inputTranscript}')

    # If JSON output is specified, ensure its directory exists too
    if self.outputJson:
        self._ensure_directory_exists(self.outputJson, "JSON output directory")

    # load the swears file (not actually mapping right now, but who knows, speech synthesis maybe someday?)
    self.swearsFileSpec = iSwearsFileSpec if (iSwearsFileSpec is not None) and os.path.isfile(iSwearsFileSpec) else None

    self._load_swears_file()

    # validate instrumental file if provided
    if instrumentalFileSpec:
        if not os.path.isfile(instrumentalFileSpec):
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), instrumentalFileSpec)

        # Check duration of instrumental vs original
        # Need to get duration directly from ffprobe since GetCodecs doesn't extract it
        instrumentalDuration = self._get_file_duration(instrumentalFileSpec)
        originalDuration = self._get_file_duration(self.inputFileSpec)

        if instrumentalDuration > 0 and originalDuration > 0:
            if instrumentalDuration < originalDuration:
                raise ValueError(
                    f"Instrumental file duration ({instrumentalDuration}s) is shorter than "
                    f"original file duration ({originalDuration}s)"
                )
        elif self.debug:
            # a duration of 0 means the probe failed, so the comparison is skipped
            mmguero.eprint('Warning: Could not verify file durations')

        self.instrumentalFileSpec = instrumentalFileSpec
        self.instrumentalMode = True
    else:
        self.instrumentalMode = False

    if self.debug:
        mmguero.eprint(f'Input: {self.inputFileSpec}')
        mmguero.eprint(f'Input codec: {self.inputCodecs}')
        mmguero.eprint(f'Output: {self.outputFileSpec}')
        mmguero.eprint(f'Output audio format: {self.outputAudioFileFormat}')
        mmguero.eprint(f'Encode parameters: {self.aParams}')
        mmguero.eprint(f'Profanity file: {self.swearsFileSpec if self.swearsFileSpec else "built-in"}')
        mmguero.eprint(f'Intermediate downloaded file: {self.tmpDownloadedFileSpec}')
        if self.outputJson:
            mmguero.eprint(f'Transcript output: {self.outputJson}')
        if self.inputTranscript:
            mmguero.eprint(f'Input transcript: {self.inputTranscript}')
        mmguero.eprint(f'Beep instead of mute: {self.beep}')
        if self.beep:
            mmguero.eprint(f'Beep hertz: {self.beepHertz}')
            mmguero.eprint(f'Beep mix normalization: {self.beepMixNormalize}')
            mmguero.eprint(f'Beep audio weight: {self.beepAudioWeight}')
            mmguero.eprint(f'Beep sine weight: {self.beepSineWeight}')
            mmguero.eprint(f'Beep dropout transition: {self.beepDropTransition}')
        mmguero.eprint(f'Force despite tags: {self.forceDespiteTag}')
        mmguero.eprint(f'Instrumental mode: {self.instrumentalMode}')
        if self.instrumentalMode:
            mmguero.eprint(f'Instrumental file: {self.instrumentalFileSpec}')
474
+
475
+ ######## del ##################################################################
476
+ def __del__(self):
477
+ # if we downloaded the input file, remove it as well
478
+ if os.path.isfile(self.tmpDownloadedFileSpec):
479
+ os.remove(self.tmpDownloadedFileSpec)
480
+
481
+ # Clean up temporary separation files
482
+ if hasattr(self, 'separationCacheDir') and self.separationCacheDir:
483
+ import shutil
484
+ try:
485
+ if os.path.exists(self.separationCacheDir):
486
+ shutil.rmtree(self.separationCacheDir)
487
+ if self.debug:
488
+ mmguero.eprint(f'Cleaned up separation cache: {self.separationCacheDir}')
489
+ except Exception as e:
490
+ if self.debug:
491
+ mmguero.eprint(f'Warning: Failed to cleanup separation cache: {e}')
492
+
493
+ ######## _ensure_directory_exists #############################################
494
+ def _ensure_directory_exists(self, filepath, description="directory"):
495
+ """Ensure the directory for a file path exists, creating it if necessary"""
496
+ directory = os.path.dirname(filepath)
497
+ if directory and not os.path.exists(directory):
498
+ if self.debug:
499
+ mmguero.eprint(f'Creating {description}: {directory}')
500
+ os.makedirs(directory, exist_ok=True)
501
+ return directory
502
+
503
+ ######## _get_file_duration ###################################################
504
+ def _get_file_duration(self, filepath):
505
+ """Get the duration of an audio/video file using ffprobe"""
506
+ try:
507
+ ffprobeCmd = [
508
+ 'ffprobe',
509
+ '-v',
510
+ 'quiet',
511
+ '-print_format',
512
+ 'json',
513
+ '-show_entries',
514
+ 'format=duration',
515
+ filepath,
516
+ ]
517
+ ffprobeResult, ffprobeOutput = mmguero.run_process(ffprobeCmd, stdout=True, stderr=False, debug=False)
518
+ if ffprobeResult == 0:
519
+ ffprobeData = mmguero.load_str_if_json(' '.join(ffprobeOutput))
520
+ duration_str = mmguero.deep_get(ffprobeData, ['format', 'duration'], '0')
521
+ return float(duration_str)
522
+ else:
523
+ return 0.0
524
+ except Exception as e:
525
+ if self.debug:
526
+ mmguero.eprint(f'Error getting duration for {filepath}: {e}')
527
+ return 0.0
528
+
529
+ ######## LoadTranscriptFromFile ##############################################
530
+ def LoadTranscriptFromFile(self):
531
+ """Load pre-generated transcript from JSON file"""
532
+ if not self.inputTranscript:
533
+ return False
534
+
535
+ if not os.path.isfile(self.inputTranscript):
536
+ raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), self.inputTranscript)
537
+
538
+ if self.debug:
539
+ mmguero.eprint(f'Loading transcript from: {self.inputTranscript}')
540
+
541
+ with open(self.inputTranscript, 'r') as f:
542
+ self.wordList = json.load(f)
543
+
544
+ # Recalculate scrub flags with current swears list
545
+ for word in self.wordList:
546
+ word['scrub'] = scrubword(word.get('word', '')) in self.swearsMap
547
+
548
+ if self.debug:
549
+ mmguero.eprint(f'Loaded {len(self.wordList)} words from transcript')
550
+ scrubbed_count = sum(1 for w in self.wordList if w.get('scrub', False))
551
+ mmguero.eprint(f'Words to censor with current swear list: {scrubbed_count}')
552
+
553
+ return True
554
+
555
+ ######## _load_swears_file ####################################################
556
+ def _load_swears_file(self):
557
+ """Load swears from built-in list first, then from custom text or JSON file if provided"""
558
+ # Load built-in profanity list first
559
+ self._load_builtin_swears()
560
+
561
+ # Load custom swears file if provided
562
+ if self.swearsFileSpec:
563
+ # Try to detect and parse JSON first
564
+ is_json = False
565
+ if self.swearsFileSpec.lower().endswith('.json'):
566
+ is_json = True
567
+ else:
568
+ # Try to parse as JSON even without .json extension
569
+ try:
570
+ with open(self.swearsFileSpec, 'r') as f:
571
+ content = f.read()
572
+ json.loads(content)
573
+ is_json = True
574
+ except (json.JSONDecodeError, ValueError):
575
+ pass
576
+
577
+ if is_json:
578
+ self._load_swears_from_json()
579
+ else:
580
+ self._load_swears_from_text()
581
+
582
+ if self.debug:
583
+ mmguero.eprint(f'Loaded {len(self.swearsMap)} profanity entries (built-in + custom from {self.swearsFileSpec})')
584
+ else:
585
+ if self.debug:
586
+ mmguero.eprint(f'Loaded {len(self.swearsMap)} profanity entries from built-in list')
587
+
588
+ def _load_builtin_swears(self):
589
+ """Load built-in profanity list from package data"""
590
+ data = None
591
+ error_msgs = []
592
+
593
+ # Method 1: Try importlib.resources.files (Python 3.9+)
594
+ try:
595
+ import importlib.resources as resources
596
+ with resources.files('monkeyplug.data').joinpath('profanity_list.json').open('r') as f:
597
+ data = json.load(f)
598
+ if self.debug:
599
+ mmguero.eprint('Loaded profanity list using importlib.resources.files')
600
+ except Exception as e:
601
+ error_msgs.append(f"importlib.resources.files failed: {e}")
602
+
603
+ # Method 2: Fallback for older Python versions using pkg_resources
604
+ if data is None:
605
+ try:
606
+ import pkg_resources
607
+ resource_package = 'monkeyplug'
608
+ resource_path = '/'.join(('data', 'profanity_list.json'))
609
+ data = json.loads(pkg_resources.resource_string(resource_package, resource_path).decode('UTF-8'))
610
+ if self.debug:
611
+ mmguero.eprint('Loaded profanity list using pkg_resources')
612
+ except Exception as e:
613
+ error_msgs.append(f"pkg_resources failed: {e}")
614
+
615
+ # Method 3: Last resort - try to find the file relative to this module
616
+ if data is None:
617
+ try:
618
+ module_dir = os.path.dirname(os.path.abspath(__file__))
619
+ data_file = os.path.join(module_dir, 'data', 'profanity_list.json')
620
+ if os.path.exists(data_file):
621
+ with open(data_file, 'r') as f:
622
+ data = json.load(f)
623
+ if self.debug:
624
+ mmguero.eprint(f'Loaded profanity list from file path: {data_file}')
625
+ else:
626
+ error_msgs.append(f"File not found at {data_file}")
627
+ except Exception as e:
628
+ error_msgs.append(f"File path fallback failed: {e}")
629
+
630
+ # If all methods failed, warn but continue (custom swears file might be provided)
631
+ if data is None:
632
+ if self.debug:
633
+ mmguero.eprint('Could not load built-in profanity list:')
634
+ for msg in error_msgs:
635
+ mmguero.eprint(f' {msg}')
636
+ return
637
+
638
+ if isinstance(data, list):
639
+ for item in data:
640
+ if isinstance(item, str) and item.strip():
641
+ self.swearsMap[scrubword(item)] = "*****"
642
+ elif self.debug:
643
+ mmguero.eprint('Built-in profanity list has unexpected format')
644
+
645
+ def _load_swears_from_json(self):
646
+ """Load swears from JSON format - simple array of strings
647
+
648
+ Format: ["word1", "word2", "word3", ...]
649
+ Example: https://github.com/zautumnz/profane-words/blob/master/words.json
650
+ """
651
+ with open(self.swearsFileSpec, 'r') as f:
652
+ data = json.load(f)
653
+
654
+ if not isinstance(data, list):
655
+ raise ValueError(f"JSON swears file must contain an array of strings, got {type(data).__name__}")
656
+
657
+ for item in data:
658
+ if isinstance(item, str) and item.strip():
659
+ self.swearsMap[scrubword(item)] = "*****"
660
+
661
+ def _load_swears_from_text(self):
662
+ """Load swears from pipe-delimited text format (legacy)"""
663
+ lines = []
664
+ with open(self.swearsFileSpec) as f:
665
+ lines = [line.rstrip("\n") for line in f]
666
+ for line in lines:
667
+ lineMap = line.split("|")
668
+ self.swearsMap[scrubword(lineMap[0])] = lineMap[1] if len(lineMap) > 1 else "*****"
669
+
670
+ ######## CreateCleanMuteList #################################################
671
def CreateCleanMuteList(self):
    """Work out what has to be censored and prepare the matching filter data.

    Loads the transcript (or runs speech recognition), collects the words
    flagged with scrub=True into self.naughtyWordList, then dispatches to the
    auto-generated instrumental path, the provided-instrumental path or the
    mute/beep path.

    Returns the mute filter list from the mute/beep path, or an empty list
    when an instrumental path is used.
    """
    # Try to load existing transcript first, otherwise perform speech recognition
    if not self.LoadTranscriptFromFile():
        self.RecognizeSpeech()

    # only words whose 'scrub' flag is exactly True are censored
    self.naughtyWordList = [word for word in self.wordList if word["scrub"] is True]

    # Handle auto-generation mode
    # (autoGenerateMode is not set in __init__, hence the hasattr guard)
    if hasattr(self, 'autoGenerateMode') and self.autoGenerateMode and len(self.naughtyWordList) > 0:
        # Create merged profanity segments
        self._create_instrumental_splice_list()

        # Extract, separate, and get instrumental file
        if self.instrumentalSegments:
            try:
                self.instrumentalFileSpec = self._create_combined_profanity_file()
                if self.instrumentalFileSpec:
                    self.instrumentalMode = True
                    self._build_instrumental_filters()
                    return []  # Return empty list for muteTimeList
            except Exception as e:
                # Fallback to mute if generation fails
                if self.debug:
                    mmguero.eprint(f"Generation failed: {e}, falling back to mute mode")
                self.instrumentalMode = False
                return self._create_mute_beep_list()
        else:
            return []

    # Handle traditional instrumental file mode or mute/beep mode
    if self.instrumentalMode:
        return self._create_instrumental_splice_list()
    else:
        return self._create_mute_beep_list()
705
+
706
+ def _create_instrumental_splice_list(self):
707
+ """Create list of profanity segments for instrumental splicing"""
708
+ if len(self.naughtyWordList) == 0:
709
+ self.instrumentalSegments = []
710
+ return []
711
+
712
+ # Sort by start time
713
+ sorted_naughty = sorted(self.naughtyWordList, key=lambda x: x['start'])
714
+
715
+ # Merge consecutive profanity segments (gap < 100ms)
716
+ merged_segments = []
717
+ if sorted_naughty:
718
+ current_start = max(0, sorted_naughty[0]['start'] - self.padSecPre)
719
+ current_end = sorted_naughty[0]['end'] + self.padSecPost
720
+
721
+ for word in sorted_naughty[1:]:
722
+ word_start = max(0, word['start'] - self.padSecPre)
723
+ word_end = word['end'] + self.padSecPost
724
+
725
+ # If gap between segments is less than 100ms, merge them
726
+ if word_start - current_end < 0.1:
727
+ current_end = max(current_end, word_end)
728
+ else:
729
+ merged_segments.append((current_start, current_end))
730
+ current_start = word_start
731
+ current_end = word_end
732
+
733
+ # Add the last segment
734
+ merged_segments.append((current_start, current_end))
735
+
736
+ self.instrumentalSegments = merged_segments
737
+
738
+ if self.debug:
739
+ mmguero.eprint(f'Instrumental segments: {self.instrumentalSegments}')
740
+
741
+ # Return empty list for muteTimeList (not used in instrumental mode)
742
+ return []
743
+
744
def _create_mute_beep_list(self):
    """Create traditional mute or beep filter list

    Rebuilds self.muteTimeList (and, in beep mode, self.sineTimeList and
    self.beepDelayList) from self.naughtyWordList and returns self.muteTimeList.

    Note that a dummy trailing word is appended to self.naughtyWordList in
    place whenever the list is not empty.
    """
    if len(self.naughtyWordList) > 0:
        # append a dummy word at the very end so that pairwise can peek then ignore it
        self.naughtyWordList.extend(
            [
                {
                    "conf": 1,
                    "end": self.naughtyWordList[-1]["end"] + 2.0,
                    "start": self.naughtyWordList[-1]["end"] + 1.0,
                    "word": "mothaflippin",
                    "scrub": True,
                }
            ]
        )
    if self.debug:
        mmguero.eprint(self.naughtyWordList)

    self.muteTimeList = []
    self.sineTimeList = []
    self.beepDelayList = []
    for word, wordPeek in pairwise(self.naughtyWordList):
        # Clamp padded start times at 0, as _create_instrumental_splice_list does: padding
        # before a word near the start of the file would otherwise produce a negative
        # timestamp, and in beep mode a negative adelay value.
        wordStart = format(max(0.0, word["start"] - self.padSecPre), ".3f")
        wordEnd = format(word["end"] + self.padSecPost, ".3f")
        wordDuration = format(float(wordEnd) - float(wordStart), ".3f")
        wordPeekStart = format(max(0.0, wordPeek["start"] - self.padSecPre), ".3f")
        if self.beep:
            self.muteTimeList.append(f"volume=enable='between(t,{wordStart},{wordEnd})':volume=0")
            self.sineTimeList.append(f"sine=f={self.beepHertz}:duration={wordDuration}")
            # the same delay (in milliseconds) is given twice, separated by "|"
            self.beepDelayList.append(
                f"atrim=0:{wordDuration},adelay={'|'.join([str(int(float(wordStart) * 1000))] * 2)}"
            )
        else:
            # fade out at the start of the word, fade back in once it has ended
            self.muteTimeList.append(
                "afade=enable='between(t," + wordStart + "," + wordEnd + ")':t=out:st=" + wordStart + ":d=5ms"
            )
            self.muteTimeList.append(
                "afade=enable='between(t," + wordEnd + "," + wordPeekStart + ")':t=in:st=" + wordEnd + ":d=5ms"
            )

    if self.debug:
        mmguero.eprint(self.muteTimeList)
        if self.beep:
            mmguero.eprint(self.sineTimeList)
            mmguero.eprint(self.beepDelayList)

    return self.muteTimeList
791
+
792
def _build_instrumental_filters(self):
    """Build the FFmpeg arguments that splice instrumental audio over profanity.

    The output alternates between pieces of the original audio (FFmpeg input 0)
    and pieces of the instrumental audio (FFmpeg input 1), joined with the
    ``concat`` filter. Two layouts of input 1 are supported:

    - Traditional mode: input 1 shares the original's timeline, so each
      replacement is cut at the same timestamps as the profanity.
    - Auto-generate mode (``autoGenerateMode`` set and ``segMapping`` filled):
      input 1 is the combined file of padded excerpts, so ``segMapping``
      translates original timestamps into positions in that file.

    Returns:
        list: ``['-filter_complex', <graph>, '-map', '[outa]']``, or ``[]`` when
        there are no instrumental segments.
    """
    if not self.instrumentalSegments:
        return []

    duration = self._get_file_duration(self.inputFileSpec)
    auto_mode = bool(getattr(self, 'autoGenerateMode', False) and getattr(self, 'segMapping', None))

    filter_parts = []
    last_end = 0.0

    def add_segment(chain):
        # Segments are labelled in emission order; the list length is the next index.
        filter_parts.append(f"{chain}[seg{len(filter_parts)}]")

    for idx, (seg_start, seg_end) in enumerate(self.instrumentalSegments):
        # Untouched original audio between the previous replacement and this one
        if seg_start > last_end:
            add_segment(f"[0:a]atrim={last_end:.2f}:{seg_start:.2f},asetpts=PTS-STARTPTS")

        if not auto_mode:
            # Cut the replacement from the instrumental input itself ([1:a]).
            # Splitting [0:a] would only replay the original audio, and a
            # filtergraph link label may be consumed by a single filter input.
            add_segment(f"[1:a]atrim={seg_start:.2f}:{seg_end:.2f},asetpts=PTS-STARTPTS")
        elif idx < len(self.segMapping):
            profanity_start, profanity_end, combined_start, _, padded_start, _ = self.segMapping[idx]
            # combined_start is where the padded excerpt begins in the combined
            # file; (profanity_start - padded_start) is the profanity's offset
            # within that excerpt.
            position_in_combined = combined_start + (profanity_start - padded_start)
            profanity_duration = profanity_end - profanity_start
            add_segment(
                f"[1:a]atrim={position_in_combined:.2f}:{position_in_combined + profanity_duration:.2f}"
                ",asetpts=PTS-STARTPTS"
            )
        else:
            # No mapping entry (not expected): silence the span instead.
            # Timestamps are reset like every other segment so concat lines it up.
            add_segment(f"[0:a]atrim={seg_start:.2f}:{seg_end:.2f},asetpts=PTS-STARTPTS,volume=0")

        last_end = seg_end

    # Remainder of the original audio after the last replacement. Auto mode
    # skips it when the last segment already reaches the end of the file.
    if (not auto_mode) or (last_end < duration):
        add_segment(f"[0:a]atrim={last_end:.2f},asetpts=PTS-STARTPTS")

    seg_count = len(filter_parts)
    concat_input = ''.join(f'[seg{i}]' for i in range(seg_count))
    filter_parts.append(f"{concat_input}concat=n={seg_count}:v=0:a=1[outa]")

    filter_complex = ';'.join(filter_parts)

    if self.debug:
        if getattr(self, 'verbose_level', None) == "full":
            mmguero.eprint(f'Filter complex: {filter_complex}')
        else:
            # Concise mode: just show segment count
            mode = "auto-separation" if getattr(self, 'autoGenerateMode', False) else "traditional"
            mmguero.eprint(
                f'Building FFmpeg filter with {len(self.instrumentalSegments)} instrumental segment(s) ({mode} mode)'
            )

    return ['-filter_complex', filter_complex, '-map', '[outa]']
898
+
899
+ ######## EncodeCleanAudio ####################################################
900
def EncodeCleanAudio(self):
    """Produce the cleaned output file and return its path.

    If the input is already tagged as processed and ``forceDespiteTag`` is not
    set, the input is copied to the output unchanged. Otherwise the mute list
    is built, the matching FFmpeg audio arguments are assembled (instrumental
    splice, beep mix, fade-based mute, or none), FFmpeg is run and the result
    is tagged.

    Raises:
        ValueError: if FFmpeg returns non-zero or the output file is missing.
    """
    must_encode = (self.forceDespiteTag is True) or (
        GetMonkeyplugTagged(self.inputFileSpec, debug=self.debug) is False
    )
    if not must_encode:
        shutil.copyfile(self.inputFileSpec, self.outputFileSpec)
        return self.outputFileSpec

    self.CreateCleanMuteList()

    if self.instrumentalMode:
        # Splice instrumental audio over the profanity
        audioArgs = self._build_instrumental_filters()
    elif len(self.muteTimeList) == 0:
        # Nothing to scrub: plain re-encode
        audioArgs = []
    elif self.beep:
        beep_total = len(self.beepDelayList)
        mute_chain = ','.join(self.muteTimeList)
        sine_chain = ';'.join(f'{src}[beep{n}]' for n, src in enumerate(self.sineTimeList, start=1))
        delay_chain = ';'.join(
            f'[beep{n}]{flt}[beep{n}_delayed]' for n, flt in enumerate(self.beepDelayList, start=1)
        )
        mix_inputs = ''.join(f'[beep{n}_delayed]' for n in range(1, beep_total + 1))
        sine_weights = ' '.join([str(self.beepSineWeight)] * beep_total)
        # Muted original plus every delayed beep go into a single amix
        filterStr = (
            f"[0:a]{mute_chain}[mute];{sine_chain};{delay_chain};"
            f"[mute]{mix_inputs}amix=inputs={beep_total + 1}"
            f":normalize={str(self.beepMixNormalize).lower()}"
            f":dropout_transition={self.beepDropTransition}"
            f":weights={self.beepAudioWeight} {sine_weights}"
        )
        audioArgs = ['-filter_complex', filterStr]
    else:
        audioArgs = ['-af', ",".join(self.muteTimeList)]

    ffmpegCmd = [
        'ffmpeg',
        '-nostdin',
        '-hide_banner',
        '-nostats',
        '-loglevel',
        'error',
        '-y',
        '-i',
        self.inputFileSpec,
    ]

    # The instrumental file is the second FFmpeg input
    if self.instrumentalMode:
        ffmpegCmd.extend(['-i', self.instrumentalFileSpec])

    if self.outputVideoFileFormat:
        # Keep the video stream as-is; only the audio is re-encoded
        ffmpegCmd.extend(['-c:v', 'copy', '-sn', '-dn'])
    else:
        ffmpegCmd.extend(['-vn', '-sn', '-dn'])

    ffmpegCmd.extend(audioArgs)
    ffmpegCmd.extend(self.aParams)
    ffmpegCmd.append(self.outputFileSpec)

    ffmpegResult, ffmpegOutput = mmguero.run_process(ffmpegCmd, stdout=True, stderr=True, debug=self.debug)
    if (ffmpegResult != 0) or (not os.path.isfile(self.outputFileSpec)):
        mmguero.eprint(' '.join(mmguero.flatten(ffmpegCmd)))
        mmguero.eprint(ffmpegResult)
        mmguero.eprint(ffmpegOutput)
        raise ValueError(f"Could not process {self.inputFileSpec}")

    SetMonkeyplugTag(self.outputFileSpec, debug=self.debug)

    return self.outputFileSpec
988
+
989
+
990
+ #################################################################################
991
+
992
+
993
+ #################################################################################
994
class VoskPlugger(Plugger):
    """Plugger that transcribes with a local VOSK model.

    The input is first converted to an intermediate WAV file, which is then
    fed to a VOSK recognizer chunk by chunk. The model is only loaded when no
    input transcript was supplied.
    """

    tmpWavFileSpec = ""
    modelPath = ""
    wavReadFramesChunk = AUDIO_DEFAULT_WAV_FRAMES_CHUNK
    vosk = None

    def __init__(
        self,
        iFileSpec,
        oFileSpec,
        oAudioFileFormat,
        iSwearsFileSpec,
        mDir,
        outputJson,
        inputTranscript=None,
        saveTranscript=False,
        forceRetranscribe=False,
        aParams=None,
        aChannels=AUDIO_DEFAULT_CHANNELS,
        aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
        aBitRate=AUDIO_DEFAULT_BIT_RATE,
        aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
        wChunk=AUDIO_DEFAULT_WAV_FRAMES_CHUNK,
        padMsecPre=0,
        padMsecPost=0,
        beep=False,
        beepHertz=BEEP_HERTZ_DEFAULT,
        beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
        beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
        beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
        beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
        force=False,
        dbug=False,
    ):
        """Validate the model directory, load VOSK if needed, then initialise the base class.

        Raises:
            IOError: (ENOENT) if a transcription is required and ``mDir`` is not a directory.
            Exception: if the ``vosk`` module cannot be imported.
        """
        self.wavReadFramesChunk = wChunk
        self.modelPath = None
        self.vosk = None

        # The model is only needed when we will actually transcribe
        if not inputTranscript:
            if (mDir is None) or (not os.path.isdir(mDir)):
                raise IOError(
                    errno.ENOENT,
                    os.strerror(errno.ENOENT) + " (see https://alphacephei.com/vosk/models)",
                    mDir,
                )
            self.modelPath = mDir

            self.vosk = mmguero.dynamic_import("vosk", "vosk", debug=dbug)
            if not self.vosk:
                raise Exception("Unable to initialize VOSK API")
            if not dbug:
                self.vosk.SetLogLevel(-1)

        super().__init__(
            iFileSpec=iFileSpec,
            oFileSpec=oFileSpec,
            oAudioFileFormat=oAudioFileFormat,
            iSwearsFileSpec=iSwearsFileSpec,
            outputJson=outputJson,
            inputTranscript=inputTranscript,
            saveTranscript=saveTranscript,
            forceRetranscribe=forceRetranscribe,
            aParams=aParams,
            aChannels=aChannels,
            aSampleRate=aSampleRate,
            aBitRate=aBitRate,
            aVorbisQscale=aVorbisQscale,
            padMsecPre=padMsecPre,
            padMsecPost=padMsecPost,
            beep=beep,
            beepHertz=beepHertz,
            beepMixNormalize=beepMixNormalize,
            beepAudioWeight=beepAudioWeight,
            beepSineWeight=beepSineWeight,
            beepDropTransition=beepDropTransition,
            force=force,
            dbug=dbug,
        )

        # Intermediate WAV sits next to the input, same stem
        self.tmpWavFileSpec = self.inputFileParts[0] + ".wav"

        if self.debug:
            if inputTranscript:
                mmguero.eprint('Using input transcript (skipping speech recognition)')
            else:
                mmguero.eprint(f'Model directory: {self.modelPath}')
                mmguero.eprint(f'Intermediate audio file: {self.tmpWavFileSpec}')
                mmguero.eprint(f'Read frames: {self.wavReadFramesChunk}')

    def __del__(self):
        """Run the base-class cleanup, then delete the intermediate WAV file if it exists."""
        super().__del__()
        if os.path.isfile(self.tmpWavFileSpec):
            os.remove(self.tmpWavFileSpec)

    def CreateIntermediateWAV(self):
        """Convert the input's audio to the intermediate WAV file with FFmpeg.

        Returns:
            The input file path (``self.inputFileSpec``).

        Raises:
            ValueError: if FFmpeg fails or the WAV file was not created.
        """
        ffmpegCmd = [
            'ffmpeg',
            '-nostdin',
            '-hide_banner',
            '-nostats',
            '-loglevel',
            'error',
            '-y',
            '-i',
            self.inputFileSpec,
            '-vn',
            '-sn',
            '-dn',
            AUDIO_INTERMEDIATE_PARAMS,
            self.tmpWavFileSpec,
        ]
        ffmpegResult, ffmpegOutput = mmguero.run_process(ffmpegCmd, stdout=True, stderr=True, debug=self.debug)
        conversion_ok = (ffmpegResult == 0) and os.path.isfile(self.tmpWavFileSpec)
        if not conversion_ok:
            mmguero.eprint(' '.join(mmguero.flatten(ffmpegCmd)))
            mmguero.eprint(ffmpegResult)
            mmguero.eprint(ffmpegOutput)
            raise ValueError(
                f"Could not convert {self.inputFileSpec} to {self.tmpWavFileSpec} (16 kHz, mono, s16 PCM WAV)"
            )

        return self.inputFileSpec

    def RecognizeSpeech(self):
        """Transcribe the intermediate WAV with VOSK and fill ``self.wordList``.

        Each recognised word entry gets a ``scrub`` flag, true when its scrubbed
        form is a key of ``self.swearsMap``. The word list is also written to
        ``self.outputJson`` when that is set.

        Returns:
            list: ``self.wordList``.

        Raises:
            Exception: if the WAV file is not 16 kHz, mono, 16-bit, uncompressed.
        """
        self.CreateIntermediateWAV()
        self.wordList.clear()

        def collect(raw_json):
            # Append the words of one recognizer result, flagged for scrubbing
            parsed = json.loads(raw_json)
            if "result" in parsed:
                self.wordList.extend(
                    {**entry, 'scrub': scrubword(mmguero.deep_get(entry, ["word"])) in self.swearsMap}
                    for entry in parsed["result"]
                )

        with wave.open(self.tmpWavFileSpec, "rb") as wf:
            wav_format = (wf.getnchannels(), wf.getframerate(), wf.getsampwidth(), wf.getcomptype())
            if wav_format != (1, 16000, 2, "NONE"):
                raise Exception(f"Audio file ({self.tmpWavFileSpec}) must be 16 kHz, mono, s16 PCM WAV")

            rec = self.vosk.KaldiRecognizer(self.vosk.Model(self.modelPath), wf.getframerate())
            rec.SetWords(True)
            while data := wf.readframes(self.wavReadFramesChunk):
                if rec.AcceptWaveform(data):
                    collect(rec.Result())
            # Flush whatever the recognizer still holds
            collect(rec.FinalResult())

        if self.debug:
            if getattr(self, 'verbose_level', None) == "full":
                mmguero.eprint(json.dumps(self.wordList))
            else:
                # Concise mode: summary only
                profanity_count = sum(1 for entry in self.wordList if entry.get('scrub', False))
                mmguero.eprint(
                    f'Transcribed {len(self.wordList)} words, {profanity_count} profanity instances detected'
                )

        if self.outputJson:
            with open(self.outputJson, "w") as f:
                f.write(json.dumps(self.wordList))

        return self.wordList
1169
+
1170
+
1171
+ #################################################################################
1172
+
1173
+
1174
+ #################################################################################
1175
class WhisperPlugger(Plugger):
    """Plugger that transcribes with a locally loaded Whisper model.

    The ``whisper`` module (and ``torch``, when a thread count is given) is
    imported dynamically, and only when no input transcript was supplied.
    """

    debug = False
    model = None
    torch = None
    whisper = None
    transcript = None

    def __init__(
        self,
        iFileSpec,
        oFileSpec,
        oAudioFileFormat,
        iSwearsFileSpec,
        mDir,
        mName,
        torchThreads,
        outputJson,
        inputTranscript=None,
        saveTranscript=False,
        forceRetranscribe=False,
        aParams=None,
        aChannels=AUDIO_DEFAULT_CHANNELS,
        aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
        aBitRate=AUDIO_DEFAULT_BIT_RATE,
        aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
        padMsecPre=0,
        padMsecPost=0,
        beep=False,
        beepHertz=BEEP_HERTZ_DEFAULT,
        beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
        beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
        beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
        beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
        force=False,
        dbug=False,
    ):
        """Load the Whisper model if a transcription is needed, then initialise the base class.

        Raises:
            Exception: if the ``whisper`` module cannot be imported or the model fails to load.
        """
        self.whisper = None
        self.model = None
        self.torch = None

        needs_model = not inputTranscript
        if needs_model:
            # Optional cap on torch CPU threads
            if torchThreads > 0:
                self.torch = mmguero.dynamic_import("torch", "torch", debug=dbug)
                if self.torch:
                    self.torch.set_num_threads(torchThreads)

            self.whisper = mmguero.dynamic_import("whisper", "openai-whisper", debug=dbug)
            if not self.whisper:
                raise Exception("Unable to initialize Whisper API")

            self.model = self.whisper.load_model(mName, download_root=mDir)
            if not self.model:
                raise Exception(f"Unable to load Whisper model {mName} in {mDir}")

        super().__init__(
            iFileSpec=iFileSpec,
            oFileSpec=oFileSpec,
            oAudioFileFormat=oAudioFileFormat,
            iSwearsFileSpec=iSwearsFileSpec,
            outputJson=outputJson,
            inputTranscript=inputTranscript,
            saveTranscript=saveTranscript,
            forceRetranscribe=forceRetranscribe,
            aParams=aParams,
            aChannels=aChannels,
            aSampleRate=aSampleRate,
            aBitRate=aBitRate,
            aVorbisQscale=aVorbisQscale,
            padMsecPre=padMsecPre,
            padMsecPost=padMsecPost,
            beep=beep,
            beepHertz=beepHertz,
            beepMixNormalize=beepMixNormalize,
            beepAudioWeight=beepAudioWeight,
            beepSineWeight=beepSineWeight,
            beepDropTransition=beepDropTransition,
            force=force,
            dbug=dbug,
        )

        if self.debug:
            if inputTranscript:
                mmguero.eprint('Using input transcript (skipping speech recognition)')
            else:
                mmguero.eprint(f'Model directory: {mDir}')
                mmguero.eprint(f'Model name: {mName}')

    def __del__(self):
        """Delegate cleanup to the base class."""
        super().__del__()

    def RecognizeSpeech(self):
        """Transcribe the input file with Whisper and fill ``self.wordList``.

        Every word of every returned segment is stripped of surrounding
        whitespace and given a ``scrub`` flag, true when its scrubbed form is a
        key of ``self.swearsMap``. The word list is also written to
        ``self.outputJson`` when that is set.

        Returns:
            list: ``self.wordList``.
        """
        self.wordList.clear()

        self.transcript = self.model.transcribe(word_timestamps=True, audio=self.inputFileSpec)
        has_segments = bool(self.transcript) and ('segments' in self.transcript)
        for segment in (self.transcript['segments'] if has_segments else ()):
            if 'words' not in segment:
                continue
            for entry in segment['words']:
                # Entries are updated in place and shared with the transcript
                entry['word'] = entry['word'].strip()
                entry['scrub'] = scrubword(entry['word']) in self.swearsMap
                self.wordList.append(entry)

        if self.debug:
            if getattr(self, 'verbose_level', None) == "full":
                mmguero.eprint(json.dumps(self.wordList))
            else:
                # Concise mode: summary only
                profanity_count = sum(1 for entry in self.wordList if entry.get('scrub', False))
                mmguero.eprint(
                    f'Transcribed {len(self.wordList)} words, {profanity_count} profanity instances detected'
                )

        if self.outputJson:
            with open(self.outputJson, "w") as f:
                f.write(json.dumps(self.wordList))

        return self.wordList
1291
+
1292
+
1293
+ #################################################################################
1294
class GroqPlugger(Plugger):
    """Plugger that transcribes by uploading audio to the Groq transcription endpoint.

    Besides speech recognition it offers a vocal-detection probe
    (``DetectVocals``) and, in auto-generate mode, helpers that cut the
    profanity spans into one combined file and hand it to a source separator.
    """

    GROQ_API_ENDPOINT = "https://api.groq.com/openai/v1/audio/transcriptions"
    debug = False
    api_key = None
    groq_model = "whisper-large-v3"
    transcript = None
    # Length, in seconds, of the excerpt sent for vocal detection
    VOCAL_DETECTION_SAMPLE_DURATION = 10
    # Words treated as "no real vocals" when they are all that is detected
    # (includes common hallucinations/artifacts and the empty string)
    VOCAL_DETECTION_FILLER_WORDS = {
        'thank', 'thanks', 'please', 'you', 'hey', 'yeah', 'oh', 'wow',
        '¶', '¶¶',
        '',
    }

    def __init__(
        self,
        iFileSpec,
        oFileSpec,
        oAudioFileFormat,
        iSwearsFileSpec,
        groq_api_key,
        groq_model,
        outputJson,
        inputTranscript=None,
        saveTranscript=False,
        forceRetranscribe=False,
        aParams=None,
        aChannels=AUDIO_DEFAULT_CHANNELS,
        aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
        aBitRate=AUDIO_DEFAULT_BIT_RATE,
        aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
        padMsecPre=0,
        padMsecPost=0,
        beep=False,
        beepHertz=BEEP_HERTZ_DEFAULT,
        beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
        beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
        beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
        beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
        force=False,
        dbug=False,
        instrumentalFileSpec=None,
        verbose_level="",
        auto_generate=False,
        separation_padding=1.0,
    ):
        """Resolve the API key, initialise the base class and set up auto-separation.

        Raises:
            ValueError: if no Groq API key could be resolved.
        """
        # groq_config may be importable relatively (package) or absolutely
        try:
            from .groq_config import load_groq_api_key
        except ImportError:
            from monkeyplug.groq_config import load_groq_api_key

        self.api_key = load_groq_api_key(groq_api_key, debug=dbug)
        if not self.api_key:
            raise ValueError(
                "Groq API key not found. Please provide it via --groq-api-key parameter, "
                "GROQ_API_KEY environment variable, ~/.groq/config.json file, or ./.groq_key file"
            )

        self.groq_model = groq_model
        self.debug = dbug
        self.verbose_level = verbose_level

        super().__init__(
            iFileSpec=iFileSpec,
            oFileSpec=oFileSpec,
            oAudioFileFormat=oAudioFileFormat,
            iSwearsFileSpec=iSwearsFileSpec,
            outputJson=outputJson,
            inputTranscript=inputTranscript,
            saveTranscript=saveTranscript,
            forceRetranscribe=forceRetranscribe,
            aParams=aParams,
            aChannels=aChannels,
            aSampleRate=aSampleRate,
            aBitRate=aBitRate,
            aVorbisQscale=aVorbisQscale,
            padMsecPre=padMsecPre,
            padMsecPost=padMsecPost,
            beep=beep,
            beepHertz=beepHertz,
            beepMixNormalize=beepMixNormalize,
            beepAudioWeight=beepAudioWeight,
            beepSineWeight=beepSineWeight,
            beepDropTransition=beepDropTransition,
            force=force,
            dbug=dbug,
            instrumentalFileSpec=instrumentalFileSpec,
        )

        # Auto-separation state
        self.autoGenerateMode = auto_generate
        self.separationPadding = separation_padding
        self.separationCacheDir = None
        self.segMapping = []  # original-to-combined-file timestamp mapping
        self.separator = None

        if self.autoGenerateMode:
            try:
                from .separation import SourceSeparator
            except ImportError:
                from monkeyplug.separation import SourceSeparator

            import tempfile

            self.separator = SourceSeparator(debug=self.debug)
            self.separationCacheDir = tempfile.mkdtemp(prefix="monkeyplug_separation_")
            if self.debug:
                mmguero.eprint(f'Auto-separation mode enabled (padding: {self.separationPadding}s)')
                mmguero.eprint(f'Cache directory: {self.separationCacheDir}')

        if self.debug:
            if inputTranscript:
                mmguero.eprint('Using input transcript (skipping speech recognition)')
            else:
                mmguero.eprint(f'Using Groq API with model: {self.groq_model}')

    def RecognizeSpeech(self):
        """Upload the input file to the Groq API and fill ``self.wordList``.

        Up to three attempts are made. HTTP 429, timeouts and other
        ``requests`` errors are retried with a doubling delay; HTTP 401 and
        exhausted retries raise. Each returned word is stripped and given a
        ``scrub`` flag, true when its scrubbed form is a key of
        ``self.swearsMap``.

        Returns:
            list: ``self.wordList``.

        Raises:
            Exception: on rate-limit exhaustion, an invalid API key, a final
                timeout, or a final request failure.
        """
        import requests
        import time

        self.wordList.clear()

        auth_headers = {"Authorization": f"Bearer {self.api_key}"}
        form_fields = {
            "model": self.groq_model,
            "response_format": "verbose_json",
            "timestamp_granularities[]": "word",
        }

        max_retries = 3
        retry_delay = 1  # seconds; doubled after every retried failure

        for attempt in range(max_retries):
            attempts_left = attempt < max_retries - 1
            try:
                # The file is reopened for every attempt so the upload starts at byte 0
                with open(self.inputFileSpec, 'rb') as audio_handle:
                    upload = {"file": (os.path.basename(self.inputFileSpec), audio_handle, "audio/mpeg")}

                    if self.debug:
                        mmguero.eprint(f"Sending request to Groq API (attempt {attempt + 1}/{max_retries})...")

                    response = requests.post(
                        self.GROQ_API_ENDPOINT,
                        headers=auth_headers,
                        files=upload,
                        data=form_fields,
                        timeout=120,  # 2 minute timeout
                    )

                    # Rate limiting
                    if response.status_code == 429:
                        if not attempts_left:
                            raise Exception("Rate limit exceeded. Please try again later.")
                        if self.debug:
                            mmguero.eprint(f"Rate limit hit, retrying in {retry_delay} seconds...")
                        time.sleep(retry_delay)
                        retry_delay *= 2
                        continue

                    # Authentication failure is not retried
                    if response.status_code == 401:
                        raise Exception("Invalid Groq API key. Please check your API key configuration.")

                    response.raise_for_status()

                    self.transcript = response.json()

                    if self.transcript and 'words' in self.transcript:
                        for entry in self.transcript['words']:
                            entry['word'] = entry['word'].strip()
                            entry['scrub'] = scrubword(entry['word']) in self.swearsMap
                            self.wordList.append(entry)

                    if self.debug:
                        if getattr(self, 'verbose_level', None) == "full":
                            mmguero.eprint(json.dumps(self.wordList))
                        else:
                            # Concise mode: summary only
                            profanity_count = sum(1 for entry in self.wordList if entry.get('scrub', False))
                            mmguero.eprint(
                                f'Transcribed {len(self.wordList)} words, {profanity_count} profanity instances detected'
                            )

                    if self.outputJson:
                        with open(self.outputJson, "w") as f:
                            f.write(json.dumps(self.wordList))

                    return self.wordList

            except requests.exceptions.Timeout:
                if not attempts_left:
                    raise Exception("Request timed out. Please check your internet connection and try again.")
                if self.debug:
                    mmguero.eprint(f"Request timed out, retrying (attempt {attempt + 1}/{max_retries})...")
                time.sleep(retry_delay)
                retry_delay *= 2

            except requests.exceptions.RequestException as e:
                if not attempts_left:
                    raise Exception(f"Failed to connect to Groq API: {e}")
                if self.debug:
                    mmguero.eprint(f"Request failed: {e}, retrying (attempt {attempt + 1}/{max_retries})...")
                time.sleep(retry_delay)
                retry_delay *= 2

        raise Exception("Failed to complete speech recognition after maximum retries")

    def DetectVocals(self, filepath):
        """Detect whether a file has vocals by transcribing a short sample from its middle.

        Any failure along the way (sample extraction, non-200 response,
        exception) is treated as "has vocals".

        Args:
            filepath: Path to the audio file to check

        Returns:
            bool: True if vocals are assumed or detected, False if no words or
            only filler words were returned for the sample
        """
        import requests
        import tempfile

        sample_len = self.VOCAL_DETECTION_SAMPLE_DURATION
        duration = self._get_file_duration(filepath)
        if duration < sample_len:
            # Shorter than one sample: assume vocal
            return True

        # Centre the sample in the file
        start_time = (duration - sample_len) / 2

        # Reserve a temp file name; ffmpeg overwrites it below
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
            tmp_path = tmp.name

        # Punctuation and symbols trimmed from both ends of each word before comparison
        strip_chars = '.,!?;:"\'()[]{}©®™¶§†‡•—–'

        try:
            ffmpegCmd = [
                'ffmpeg', '-nostdin', '-hide_banner', '-nostats', '-loglevel', 'error',
                '-i', filepath,
                '-ss', str(start_time),
                '-t', str(sample_len),
                '-acodec', 'libmp3lame', '-b:a', '128K',
                '-y', tmp_path,
            ]
            exit_code, _ = mmguero.run_process(ffmpegCmd, stdout=False, stderr=False, debug=False)

            if exit_code != 0:
                if self.debug:
                    mmguero.eprint(
                        f'Warning: Failed to extract sample from {os.path.basename(filepath)}, assuming vocals'
                    )
                return True

            with open(tmp_path, 'rb') as sample_handle:
                response = requests.post(
                    self.GROQ_API_ENDPOINT,
                    headers={"Authorization": f"Bearer {self.api_key}"},
                    files={"file": (os.path.basename(filepath), sample_handle, "audio/mpeg")},
                    data={
                        "model": self.groq_model,
                        "response_format": "verbose_json",
                        "timestamp_granularities[]": "word",
                    },
                    timeout=30,
                )

                if response.status_code != 200:
                    # On error, assume vocal
                    if self.debug:
                        mmguero.eprint('Warning: API error during vocal detection, assuming vocals')
                    return True

                words = response.json().get('words', [])

                if len(words) == 0:
                    if self.debug:
                        mmguero.eprint('Vocal detection: 0 words detected → instrumental')
                    return False

                detected_words = {entry['word'].lower().strip(strip_chars) for entry in words}
                all_words_text = ', '.join(entry['word'] for entry in words)

                # Nothing but filler words: treat as silence/instrumental
                if detected_words.issubset(self.VOCAL_DETECTION_FILLER_WORDS):
                    if self.debug:
                        mmguero.eprint(
                            f'Vocal detection: Only filler words detected ({all_words_text}) → instrumental (silence)'
                        )
                    return False

                if self.debug:
                    mmguero.eprint(f'Vocal detection: {len(words)} words detected → vocals')
                    mmguero.eprint(f' Words: {all_words_text}')

                return True

        except Exception as e:
            # On any error, assume vocal
            if self.debug:
                mmguero.eprint(f'Warning: Exception during vocal detection: {e}, assuming vocals')
            return True

        finally:
            # Always remove the temporary sample
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def _extract_combined_segments(self, output_file):
        """Cut every profanity span, padded, from the input and concatenate them into one file.

        One entry per span is appended to ``self.segMapping`` as
        ``(start, end, combined_start, combined_end, padded_start, padded_end)``,
        where the ``combined_*`` values are positions in the concatenated file.

        Returns:
            float: Duration of the combined file as reported by
            ``_get_file_duration``, or ``0.0`` when there are no segments

        Raises:
            IOError: if FFmpeg exits non-zero.
        """
        if not self.instrumentalSegments:
            return 0.0

        duration = self._get_file_duration(self.inputFileSpec)

        filter_parts = []
        combined_time = 0.0  # running position in the combined file

        for seg_index, (start, end) in enumerate(self.instrumentalSegments):
            # Pad on both sides, clamped to the file bounds
            padded_start = max(0, start - self.separationPadding)
            padded_end = min(duration, end + self.separationPadding)
            segment_duration = padded_end - padded_start

            filter_parts.append(
                f"[0:a]atrim={padded_start:.2f}:{padded_end:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]"
            )
            self.segMapping.append(
                (start, end, combined_time, combined_time + segment_duration, padded_start, padded_end)
            )
            combined_time += segment_duration

        seg_count = len(filter_parts)
        concat_input = ''.join(f'[seg{i}]' for i in range(seg_count))
        filter_parts.append(f"{concat_input}concat=n={seg_count}:v=0:a=1[outa]")
        filter_complex = ';'.join(filter_parts)

        ffmpegCmd = [
            'ffmpeg', '-nostdin', '-hide_banner', '-nostats', '-loglevel', 'error',
            '-y',
            '-i', self.inputFileSpec,
            '-filter_complex', filter_complex,
            '-map', '[outa]',
            '-acodec', 'pcm_s16le',  # WAV for sherpa-onnx
            '-ar', '44100',
            '-ac', '2',
            output_file,
        ]
        exit_code, _ = mmguero.run_process(ffmpegCmd, stdout=False, stderr=False, debug=self.debug)
        if exit_code != 0:
            raise IOError("Failed to extract combined profanity segments")

        return self._get_file_duration(output_file)

    def _create_combined_profanity_file(self):
        """Build the combined profanity file and separate it into an instrumental track.

        ``self.segMapping`` is reset and refilled while the combined file is
        extracted into ``self.separationCacheDir``.

        Returns:
            str: Path to the instrumental file returned by the separator, or
            ``None`` when there are no segments or the combined duration is falsy
        """
        if not self.instrumentalSegments:
            return None

        combined_file = os.path.join(self.separationCacheDir, "combined_profanity.wav")
        self.segMapping = []  # start from an empty mapping

        combined_duration = self._extract_combined_segments(combined_file)
        if not combined_duration:
            return None

        if self.debug:
            mmguero.eprint(
                f'Extracted {len(self.instrumentalSegments)} profanity segment(s) into combined file ({combined_duration:.2f}s)'
            )

        # Only the instrumental half of the separation result is used
        instrumental_path, _ = self.separator.separate_audio_file(combined_file, self.separationCacheDir)
        return instrumental_path
1747
+
1748
+
1749
+ #################################################################################
1750
+
1751
+
1752
+ ###################################################################################################
1753
+ # Wildcard and batch processing helpers
1754
def apply_output_pattern(input_file, output_pattern):
    """Generate output filename from pattern.

    Every '*' in the pattern is replaced by the input file's name without
    its extension. When the pattern itself carries no extension, the input
    file's extension is appended. The result is placed in the input file's
    directory when the input path has one.

    Args:
        input_file: Path to input file
        output_pattern: Output pattern (e.g., '*_clean.mp3')

    Returns:
        str: Generated output filepath
    """
    input_dir, input_basename = os.path.split(input_file)
    input_name, input_ext = os.path.splitext(input_basename)

    # Replace * with input name
    output_name = output_pattern.replace('*', input_name)

    # Add extension if not present in pattern. The check is made on the
    # pattern, not on the substituted name: an input stem containing a dot
    # (e.g. 'my.song') would otherwise look like it already had an extension
    # and the output would silently lose its audio extension.
    if not os.path.splitext(output_pattern)[1]:
        output_name += input_ext

    return os.path.join(input_dir, output_name) if input_dir else output_name
1778
+
1779
+
1780
def expand_and_detect_vocals(input_pattern, output_pattern, args, skip_detection=False):
    """Expand wildcards and detect which files have vocals.

    Files whose basename already matches ``output_pattern`` are treated as
    results of an earlier run and are left out of the batch. When the
    pattern carries no extension of its own (e.g. '*_clean'), the basename
    without its extension is compared as well, because output names
    generated for such a pattern get the input file's extension appended.

    Args:
        input_pattern: Input file pattern (e.g., '*.mp3')
        output_pattern: Output file pattern (e.g., '*_clean.mp3')
        args: Parsed command-line arguments
        skip_detection: If True, assume all files have vocals (used with --instrumental generate)

    Returns:
        tuple: (vocal_files, instrumental_files, output_files), where
        output_files is parallel to vocal_files. All three are empty when
        every expanded file was filtered out as an existing output.

    Raises:
        IOError: No file matches ``input_pattern``.
        ValueError: Vocal detection is needed but no Groq API key is available.
    """
    import glob
    import re

    # Expand input wildcard
    input_files = glob.glob(input_pattern)

    if not input_files:
        raise IOError(f"No files found matching pattern: {input_pattern}")

    # If only one file and no wildcard, return it directly
    if len(input_files) == 1 and '*' not in input_pattern:
        output_file = apply_output_pattern(input_files[0], output_pattern)
        return [input_files[0]], [], [output_file]

    # Filter out files that match the output pattern (already processed).
    # The pattern is turned into an anchored regex in which only '*' is a
    # wildcard; every other character is matched literally.
    output_regex = re.compile(
        '^' + re.escape(output_pattern).replace(r'\*', '.*') + '$',
        re.IGNORECASE,
    )
    # An extension-less pattern yields outputs named "<pattern><input ext>",
    # so those can only be recognised by looking at the stem.
    pattern_has_extension = bool(os.path.splitext(output_pattern)[1])

    def is_existing_output(basename):
        """Return True when basename looks like a file produced by output_pattern."""
        if output_regex.match(basename):
            return True
        if pattern_has_extension:
            return False
        return bool(output_regex.match(os.path.splitext(basename)[0]))

    filtered_files = []
    skipped_output_files = []

    for filepath in input_files:
        basename = os.path.basename(filepath)
        if is_existing_output(basename):
            skipped_output_files.append(filepath)
            if args.debug:
                mmguero.eprint(f'Skipping output file: {basename} (matches output pattern)')
        else:
            filtered_files.append(filepath)

    input_files = filtered_files

    if not input_files:
        mmguero.eprint('No files to process after filtering out already-processed output files.')
        return [], [], []

    if args.debug:
        mmguero.eprint(f'Expanded wildcard to {len(input_files)} file(s) (skipped {len(skipped_output_files)} output files)')

    if skip_detection:
        if args.debug:
            mmguero.eprint('Skipping vocal detection (generate mode — assuming all files have vocals)')
        output_files = [apply_output_pattern(f, output_pattern) for f in input_files]
        return input_files, [], output_files

    # Create a GroqPlugger instance just for detection
    # We need to use dummy values for most parameters since we're only detecting vocals
    try:
        from .groq_config import load_groq_api_key
    except ImportError:
        from monkeyplug.groq_config import load_groq_api_key

    api_key = load_groq_api_key(args.groqApiKey, debug=args.debug)
    if not api_key:
        raise ValueError("Groq API key required for wildcard vocal detection")

    # Create minimal GroqPlugger for detection
    detector = GroqPlugger(
        iFileSpec=input_files[0],  # Dummy, will be overridden
        oFileSpec="dummy.mp3",
        oAudioFileFormat="MATCH",
        iSwearsFileSpec=args.swears,
        groq_api_key=api_key,
        groq_model=args.groqModel,
        outputJson=None,
        dbug=args.debug,
        verbose_level=getattr(args, 'verbose_level', ""),
    )

    vocal_files = []
    instrumental_files = []
    output_files = []

    # Detect vocals in each file
    for filepath in input_files:
        basename = os.path.basename(filepath)

        if args.debug:
            mmguero.eprint(f'Detecting vocals in: {basename}')

        if detector.DetectVocals(filepath):
            vocal_files.append(filepath)
            output_files.append(apply_output_pattern(filepath, output_pattern))
            if args.debug:
                mmguero.eprint(' ✓ Vocals detected → will process')
        else:
            instrumental_files.append(filepath)
            if args.debug:
                mmguero.eprint(' ✗ No vocals → skipping (likely instrumental)')

    if args.debug:
        mmguero.eprint(f'\nVocal detection complete: {len(vocal_files)} vocal, {len(instrumental_files)} instrumental, {len(skipped_output_files)} already processed')

    return vocal_files, instrumental_files, output_files
1898
+
1899
+
1900
+ ###################################################################################################
1901
+ # Config file loading
1902
# Per-user cache directory; the user-level config file lives inside it.
MONKEYPLUG_CACHE_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'monkeyplug')
MONKEYPLUG_CONFIG_PATH = os.path.join(MONKEYPLUG_CACHE_DIR, 'config.json')

# Settings written to a freshly created config file and returned by
# load_config_settings() when no config file could be loaded.
DEFAULT_CONFIG = dict(
    pad_milliseconds=10,
    pad_milliseconds_pre=10,
    pad_milliseconds_post=10,
    separation_padding=1.0,
    beep_hertz=BEEP_HERTZ_DEFAULT,
)
1912
+
1913
+
1914
def load_config_settings(debug=False):
    """
    Load settings from JSON config file.

    Config file search order (first usable file wins):
    1. ./.monkeyplug.json (current directory, project-specific)
    2. ~/.cache/monkeyplug/config.json (user-specific, alongside models)

    A candidate is skipped when it cannot be read or decoded, is not valid
    JSON, or does not hold a JSON object at the top level.

    If nothing exists at ~/.cache/monkeyplug/config.json and no config was
    loaded, a default one is created there so the user can find and edit it.
    An existing file at that path is never overwritten, even when it could
    not be parsed, so a hand-edited config with a typo is not lost.

    Args:
        debug: If True, report which file was loaded and any load/create
            problems on stderr.

    Returns:
        dict: Settings from the first usable config file, or a copy of
        DEFAULT_CONFIG when none could be loaded.
    """
    config_paths = [
        os.path.join(os.getcwd(), '.monkeyplug.json'),
        MONKEYPLUG_CONFIG_PATH,
    ]

    for config_path in config_paths:
        if not os.path.isfile(config_path):
            continue

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
            if debug:
                mmguero.eprint(f"Warning: Failed to load config from {config_path}: {e}")
            continue

        # Callers read settings with config.get(...), so anything other than
        # a JSON object is unusable.
        if not isinstance(config, dict):
            if debug:
                mmguero.eprint(
                    f"Warning: Failed to load config from {config_path}: top-level JSON value is not an object"
                )
            continue

        if debug:
            mmguero.eprint(f"Loaded config from: {config_path}")

        return config

    # No usable config found. Create a default one so the user can edit it,
    # but only when nothing is there yet.
    if not os.path.exists(MONKEYPLUG_CONFIG_PATH):
        try:
            os.makedirs(MONKEYPLUG_CACHE_DIR, exist_ok=True)
            with open(MONKEYPLUG_CONFIG_PATH, 'w', encoding='utf-8') as f:
                json.dump(DEFAULT_CONFIG, f, indent=2)
                f.write('\n')
            if debug:
                mmguero.eprint(f"Created default config at: {MONKEYPLUG_CONFIG_PATH}")
        except OSError as e:
            if debug:
                mmguero.eprint(f"Warning: Could not create default config: {e}")

    return dict(DEFAULT_CONFIG)
1961
+
1962
+
1963
+ ###################################################################################################
1964
+ # RunMonkeyPlug
1965
+ def RunMonkeyPlug():
1966
+
1967
+ package_name = __package__ or "monkeyplug"
1968
+ try:
1969
+ metadata = importlib.metadata.metadata(package_name)
1970
+ version = metadata.get("Version", "unknown")
1971
+ except importlib.metadata.PackageNotFoundError:
1972
+ version = "source"
1973
+
1974
+ # Load config file for default values (can be overridden by CLI args)
1975
+ config = load_config_settings(debug=False)
1976
+
1977
+ parser = argparse.ArgumentParser(
1978
+ description=f"{package_name} (v{version})",
1979
+ add_help=True,
1980
+ usage=f"{package_name} <arguments>",
1981
+ )
1982
+ parser.add_argument(
1983
+ "-v",
1984
+ "--verbose",
1985
+ dest="verbose",
1986
+ type=str,
1987
+ nargs="?",
1988
+ const="concise",
1989
+ default="",
1990
+ metavar="[concise|full]",
1991
+ help="Verbose output level: -v for concise, -v full for detailed debug output",
1992
+ )
1993
+ parser.add_argument(
1994
+ "-m",
1995
+ "--mode",
1996
+ dest="speechRecMode",
1997
+ metavar="<string>",
1998
+ type=str,
1999
+ default=DEFAULT_SPEECH_REC_MODE,
2000
+ help=f"Speech recognition engine ({SPEECH_REC_MODE_GROQ}|{SPEECH_REC_MODE_WHISPER}|{SPEECH_REC_MODE_VOSK}) (default: {DEFAULT_SPEECH_REC_MODE})",
2001
+ )
2002
+ parser.add_argument(
2003
+ "-i",
2004
+ "--input",
2005
+ dest="input",
2006
+ type=str,
2007
+ default=None,
2008
+ required=True,
2009
+ metavar="<string>",
2010
+ help="Input file (or URL)",
2011
+ )
2012
+ parser.add_argument(
2013
+ "-o",
2014
+ "--output",
2015
+ dest="output",
2016
+ type=str,
2017
+ default=None,
2018
+ required=False,
2019
+ metavar="<string>",
2020
+ help="Output file",
2021
+ )
2022
+ parser.add_argument(
2023
+ "--output-json",
2024
+ dest="outputJson",
2025
+ type=str,
2026
+ default=None,
2027
+ required=False,
2028
+ metavar="<string>",
2029
+ help="Output file to store transcript JSON",
2030
+ )
2031
+ parser.add_argument(
2032
+ "-w",
2033
+ "--swears",
2034
+ help=f"text file containing profanity (default: \"{SWEARS_FILENAME_DEFAULT}\")",
2035
+ default=os.path.join(script_path, SWEARS_FILENAME_DEFAULT),
2036
+ metavar="<profanity file>",
2037
+ )
2038
+ parser.add_argument(
2039
+ "--input-transcript",
2040
+ dest="inputTranscript",
2041
+ type=str,
2042
+ default=None,
2043
+ required=False,
2044
+ metavar="<string>",
2045
+ help="Load existing transcript JSON instead of performing speech recognition",
2046
+ )
2047
+ parser.add_argument(
2048
+ "--save-transcript",
2049
+ dest="saveTranscript",
2050
+ action="store_true",
2051
+ default=False,
2052
+ help="Automatically save transcript JSON alongside output audio file",
2053
+ )
2054
+ parser.add_argument(
2055
+ "--force-retranscribe",
2056
+ dest="forceRetranscribe",
2057
+ action="store_true",
2058
+ default=False,
2059
+ help="Force new transcription even if transcript file exists (overrides automatic reuse)",
2060
+ )
2061
+ parser.add_argument(
2062
+ "--instrumental",
2063
+ dest="instrumentalFile",
2064
+ type=str,
2065
+ default=None,
2066
+ required=False,
2067
+ metavar="<mode|file>",
2068
+ help="Instrumental mode: 'auto' (default, try prefix search then generate), 'generate' (AI generation), 'prefix' (search with --instrumental-prefix), or file path",
2069
+ )
2070
+ parser.add_argument(
2071
+ "--instrumental-prefix",
2072
+ dest="instrumentalPrefix",
2073
+ type=str,
2074
+ default="AUTO",
2075
+ required=False,
2076
+ metavar="<string>",
2077
+ help="Prefix/suffix to search for instrumental file, or 'AUTO' for fuzzy matching (default)",
2078
+ )
2079
+ parser.add_argument(
2080
+ "--instrumental-auto-candidates",
2081
+ dest="instrumentalAutoCandidates",
2082
+ type=int,
2083
+ default=5,
2084
+ required=False,
2085
+ metavar="<int>",
2086
+ help="Number of top candidates to validate in AUTO mode (default: 5)",
2087
+ )
2088
+ parser.add_argument(
2089
+ "--separation-padding",
2090
+ dest="separationPadding",
2091
+ type=float,
2092
+ default=config.get("separation_padding", 1.0),
2093
+ metavar="<seconds>",
2094
+ help=f"Context padding for AI generation (default: {config.get('separation_padding', 1.0)} seconds)",
2095
+ )
2096
+ parser.add_argument(
2097
+ "--filter-instrumentals",
2098
+ dest="filterInstrumentals",
2099
+ action="store_true",
2100
+ default=False,
2101
+ help="In wildcard mode with --instrumental generate, filter out files detected as instrumentals (default: process all files)",
2102
+ )
2103
+ parser.add_argument(
2104
+ "--mute",
2105
+ dest="mute",
2106
+ action="store_true",
2107
+ default=False,
2108
+ help="Force mute mode (disable instrumental processing)",
2109
+ )
2110
+ parser.add_argument(
2111
+ "-a",
2112
+ "--audio-params",
2113
+ help="Audio parameters for ffmpeg (default depends on output audio codec)",
2114
+ dest="aParams",
2115
+ metavar="<str>",
2116
+ default=None,
2117
+ )
2118
+ parser.add_argument(
2119
+ "-c",
2120
+ "--channels",
2121
+ dest="aChannels",
2122
+ metavar="<int>",
2123
+ type=int,
2124
+ default=AUDIO_DEFAULT_CHANNELS,
2125
+ help=f"Audio output channels (default: {AUDIO_DEFAULT_CHANNELS})",
2126
+ )
2127
+ parser.add_argument(
2128
+ "-s",
2129
+ "--sample-rate",
2130
+ dest="aSampleRate",
2131
+ metavar="<int>",
2132
+ type=int,
2133
+ default=AUDIO_DEFAULT_SAMPLE_RATE,
2134
+ help=f"Audio output sample rate (default: {AUDIO_DEFAULT_SAMPLE_RATE})",
2135
+ )
2136
+ parser.add_argument(
2137
+ "-r",
2138
+ "--bitrate",
2139
+ dest="aBitRate",
2140
+ metavar="<str>",
2141
+ default=AUDIO_DEFAULT_BIT_RATE,
2142
+ help=f"Audio output bitrate (default: {AUDIO_DEFAULT_BIT_RATE})",
2143
+ )
2144
+ parser.add_argument(
2145
+ "-q",
2146
+ "--vorbis-qscale",
2147
+ dest="aVorbisQscale",
2148
+ metavar="<int>",
2149
+ type=int,
2150
+ default=AUDIO_DEFAULT_VORBIS_QSCALE,
2151
+ help=f"qscale for libvorbis output (default: {AUDIO_DEFAULT_VORBIS_QSCALE})",
2152
+ )
2153
+ parser.add_argument(
2154
+ "-f",
2155
+ "--format",
2156
+ dest="outputFormat",
2157
+ type=str,
2158
+ default=AUDIO_MATCH_FORMAT,
2159
+ required=False,
2160
+ metavar="<string>",
2161
+ help=f"Output file format (default: inferred from extension of --output, or \"{AUDIO_MATCH_FORMAT}\")",
2162
+ )
2163
+ parser.add_argument(
2164
+ "--pad-milliseconds",
2165
+ dest="padMsec",
2166
+ metavar="<int>",
2167
+ type=int,
2168
+ default=config.get("pad_milliseconds", 10),
2169
+ help=f"Milliseconds to pad on either side of muted segments (default: {config.get('pad_milliseconds', 10)})",
2170
+ )
2171
+ parser.add_argument(
2172
+ "--pad-milliseconds-pre",
2173
+ dest="padMsecPre",
2174
+ metavar="<int>",
2175
+ type=int,
2176
+ default=config.get("pad_milliseconds_pre", 10),
2177
+ help=f"Milliseconds to pad before muted segments (default: {config.get('pad_milliseconds_pre', 10)})",
2178
+ )
2179
+ parser.add_argument(
2180
+ "--pad-milliseconds-post",
2181
+ dest="padMsecPost",
2182
+ metavar="<int>",
2183
+ type=int,
2184
+ default=config.get("pad_milliseconds_post", 10),
2185
+ help=f"Milliseconds to pad after muted segments (default: {config.get('pad_milliseconds_post', 10)})",
2186
+ )
2187
+ parser.add_argument(
2188
+ "-b",
2189
+ "--beep",
2190
+ dest="beep",
2191
+ type=mmguero.str2bool,
2192
+ nargs="?",
2193
+ const=True,
2194
+ default=False,
2195
+ metavar="true|false",
2196
+ help="Beep instead of silence",
2197
+ )
2198
+ parser.add_argument(
2199
+ "-z",
2200
+ "--beep-hertz",
2201
+ dest="beepHertz",
2202
+ metavar="<int>",
2203
+ type=int,
2204
+ default=config.get("beep_hertz", BEEP_HERTZ_DEFAULT),
2205
+ help=f"Beep frequency hertz (default: {config.get('beep_hertz', BEEP_HERTZ_DEFAULT)})",
2206
+ )
2207
+ parser.add_argument(
2208
+ "--beep-mix-normalize",
2209
+ dest="beepMixNormalize",
2210
+ type=mmguero.str2bool,
2211
+ nargs="?",
2212
+ const=True,
2213
+ default=BEEP_MIX_NORMALIZE_DEFAULT,
2214
+ metavar="true|false",
2215
+ help=f"Normalize mix of audio and beeps (default: {BEEP_MIX_NORMALIZE_DEFAULT})",
2216
+ )
2217
+ parser.add_argument(
2218
+ "--beep-audio-weight",
2219
+ dest="beepAudioWeight",
2220
+ metavar="<int>",
2221
+ type=int,
2222
+ default=BEEP_AUDIO_WEIGHT_DEFAULT,
2223
+ help=f"Mix weight for non-beeped audio (default: {BEEP_AUDIO_WEIGHT_DEFAULT})",
2224
+ )
2225
+ parser.add_argument(
2226
+ "--beep-sine-weight",
2227
+ dest="beepSineWeight",
2228
+ metavar="<int>",
2229
+ type=int,
2230
+ default=BEEP_SINE_WEIGHT_DEFAULT,
2231
+ help=f"Mix weight for beep (default: {BEEP_SINE_WEIGHT_DEFAULT})",
2232
+ )
2233
+ parser.add_argument(
2234
+ "--beep-dropout-transition",
2235
+ dest="beepDropTransition",
2236
+ metavar="<int>",
2237
+ type=int,
2238
+ default=BEEP_DROPOUT_TRANSITION_DEFAULT,
2239
+ help=f"Dropout transition for beep (default: {BEEP_DROPOUT_TRANSITION_DEFAULT})",
2240
+ )
2241
+
2242
+ parser.add_argument(
2243
+ "--force",
2244
+ dest="forceDespiteTag",
2245
+ type=mmguero.str2bool,
2246
+ nargs="?",
2247
+ const=True,
2248
+ default=False,
2249
+ metavar="true|false",
2250
+ help="Process file despite existence of embedded tag",
2251
+ )
2252
+
2253
+ parser.add_argument(
2254
+ "--clean-cache",
2255
+ dest="cleanCache",
2256
+ action="store_true",
2257
+ default=False,
2258
+ help=f"Delete all cached data (models, config) at {MONKEYPLUG_CACHE_DIR} and exit",
2259
+ )
2260
+
2261
+ voskArgGroup = parser.add_argument_group('VOSK Options')
2262
+ voskArgGroup.add_argument(
2263
+ "--vosk-model-dir",
2264
+ dest="voskModelDir",
2265
+ metavar="<string>",
2266
+ type=str,
2267
+ default=DEFAULT_VOSK_MODEL_DIR,
2268
+ help=f"VOSK model directory (default: {DEFAULT_VOSK_MODEL_DIR})",
2269
+ )
2270
+ voskArgGroup.add_argument(
2271
+ "--vosk-read-frames-chunk",
2272
+ dest="voskReadFramesChunk",
2273
+ metavar="<int>",
2274
+ type=int,
2275
+ default=os.getenv("VOSK_READ_FRAMES", AUDIO_DEFAULT_WAV_FRAMES_CHUNK),
2276
+ help=f"WAV frame chunk (default: {AUDIO_DEFAULT_WAV_FRAMES_CHUNK})",
2277
+ )
2278
+
2279
+ whisperArgGroup = parser.add_argument_group('Whisper Options')
2280
+ whisperArgGroup.add_argument(
2281
+ "--whisper-model-dir",
2282
+ dest="whisperModelDir",
2283
+ metavar="<string>",
2284
+ type=str,
2285
+ default=DEFAULT_WHISPER_MODEL_DIR,
2286
+ help=f"Whisper model directory ({DEFAULT_WHISPER_MODEL_DIR})",
2287
+ )
2288
+ whisperArgGroup.add_argument(
2289
+ "--whisper-model-name",
2290
+ dest="whisperModelName",
2291
+ metavar="<string>",
2292
+ type=str,
2293
+ default=DEFAULT_WHISPER_MODEL_NAME,
2294
+ help=f"Whisper model name ({DEFAULT_WHISPER_MODEL_NAME})",
2295
+ )
2296
+ whisperArgGroup.add_argument(
2297
+ "--torch-threads",
2298
+ dest="torchThreads",
2299
+ metavar="<int>",
2300
+ type=int,
2301
+ default=DEFAULT_TORCH_THREADS,
2302
+ help=f"Number of threads used by torch for CPU inference ({DEFAULT_TORCH_THREADS})",
2303
+ )
2304
+
2305
+ groqArgGroup = parser.add_argument_group('Groq Options')
2306
+ groqArgGroup.add_argument(
2307
+ "--groq-api-key",
2308
+ dest="groqApiKey",
2309
+ metavar="<string>",
2310
+ type=str,
2311
+ default=None,
2312
+ help="Groq API key (default: GROQ_API_KEY env var, ~/.groq/config.json, or ./.groq_key)",
2313
+ )
2314
+ groqArgGroup.add_argument(
2315
+ "--groq-model",
2316
+ dest="groqModel",
2317
+ metavar="<string>",
2318
+ type=str,
2319
+ default="whisper-large-v3",
2320
+ help="Groq Whisper model (default: whisper-large-v3)",
2321
+ )
2322
+
2323
+ try:
2324
+ parser.error = parser.exit
2325
+ args = parser.parse_args()
2326
+ except SystemExit as se:
2327
+ mmguero.eprint(se)
2328
+ exit(2)
2329
+
2330
+ # Handle --clean-cache early and exit
2331
+ if args.cleanCache:
2332
+ import shutil
2333
+ if os.path.isdir(MONKEYPLUG_CACHE_DIR):
2334
+ shutil.rmtree(MONKEYPLUG_CACHE_DIR)
2335
+ print(f"Deleted cache directory: {MONKEYPLUG_CACHE_DIR}")
2336
+ else:
2337
+ print(f"No cache directory found at: {MONKEYPLUG_CACHE_DIR}")
2338
+ return
2339
+
2340
+ # Set debug flag based on verbose level for backward compatibility
2341
+ if args.verbose == "full":
2342
+ args.debug = True
2343
+ args.verbose_level = "full"
2344
+ elif args.verbose == "concise":
2345
+ args.debug = True
2346
+ args.verbose_level = "concise"
2347
+ else:
2348
+ args.debug = False
2349
+ args.verbose_level = ""
2350
+
2351
+ if args.debug:
2352
+ mmguero.eprint(os.path.join(script_path, script_name))
2353
+ mmguero.eprint(f"Arguments: {sys.argv[1:]}")
2354
+ if args.verbose_level == "full":
2355
+ mmguero.eprint(f"Arguments: {args}")
2356
+ else:
2357
+ sys.tracebacklimit = 0
2358
+
2359
+ # Check if wildcards are present in input or output
2360
+ has_wildcards = '*' in args.input or '*' in args.output
2361
+
2362
+ # Process instrumental mode arguments
2363
+ auto_generate = False
2364
+ auto_mode_requested = False # Track if --instrumental auto was used
2365
+ skip_detection = False # Skip vocal detection in wildcard mode (--instrumental generate)
2366
+
2367
+ # Mode priority: mute > beep > instrumental
2368
+ if args.mute:
2369
+ # Mute mode: disable all instrumental processing
2370
+ if args.debug:
2371
+ mmguero.eprint('Mute mode - disabling instrumental processing')
2372
+ args.instrumentalPrefix = None
2373
+ args.instrumentalFile = None
2374
+ auto_generate = False
2375
+
2376
+ elif args.beep:
2377
+ # Beep mode: disable all instrumental processing (beep takes precedence)
2378
+ if args.debug:
2379
+ mmguero.eprint('Beep mode enabled - disabling instrumental mode')
2380
+ args.instrumentalPrefix = None
2381
+ args.instrumentalFile = None
2382
+ auto_generate = False
2383
+
2384
+ # Process instrumental mode arguments
2385
+ # Default to auto mode if no instrumental flag provided or instrumentalPrefix is default "AUTO"
2386
+ elif args.instrumentalFile is None and (args.instrumentalPrefix is None or args.instrumentalPrefix == "AUTO"):
2387
+ # No --instrumental flag provided, default to auto mode
2388
+ auto_mode_requested = True
2389
+ args.instrumentalPrefix = "AUTO"
2390
+ if args.debug:
2391
+ mmguero.eprint('Default: Auto mode (try prefix search → if not found, generate)')
2392
+
2393
+ elif args.instrumentalFile:
2394
+ # If --instrumental was provided with a value
2395
+ instrumental_mode = args.instrumentalFile.lower()
2396
+
2397
+ if instrumental_mode == "auto":
2398
+ # Auto mode: try prefix search first, if not found, generate
2399
+ auto_mode_requested = True # Track that auto mode was requested
2400
+ args.instrumentalFile = None # Clear mode keyword so it's not treated as filename
2401
+ if not args.instrumentalPrefix:
2402
+ args.instrumentalPrefix = "AUTO" # Set default for auto mode
2403
+
2404
+ # The search will be done later; if not found, we'll set auto_generate
2405
+ if args.debug:
2406
+ mmguero.eprint('Auto mode: Will try prefix search first, then generate if needed')
2407
+
2408
+ elif instrumental_mode == "generate":
2409
+ # Generate mode: force AI generation, skip instrumental file search
2410
+ auto_generate = True
2411
+ skip_detection = True
2412
+ args.instrumentalFile = None # Clear mode keyword so it's not treated as filename
2413
+ args.instrumentalPrefix = None # Skip instrumental file search entirely
2414
+ if args.debug:
2415
+ mmguero.eprint('Generate mode: Will use AI to generate instrumental')
2416
+
2417
+ elif instrumental_mode == "prefix":
2418
+ # Prefix mode: search with --instrumental-prefix value
2419
+ args.instrumentalFile = None # Clear mode keyword so it's not treated as filename
2420
+ if not args.instrumentalPrefix:
2421
+ args.instrumentalPrefix = "AUTO" # Default to AUTO if not specified
2422
+ if args.debug:
2423
+ mmguero.eprint(f'Prefix mode: Searching for instrumental with prefix "{args.instrumentalPrefix}"')
2424
+
2425
+ else:
2426
+ # Treat as filename - already set in args.instrumentalFile
2427
+ if args.debug:
2428
+ mmguero.eprint(f'Using specified instrumental file: {args.instrumentalFile}')
2429
+
2430
+ # --filter-instrumentals overrides generate mode's skip_detection
2431
+ if args.filterInstrumentals:
2432
+ skip_detection = False
2433
+
2434
+ if has_wildcards and args.speechRecMode == SPEECH_REC_MODE_GROQ:
2435
+ # Wildcard mode with vocal detection
2436
+ vocal_files, instrumental_files, output_files = expand_and_detect_vocals(
2437
+ args.input, args.output, args, skip_detection=skip_detection
2438
+ )
2439
+
2440
+ if not vocal_files:
2441
+ mmguero.eprint('No vocal files found to process. All files appear to be instrumentals.')
2442
+ sys.exit(0)
2443
+
2444
+ mmguero.eprint(f'\nProcessing {len(vocal_files)} file(s) with vocals...\n')
2445
+
2446
+ # Process each vocal file
2447
+ for idx, (input_file, output_file) in enumerate(zip(vocal_files, output_files), 1):
2448
+ mmguero.eprint(f'\n[{idx}/{len(vocal_files)}] Processing: {os.path.basename(input_file)}')
2449
+
2450
+ # Create a copy of args and modify input/output
2451
+ args_copy = argparse.Namespace(**vars(args))
2452
+ args_copy.input = input_file
2453
+ args_copy.output = output_file
2454
+
2455
+ # Find instrumental file for this specific file if using AUTO/prefix mode
2456
+ if args_copy.instrumentalPrefix and not args_copy.instrumentalFile:
2457
+ import glob
2458
+ from difflib import SequenceMatcher
2459
+
2460
+ input_dir = os.path.dirname(input_file)
2461
+ if not input_dir:
2462
+ input_dir = '.'
2463
+
2464
+ input_basename = os.path.basename(input_file)
2465
+ input_name, input_ext = os.path.splitext(input_basename)
2466
+
2467
+ # AUTO mode - fuzzy matching
2468
+ if args_copy.instrumentalPrefix.upper() == 'AUTO':
2469
+ if args_copy.debug:
2470
+ mmguero.eprint(f'AUTO mode: Searching for instrumental file using fuzzy matching')
2471
+
2472
+ # Get all audio files in the directory
2473
+ audio_extensions = ['.mp3', '.mp4', '.m4a', '.wav', '.flac', '.ogg', '.aac', '.wma']
2474
+ all_files = []
2475
+
2476
+ for ext in audio_extensions:
2477
+ all_files.extend(glob.glob(os.path.join(input_dir, f'*{ext}')))
2478
+
2479
+ # Filter out the input file itself and any files matching output pattern
2480
+ def pattern_to_regex(pattern):
2481
+ """Convert wildcard pattern to regex for matching"""
2482
+ import re
2483
+ regex = re.escape(pattern)
2484
+ regex = regex.replace(r'\*', '.*')
2485
+ return f'^{regex}$'
2486
+
2487
+ # If output file is specified, get its pattern to exclude matches
2488
+ output_pattern_to_exclude = None
2489
+ if output_file:
2490
+ # For single file, check exact basename match
2491
+ output_basename = os.path.basename(output_file)
2492
+ else:
2493
+ output_basename = None
2494
+
2495
+ other_files = []
2496
+ for f in all_files:
2497
+ basename = os.path.basename(f)
2498
+ # Skip input file
2499
+ if basename == input_basename:
2500
+ continue
2501
+ # Skip exact output file match if specified
2502
+ if output_basename and basename == output_basename:
2503
+ continue
2504
+ other_files.append(f)
2505
+
2506
+ # Two-way fuzzy matching with validation
2507
+ candidates_with_scores = []
2508
+ for candidate in other_files:
2509
+ candidate_basename = os.path.basename(candidate)
2510
+ candidate_name, _ = os.path.splitext(candidate_basename)
2511
+
2512
+ ratio = SequenceMatcher(None, input_name.lower(), candidate_name.lower()).ratio()
2513
+
2514
+ if args_copy.debug:
2515
+ mmguero.eprint(f' {candidate_basename}: similarity={ratio:.3f}')
2516
+
2517
+ if ratio < 1.0:
2518
+ candidates_with_scores.append((candidate, ratio))
2519
+
2520
+ candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
2521
+ top_candidates = candidates_with_scores[:args_copy.instrumentalAutoCandidates]
2522
+
2523
+ validated_candidates = []
2524
+ for candidate, candidate_to_input_score in top_candidates:
2525
+ candidate_basename = os.path.basename(candidate)
2526
+ candidate_name, _ = os.path.splitext(candidate_basename)
2527
+
2528
+ best_other_score = 0.0
2529
+ best_other_match = None
2530
+
2531
+ for other_file in all_files:
2532
+ other_basename = os.path.basename(other_file)
2533
+ if other_basename != input_basename and other_basename != candidate_basename:
2534
+ other_name, _ = os.path.splitext(other_basename)
2535
+ other_score = SequenceMatcher(None, candidate_name.lower(), other_name.lower()).ratio()
2536
+
2537
+ if other_score > best_other_score:
2538
+ best_other_score = other_score
2539
+ best_other_match = other_basename
2540
+
2541
+ if args_copy.debug:
2542
+ mmguero.eprint(f' Validating {candidate_basename}:')
2543
+ mmguero.eprint(f' to input: {candidate_to_input_score:.3f}')
2544
+ mmguero.eprint(f' to best other ({best_other_match}): {best_other_score:.3f}')
2545
+
2546
+ if candidate_to_input_score > best_other_score:
2547
+ validated_candidates.append((candidate, candidate_to_input_score))
2548
+ if args_copy.debug:
2549
+ mmguero.eprint(f' ✓ PASSED validation')
2550
+ else:
2551
+ if args_copy.debug:
2552
+ mmguero.eprint(f' ✗ FAILED validation')
2553
+
2554
+ if validated_candidates:
2555
+ best_match, best_ratio = validated_candidates[0]
2556
+ if best_ratio >= 0.3:
2557
+ args_copy.instrumentalFile = best_match
2558
+ if args_copy.debug:
2559
+ mmguero.eprint(f'AUTO mode matched: {os.path.basename(best_match)} (similarity: {best_ratio:.3f})')
2560
+ else:
2561
+ # Auto mode: no valid match found, enable AI generation
2562
+ if auto_mode_requested:
2563
+ if args_copy.debug:
2564
+ mmguero.eprint(f' Auto mode: No validated match above threshold, will use AI generation')
2565
+ else:
2566
+ mmguero.eprint(f' No validated match above threshold, will use AI generation')
2567
+ else:
2568
+ # Auto mode: all candidates failed validation, enable AI generation
2569
+ if auto_mode_requested:
2570
+ if args_copy.debug:
2571
+ mmguero.eprint(f' Auto mode: All candidates failed validation, will use AI generation')
2572
+ else:
2573
+ mmguero.eprint(f' All candidates failed validation, will use AI generation')
2574
+
2575
+ # Process this file
2576
+ # Determine if AI generation should be used for this specific file
2577
+ file_auto_generate = auto_generate
2578
+ if auto_mode_requested and not args_copy.instrumentalFile:
2579
+ file_auto_generate = True
2580
+
2581
+ plug = GroqPlugger(
2582
+ args_copy.input,
2583
+ args_copy.output,
2584
+ args_copy.outputFormat,
2585
+ args_copy.swears,
2586
+ args_copy.groqApiKey,
2587
+ args_copy.groqModel,
2588
+ args_copy.outputJson,
2589
+ inputTranscript=args_copy.inputTranscript,
2590
+ saveTranscript=args_copy.saveTranscript,
2591
+ forceRetranscribe=args_copy.forceRetranscribe,
2592
+ aParams=args_copy.aParams,
2593
+ aChannels=args_copy.aChannels,
2594
+ aSampleRate=args_copy.aSampleRate,
2595
+ aBitRate=args_copy.aBitRate,
2596
+ aVorbisQscale=args_copy.aVorbisQscale,
2597
+ padMsecPre=args_copy.padMsecPre if args_copy.padMsecPre > 0 else args_copy.padMsec,
2598
+ padMsecPost=args_copy.padMsecPost if args_copy.padMsecPost > 0 else args_copy.padMsec,
2599
+ beep=args_copy.beep,
2600
+ beepHertz=args_copy.beepHertz,
2601
+ beepMixNormalize=args_copy.beepMixNormalize,
2602
+ beepAudioWeight=args_copy.beepAudioWeight,
2603
+ beepSineWeight=args_copy.beepSineWeight,
2604
+ beepDropTransition=args_copy.beepDropTransition,
2605
+ force=args_copy.forceDespiteTag,
2606
+ dbug=args_copy.debug,
2607
+ instrumentalFileSpec=args_copy.instrumentalFile,
2608
+ verbose_level=args_copy.verbose_level if hasattr(args_copy, 'verbose_level') else "",
2609
+ auto_generate=file_auto_generate,
2610
+ separation_padding=args_copy.separationPadding,
2611
+ )
2612
+
2613
+ print(plug.EncodeCleanAudio())
2614
+
2615
+ mmguero.eprint(f'\n✓ Completed processing {len(vocal_files)} file(s)')
2616
+ mmguero.eprint(f'Skipped {len(instrumental_files)} instrumental file(s)')
2617
+ sys.exit(0)
2618
+
2619
+ # Single file mode (no wildcards or not using Groq mode)
2620
+ # Find instrumental file if prefix is specified
2621
+ if args.instrumentalPrefix and not args.instrumentalFile:
2622
+ import glob
2623
+ from difflib import SequenceMatcher
2624
+
2625
+ input_dir = os.path.dirname(args.input)
2626
+ if not input_dir:
2627
+ input_dir = '.'
2628
+
2629
+ input_basename = os.path.basename(args.input)
2630
+ input_name, input_ext = os.path.splitext(input_basename)
2631
+
2632
+ # AUTO mode - fuzzy matching
2633
+ if args.instrumentalPrefix.upper() == 'AUTO':
2634
+ if args.debug:
2635
+ mmguero.eprint(f'AUTO mode: Searching for instrumental file using fuzzy matching')
2636
+
2637
+ # Get all audio files in the directory
2638
+ audio_extensions = ['.mp3', '.mp4', '.m4a', '.wav', '.flac', '.ogg', '.aac', '.wma']
2639
+ all_files = []
2640
+
2641
+ for ext in audio_extensions:
2642
+ all_files.extend(glob.glob(os.path.join(input_dir, f'*{ext}')))
2643
+
2644
+ # Filter out the input file itself and the output file
2645
+ output_basename = os.path.basename(args.output) if args.output else None
2646
+ other_files = []
2647
+ for f in all_files:
2648
+ basename = os.path.basename(f)
2649
+ # Skip input file
2650
+ if basename == input_basename:
2651
+ continue
2652
+ # Skip exact output file match if specified
2653
+ if output_basename and basename == output_basename:
2654
+ continue
2655
+ other_files.append(f)
2656
+
2657
+ if not other_files:
2658
+ mmguero.eprint(f'Warning: AUTO mode found no other audio files in directory')
2659
+ else:
2660
+ # Two-way fuzzy matching with validation
2661
+ # Step 1: Find top N candidates by similarity to input
2662
+ candidates_with_scores = []
2663
+ for candidate in other_files:
2664
+ candidate_basename = os.path.basename(candidate)
2665
+ candidate_name, _ = os.path.splitext(candidate_basename)
2666
+
2667
+ # Calculate similarity ratio (0 to 1)
2668
+ ratio = SequenceMatcher(None, input_name.lower(), candidate_name.lower()).ratio()
2669
+
2670
+ if args.debug:
2671
+ mmguero.eprint(f' {candidate_basename}: similarity={ratio:.3f}')
2672
+
2673
+ if ratio < 1.0: # Don't match the exact same file
2674
+ candidates_with_scores.append((candidate, ratio))
2675
+
2676
+ # Sort by score descending, take top N
2677
+ candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
2678
+ top_candidates = candidates_with_scores[:args.instrumentalAutoCandidates]
2679
+
2680
+ if args.debug and top_candidates:
2681
+ mmguero.eprint(f'Top {len(top_candidates)} candidates: {[os.path.basename(c[0]) for c in top_candidates]}')
2682
+
2683
+ # Step 2: Validate each candidate with two-way check
2684
+ validated_candidates = []
2685
+ for candidate, candidate_to_input_score in top_candidates:
2686
+ candidate_basename = os.path.basename(candidate)
2687
+ candidate_name, _ = os.path.splitext(candidate_basename)
2688
+
2689
+ # Find candidate's best match among ALL files (except input and itself)
2690
+ best_other_score = 0.0
2691
+ best_other_match = None
2692
+
2693
+ for other_file in all_files:
2694
+ other_basename = os.path.basename(other_file)
2695
+ if other_basename != input_basename and other_basename != candidate_basename:
2696
+ other_name, _ = os.path.splitext(other_basename)
2697
+
2698
+ # Calculate similarity between candidate and this other file
2699
+ other_score = SequenceMatcher(None, candidate_name.lower(), other_name.lower()).ratio()
2700
+
2701
+ if other_score > best_other_score:
2702
+ best_other_score = other_score
2703
+ best_other_match = other_basename
2704
+
2705
+ # Validation: candidate must be more similar to input than to any other file
2706
+ if args.debug:
2707
+ mmguero.eprint(f' Validating {candidate_basename}:')
2708
+ mmguero.eprint(f' to input: {candidate_to_input_score:.3f}')
2709
+ mmguero.eprint(f' to best other ({best_other_match}): {best_other_score:.3f}')
2710
+
2711
+ if candidate_to_input_score > best_other_score:
2712
+ validated_candidates.append((candidate, candidate_to_input_score))
2713
+ if args.debug:
2714
+ mmguero.eprint(f' ✓ PASSED validation')
2715
+ else:
2716
+ if args.debug:
2717
+ mmguero.eprint(f' ✗ FAILED validation (better match with {best_other_match})')
2718
+
2719
+ # Step 3: Use best validated candidate
2720
+ if validated_candidates:
2721
+ best_match, best_ratio = validated_candidates[0] # Already sorted by score
2722
+ if best_ratio >= 0.3: # 30% similarity threshold
2723
+ args.instrumentalFile = best_match
2724
+ if args.debug:
2725
+ mmguero.eprint(f'AUTO mode matched: {os.path.basename(best_match)} (similarity: {best_ratio:.3f})')
2726
+ else:
2727
+ # Auto mode: no valid match found, will use AI generation
2728
+ if auto_mode_requested:
2729
+ mmguero.eprint(f'Warning: AUTO mode found candidates but all below 30% threshold')
2730
+ mmguero.eprint(f'Best validated match was {os.path.basename(best_match)} with similarity {best_ratio:.3f}')
2731
+ mmguero.eprint(f'Auto mode: Will use AI to generate instrumental')
2732
+ else:
2733
+ mmguero.eprint(f'Warning: AUTO mode found candidates but all below 30% threshold')
2734
+ mmguero.eprint(f'Best validated match was {os.path.basename(best_match)} with similarity {best_ratio:.3f}')
2735
+ mmguero.eprint(f'No instrumental file found, will use AI generation')
2736
+ else:
2737
+ # Auto mode: all candidates failed validation, will use AI generation
2738
+ if auto_mode_requested:
2739
+ mmguero.eprint(f'Warning: AUTO mode could not find a validated instrumental file')
2740
+ mmguero.eprint(f'All top candidates failed two-way validation (likely belong to other songs)')
2741
+ mmguero.eprint(f'Auto mode: Will use AI to generate instrumental')
2742
+ else:
2743
+ mmguero.eprint(f'Warning: AUTO mode could not find a validated instrumental file')
2744
+ mmguero.eprint(f'All top candidates failed two-way validation (likely belong to other songs)')
2745
+ mmguero.eprint(f'No instrumental file found, will use AI generation')
2746
+ else:
2747
+ # Pattern-based search with specified prefix
2748
+ # Common patterns to search for
2749
+ patterns = [
2750
+ f"{input_name}_{args.instrumentalPrefix}{input_ext}", # song_instrumental.mp3
2751
+ f"{input_name}-{args.instrumentalPrefix}{input_ext}", # song-instrumental.mp3
2752
+ f"{input_name}{args.instrumentalPrefix}{input_ext}", # songinstrumental.mp3
2753
+ f"{args.instrumentalPrefix}_{input_name}{input_ext}", # instrumental_song.mp3
2754
+ f"{args.instrumentalPrefix}-{input_name}{input_ext}", # instrumental-song.mp3
2755
+ ]
2756
+
2757
+ if args.debug:
2758
+ mmguero.eprint(f'Searching for instrumental file with prefix: {args.instrumentalPrefix}')
2759
+ mmguero.eprint(f'Patterns: {patterns}')
2760
+
2761
+ found = False
2762
+ for pattern in patterns:
2763
+ search_path = os.path.join(input_dir, pattern)
2764
+ matches = glob.glob(search_path)
2765
+ if matches:
2766
+ args.instrumentalFile = matches[0]
2767
+ found = True
2768
+ if args.debug:
2769
+ mmguero.eprint(f'Found instrumental file: {args.instrumentalFile}')
2770
+ break
2771
+
2772
+ if not found:
2773
+ mmguero.eprint(f'Warning: Could not find instrumental file matching prefix "{args.instrumentalPrefix}"')
2774
+ mmguero.eprint(f'Searched for patterns: {patterns}')
2775
+ # If auto mode was requested, enable AI generation
2776
+ if auto_mode_requested:
2777
+ auto_generate = True
2778
+ mmguero.eprint(f'Auto mode: No instrumental found, will use AI to generate instrumental')
2779
+ else:
2780
+ mmguero.eprint(f'Will use AI to generate instrumental instead')
2781
+
2782
+ # Single file mode: check if we should enable auto_generate after search
2783
+ # If auto mode was requested and no file was found, enable generation
2784
+ if auto_mode_requested and not args.instrumentalFile and not auto_generate:
2785
+ auto_generate = True
2786
+ if args.debug:
2787
+ mmguero.eprint('Auto mode: No instrumental file found, enabling AI generation')
2788
+
2789
+ if args.speechRecMode == SPEECH_REC_MODE_VOSK:
2790
+ pathlib.Path(args.voskModelDir).mkdir(parents=True, exist_ok=True)
2791
+ plug = VoskPlugger(
2792
+ args.input,
2793
+ args.output,
2794
+ args.outputFormat,
2795
+ args.swears,
2796
+ args.voskModelDir,
2797
+ args.outputJson,
2798
+ inputTranscript=args.inputTranscript,
2799
+ saveTranscript=args.saveTranscript,
2800
+ forceRetranscribe=args.forceRetranscribe,
2801
+ aParams=args.aParams,
2802
+ aChannels=args.aChannels,
2803
+ aSampleRate=args.aSampleRate,
2804
+ aBitRate=args.aBitRate,
2805
+ aVorbisQscale=args.aVorbisQscale,
2806
+ wChunk=args.voskReadFramesChunk,
2807
+ padMsecPre=args.padMsecPre if args.padMsecPre > 0 else args.padMsec,
2808
+ padMsecPost=args.padMsecPost if args.padMsecPost > 0 else args.padMsec,
2809
+ beep=args.beep,
2810
+ beepHertz=args.beepHertz,
2811
+ beepMixNormalize=args.beepMixNormalize,
2812
+ beepAudioWeight=args.beepAudioWeight,
2813
+ beepSineWeight=args.beepSineWeight,
2814
+ beepDropTransition=args.beepDropTransition,
2815
+ force=args.forceDespiteTag,
2816
+ dbug=args.debug,
2817
+ )
2818
+
2819
+ elif args.speechRecMode == SPEECH_REC_MODE_WHISPER:
2820
+ pathlib.Path(args.whisperModelDir).mkdir(parents=True, exist_ok=True)
2821
+ plug = WhisperPlugger(
2822
+ args.input,
2823
+ args.output,
2824
+ args.outputFormat,
2825
+ args.swears,
2826
+ args.whisperModelDir,
2827
+ args.whisperModelName,
2828
+ args.torchThreads,
2829
+ args.outputJson,
2830
+ inputTranscript=args.inputTranscript,
2831
+ saveTranscript=args.saveTranscript,
2832
+ forceRetranscribe=args.forceRetranscribe,
2833
+ aParams=args.aParams,
2834
+ aChannels=args.aChannels,
2835
+ aSampleRate=args.aSampleRate,
2836
+ aBitRate=args.aBitRate,
2837
+ aVorbisQscale=args.aVorbisQscale,
2838
+ padMsecPre=args.padMsecPre if args.padMsecPre > 0 else args.padMsec,
2839
+ padMsecPost=args.padMsecPost if args.padMsecPost > 0 else args.padMsec,
2840
+ beep=args.beep,
2841
+ beepHertz=args.beepHertz,
2842
+ beepMixNormalize=args.beepMixNormalize,
2843
+ beepAudioWeight=args.beepAudioWeight,
2844
+ beepSineWeight=args.beepSineWeight,
2845
+ beepDropTransition=args.beepDropTransition,
2846
+ force=args.forceDespiteTag,
2847
+ dbug=args.debug,
2848
+ )
2849
+
2850
+ elif args.speechRecMode == SPEECH_REC_MODE_GROQ:
2851
+ plug = GroqPlugger(
2852
+ args.input,
2853
+ args.output,
2854
+ args.outputFormat,
2855
+ args.swears,
2856
+ args.groqApiKey,
2857
+ args.groqModel,
2858
+ args.outputJson,
2859
+ inputTranscript=args.inputTranscript,
2860
+ saveTranscript=args.saveTranscript,
2861
+ forceRetranscribe=args.forceRetranscribe,
2862
+ aParams=args.aParams,
2863
+ aChannels=args.aChannels,
2864
+ aSampleRate=args.aSampleRate,
2865
+ aBitRate=args.aBitRate,
2866
+ aVorbisQscale=args.aVorbisQscale,
2867
+ padMsecPre=args.padMsecPre if args.padMsecPre > 0 else args.padMsec,
2868
+ padMsecPost=args.padMsecPost if args.padMsecPost > 0 else args.padMsec,
2869
+ beep=args.beep,
2870
+ beepHertz=args.beepHertz,
2871
+ beepMixNormalize=args.beepMixNormalize,
2872
+ beepAudioWeight=args.beepAudioWeight,
2873
+ beepSineWeight=args.beepSineWeight,
2874
+ beepDropTransition=args.beepDropTransition,
2875
+ force=args.forceDespiteTag,
2876
+ dbug=args.debug,
2877
+ instrumentalFileSpec=args.instrumentalFile,
2878
+ verbose_level=args.verbose_level if hasattr(args, 'verbose_level') else "",
2879
+ auto_generate=auto_generate,
2880
+ separation_padding=args.separationPadding,
2881
+ )
2882
+ else:
2883
+ raise ValueError(f"Unsupported speech recognition engine {args.speechRecMode}")
2884
+
2885
+ print(plug.EncodeCleanAudio())
2886
+
2887
+ sys.exit(0)
2888
+
2889
+
2890
+ ###################################################################################################
2891
+ if __name__ == "__main__":  # true only when this file is run as a script, not when it is imported
2892
+ RunMonkeyPlug()  # CLI entry point (presumably the function whose body ends just above -- confirm)