monkeyplug-enhanced 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monkeyplug/__init__.py +21 -0
- monkeyplug/data/profanity_list.json +1 -0
- monkeyplug/groq_config.py +70 -0
- monkeyplug/monkeyplug.py +2892 -0
- monkeyplug/separation.py +147 -0
- monkeyplug_enhanced-2.2.0.data/data/monkeyplug/data/profanity_list.json +1 -0
- monkeyplug_enhanced-2.2.0.dist-info/METADATA +360 -0
- monkeyplug_enhanced-2.2.0.dist-info/RECORD +11 -0
- monkeyplug_enhanced-2.2.0.dist-info/WHEEL +4 -0
- monkeyplug_enhanced-2.2.0.dist-info/entry_points.txt +2 -0
- monkeyplug_enhanced-2.2.0.dist-info/licenses/LICENSE +29 -0
monkeyplug/monkeyplug.py
ADDED
|
@@ -0,0 +1,2892 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import base64
|
|
6
|
+
import errno
|
|
7
|
+
import importlib
|
|
8
|
+
import importlib.metadata
|
|
9
|
+
import importlib.util
|
|
10
|
+
import json
|
|
11
|
+
import mmguero
|
|
12
|
+
import mutagen
|
|
13
|
+
import os
|
|
14
|
+
import pathlib
|
|
15
|
+
import requests
|
|
16
|
+
import shutil
|
|
17
|
+
import string
|
|
18
|
+
import sys
|
|
19
|
+
import wave
|
|
20
|
+
|
|
21
|
+
from urllib.parse import urlparse
|
|
22
|
+
from itertools import tee
|
|
23
|
+
|
|
24
|
+
###################################################################################################
|
|
25
|
+
# Placeholder tokens used inside the ffmpeg parameter templates below; they are
# swapped for concrete values when a Plugger builds its encode parameters.
CHANNELS_REPLACER = 'CHANNELS'
SAMPLE_RATE_REPLACER = 'SAMPLE'
BIT_RATE_REPLACER = 'BITRATE'
VORBIS_QSCALE_REPLACER = 'QSCALE'

# Default ffmpeg audio encoding arguments, keyed by output file format (no leading dot).
AUDIO_DEFAULT_PARAMS_BY_FORMAT = {
    "flac": [
        "-c:a", "flac",
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "m4a": [
        "-c:a", "aac",
        "-b:a", BIT_RATE_REPLACER,
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "aac": [
        "-c:a", "aac",
        "-b:a", BIT_RATE_REPLACER,
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "mp3": [
        "-c:a", "libmp3lame",
        "-b:a", BIT_RATE_REPLACER,
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "ogg": [
        "-c:a", "libvorbis",
        "-qscale:a", VORBIS_QSCALE_REPLACER,
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "opus": [
        "-c:a", "libopus",
        "-b:a", BIT_RATE_REPLACER,
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "ac3": [
        "-c:a", "ac3",
        "-b:a", BIT_RATE_REPLACER,
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
    "wav": [
        "-c:a", "pcm_s16le",
        "-ar", SAMPLE_RATE_REPLACER,
        "-ac", CHANNELS_REPLACER,
    ],
}

# Maps an audio codec name (as reported by ffprobe) to the output format used for it.
AUDIO_CODEC_TO_FORMAT = dict(
    aac="m4a",
    ac3="ac3",
    flac="flac",
    mp3="mp3",
    opus="opus",
    vorbis="ogg",
    pcm_s16le="wav",
)

# Audio output defaults
AUDIO_DEFAULT_FORMAT = "mp3"
AUDIO_DEFAULT_CHANNELS = 2
AUDIO_DEFAULT_SAMPLE_RATE = 48000
AUDIO_DEFAULT_BIT_RATE = "256K"
AUDIO_DEFAULT_VORBIS_QSCALE = 5
AUDIO_MATCH_FORMAT = "MATCH"
AUDIO_INTERMEDIATE_PARAMS = ["-c:a", "pcm_s16le", "-ac", "1", "-ar", "16000"]
AUDIO_DEFAULT_WAV_FRAMES_CHUNK = 8000

# Beep defaults
BEEP_HERTZ_DEFAULT = 1000
BEEP_MIX_NORMALIZE_DEFAULT = False
BEEP_AUDIO_WEIGHT_DEFAULT = 1
BEEP_SINE_WEIGHT_DEFAULT = 1
BEEP_DROPOUT_TRANSITION_DEFAULT = 0

# Swears list and file tagging
SWEARS_FILENAME_DEFAULT = 'swears.txt'
MUTAGEN_METADATA_TAGS = ['encodedby', 'comment']
MUTAGEN_METADATA_TAG_VALUE = 'monkeyplug'

# Speech recognition modes and model locations (overridable through the environment)
SPEECH_REC_MODE_VOSK = "vosk"
SPEECH_REC_MODE_WHISPER = "whisper"
SPEECH_REC_MODE_GROQ = "groq"
DEFAULT_SPEECH_REC_MODE = os.getenv("MONKEYPLUG_MODE", SPEECH_REC_MODE_GROQ)
DEFAULT_VOSK_MODEL_DIR = os.getenv(
    "VOSK_MODEL_DIR",
    os.path.join(os.path.expanduser("~"), '.cache', 'vosk'),
)
DEFAULT_WHISPER_MODEL_DIR = os.getenv(
    "WHISPER_MODEL_DIR",
    os.path.join(os.path.expanduser("~"), '.cache', 'whisper'),
)
DEFAULT_WHISPER_MODEL_NAME = os.getenv("WHISPER_MODEL_NAME", "small.en")
DEFAULT_TORCH_THREADS = 0
|
|
86
|
+
|
|
87
|
+
###################################################################################################
|
|
88
|
+
# Determine script_path and script_name in a way that works both as module and direct execution
|
|
89
|
+
# The script name is the same whichever way the path is resolved.
script_name = 'monkeyplug.py'
try:
    # Normal case: resolve the directory containing this module
    script_path = os.path.dirname(os.path.realpath(__file__))
except (NameError, TypeError):
    # Fallback for edge cases where __file__ is unavailable or unusable
    if sys.argv and sys.argv[0]:
        script_path = os.path.dirname(os.path.realpath(sys.argv[0]))
    else:
        script_path = os.getcwd()
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# thanks https://docs.python.org/3/library/itertools.html#recipes
|
|
100
|
+
def pairwise(iterable):
    """Return an iterator of overlapping pairs: s -> (s0, s1), (s1, s2), ...

    An iterable with fewer than two items yields no pairs.
    """
    leading, trailing = tee(iterable)
    # advance the second copy by one item so the zip lines up neighbours
    for _ in trailing:
        break
    return zip(leading, trailing)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def scrubword(value):
    """Normalize a word for comparison against the swears map.

    The value is converted to a string, lowercased, has typographic
    apostrophes replaced by ASCII ones, and has leading/trailing ASCII
    punctuation stripped.
    """
    lowered = str(value).lower()
    straightQuoted = lowered.replace("’", "'")
    return straightQuoted.strip(string.punctuation)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
###################################################################################################
|
|
111
|
+
# download to file
|
|
112
|
+
def DownloadToFile(url, local_filename=None, chunk_bytes=4096, debug=False):
    """Download ``url`` to a local file.

    Args:
        url: The URL to fetch (redirects are followed).
        local_filename: Destination path; when falsy, the basename of the
            URL's path component is used.
        chunk_bytes: Chunk size used while streaming the response body.
        debug: When true, report the outcome on stderr.

    Returns:
        The path of the downloaded file, or ``None`` when the server answered
        with an error status or the resulting file is missing or empty (an
        empty file is removed).
    """
    tmpDownloadedFileSpec = local_filename if local_filename else os.path.basename(urlparse(url).path)

    # the context manager releases the streamed connection even if writing fails
    with requests.get(url, stream=True, allow_redirects=True) as r:
        if not r.ok:
            # don't save an HTTP error page (404, 500, ...) as if it were the media file
            if debug:
                mmguero.eprint(f"Download of {url} to {tmpDownloadedFileSpec} failed (HTTP {r.status_code})")
            return None
        with open(tmpDownloadedFileSpec, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_bytes):
                if chunk:
                    f.write(chunk)

    fExists = os.path.isfile(tmpDownloadedFileSpec)
    fSize = os.path.getsize(tmpDownloadedFileSpec) if fExists else 0
    if debug:
        mmguero.eprint(
            f"Download of {url} to {tmpDownloadedFileSpec} {'succeeded' if fExists else 'failed'} ({mmguero.size_human_format(fSize)})"
        )

    if fExists and (fSize > 0):
        return tmpDownloadedFileSpec

    # an empty download is useless to the caller; don't leave it behind
    if fExists:
        os.remove(tmpDownloadedFileSpec)
    return None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
###################################################################################################
|
|
135
|
+
# Get tag from file to indicate monkeyplug has already been set
|
|
136
|
+
def GetMonkeyplugTagged(local_filename, debug=False):
    """Report whether a file already carries the monkeyplug metadata tag.

    Args:
        local_filename: Path of the file to inspect.
        debug: When true, print the file's tags and any tag-reading errors
            to stderr.

    Returns:
        True if any tag listed in MUTAGEN_METADATA_TAGS contains
        MUTAGEN_METADATA_TAG_VALUE; False otherwise (including when the file
        does not exist or exposes no readable tags).
    """
    if not os.path.isfile(local_filename):
        return False

    mut = mutagen.File(local_filename, easy=True)
    if debug:
        mmguero.eprint(f'Tags of {local_filename}: {mut}')

    # mutagen.File may hand back something without mapping-style access
    if not hasattr(mut, 'get'):
        return False

    for tagName in MUTAGEN_METADATA_TAGS:
        try:
            tagValues = mmguero.get_iterable(mut.get(tagName, default=()))
            if MUTAGEN_METADATA_TAG_VALUE in tagValues:
                return True
        except Exception as e:
            # a tag that can't be read is treated as "not tagged"; try the next one
            if debug:
                mmguero.eprint(e)
    return False
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
###################################################################################################
|
|
155
|
+
# Set tag to file to indicate monkeyplug has worked its magic
|
|
156
|
+
def SetMonkeyplugTag(local_filename, debug=False):
    """Write the monkeyplug marker tag into a file's metadata.

    Args:
        local_filename: Path of the file to tag.
        debug: When true, print the tags before and after, plus any errors
            hit while assigning a tag, to stderr.

    Returns:
        True if one of MUTAGEN_METADATA_TAGS was set and the file was saved;
        False otherwise (file missing, tags not writable, or save failed).
    """
    result = False
    if os.path.isfile(local_filename):
        mut = mutagen.File(local_filename, easy=True)
        if debug:
            mmguero.eprint(f'Tags of {local_filename} before: {mut}')
        # only attempt tagging when the loaded object supports item assignment
        if hasattr(mut, '__setitem__'):
            for tag in MUTAGEN_METADATA_TAGS:
                try:
                    mut[tag] = MUTAGEN_METADATA_TAG_VALUE
                    result = True
                    # the first tag that can be assigned is enough
                    break
                except Exception as e:
                    if debug:
                        mmguero.eprint(e)
            if result:
                try:
                    mut.save(local_filename)
                except Exception as e:
                    # a save failure is always reported, regardless of debug
                    result = False
                    mmguero.eprint(e)
            if debug:
                mmguero.eprint(f'Tags of {local_filename} after: {mut}')

    return result
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
###################################################################################################
|
|
184
|
+
# get stream codecs from an input filename
|
|
185
|
+
# e.g. result: {'video': {'h264'}, 'audio': {'eac3'}, 'subtitle': {'subrip'}}
|
|
186
|
+
def GetCodecs(local_filename, debug=False):
    """Probe a media file with ffprobe and collect its codecs per stream type.

    Args:
        local_filename: Path of the file to analyze.
        debug: Passed through to the process runner.

    Returns:
        A dict mapping each lowercased stream type to a set of lowercased
        codec names, plus a 'format' entry holding the container format
        names split on commas (or whatever non-string value ffprobe's output
        yielded). An empty dict is returned when the file does not exist.

    Raises:
        ValueError: If ffprobe exits with a non-zero status.
    """
    codecsByType = {}
    if not os.path.isfile(local_filename):
        return codecsByType

    ffprobeCmd = [
        'ffprobe',
        '-v',
        'quiet',
        '-print_format',
        'json',
        '-show_format',
        '-show_streams',
        local_filename,
    ]
    ffprobeResult, ffprobeOutput = mmguero.run_process(ffprobeCmd, stdout=True, stderr=False, debug=debug)
    if ffprobeResult != 0:
        # show what was run and what came back before giving up
        mmguero.eprint(' '.join(mmguero.flatten(ffprobeCmd)))
        mmguero.eprint(ffprobeResult)
        mmguero.eprint(ffprobeOutput)
        raise ValueError(f"Could not analyze {local_filename}")

    probeInfo = mmguero.load_str_if_json(' '.join(ffprobeOutput))
    if 'streams' in probeInfo:
        for stream in probeInfo['streams']:
            if 'codec_name' in stream and 'codec_type' in stream:
                streamType = stream['codec_type'].lower()
                codecName = stream['codec_name'].lower()
                codecsByType.setdefault(streamType, set()).add(codecName)

    containerFormat = mmguero.deep_get(probeInfo, ['format', 'format_name'])
    if isinstance(containerFormat, str):
        containerFormat = containerFormat.split(',')
    codecsByType['format'] = containerFormat

    return codecsByType
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
#################################################################################
|
|
224
|
+
class Plugger(object):
    """Holds the state for cleaning one input file.

    The attributes below are class-level defaults for the input/output file
    names, the swears map, the transcript word lists, the mute/beep filter
    lists and the instrumental-splicing settings.
    """

    # NOTE(review): the dict/list defaults below are class attributes, so they
    # are shared by every instance unless an instance rebinds them.
    debug = False
    inputFileSpec = ""
    inputCodecs = {}
    inputFileParts = None
    outputFileSpec = ""
    outputAudioFileFormat = ""
    outputVideoFileFormat = ""
    outputJson = ""
    tmpDownloadedFileSpec = ""
    swearsFileSpec = ""
    swearsMap = {}
    wordList = []
    naughtyWordList = []
    # for beep and mute
    muteTimeList = []
    # for beep only
    sineTimeList = []
    beepDelayList = []
    padSecPre = 0.0
    padSecPost = 0.0
    beep = False
    beepHertz = BEEP_HERTZ_DEFAULT
    beepMixNormalize = BEEP_MIX_NORMALIZE_DEFAULT
    beepAudioWeight = BEEP_AUDIO_WEIGHT_DEFAULT
    beepSineWeight = BEEP_SINE_WEIGHT_DEFAULT
    beepDropTransition = BEEP_DROPOUT_TRANSITION_DEFAULT
    forceDespiteTag = False
    aParams = None
    tags = None
    # for instrumental splicing
    instrumentalFileSpec = ""
    instrumentalMode = False
    instrumentalSegments = []  # List of (start, end) tuples for profanity sections
|
|
258
|
+
|
|
259
|
+
######## init #################################################################
|
|
260
|
+
def __init__(
    self,
    iFileSpec,
    oFileSpec,
    oAudioFileFormat,
    iSwearsFileSpec,
    outputJson,
    inputTranscript=None,
    saveTranscript=False,
    forceRetranscribe=False,
    aParams=None,
    aChannels=AUDIO_DEFAULT_CHANNELS,
    aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
    aBitRate=AUDIO_DEFAULT_BIT_RATE,
    aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
    padMsecPre=0,
    padMsecPost=0,
    beep=False,
    beepHertz=BEEP_HERTZ_DEFAULT,
    beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
    beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
    beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
    beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
    force=False,
    dbug=False,
    instrumentalFileSpec=None,
):
    """Resolve input/output files, encoding parameters and the swears list.

    Args:
        iFileSpec: Path of a local input file, or an http(s) URL to download.
        oFileSpec: Output file path; when falsy, the input base name with
            "_clean" appended is used.
        oAudioFileFormat: Output audio format, AUDIO_MATCH_FORMAT to follow
            the input, or falsy to take it from oFileSpec's extension.
        iSwearsFileSpec: Optional custom swears file (ignored if not a file).
        outputJson: Optional path for the transcript JSON output.
        inputTranscript: Optional path of an existing transcript to reuse.
        saveTranscript: When true and outputJson is empty, a transcript path
            is derived from the output file name; an existing transcript at
            that path is reused unless forceRetranscribe is set.
        forceRetranscribe: Disables the automatic transcript reuse.
        aParams: Custom ffmpeg audio parameters as one space-separated
            string, optionally prefixed with "base64:".
        aChannels, aSampleRate, aBitRate, aVorbisQscale: Values substituted
            for the placeholder tokens in the encode parameters.
        padMsecPre, padMsecPost: Padding in milliseconds (stored as seconds).
        beep, beepHertz, beepMixNormalize, beepAudioWeight, beepSineWeight,
            beepDropTransition: Beep settings, stored as given.
        force: Stored as forceDespiteTag.
        dbug: Enables debug output on stderr.
        instrumentalFileSpec: Optional instrumental file; enables
            instrumental mode when provided.

    Raises:
        IOError: If the input cannot be found or downloaded, or the
            instrumental file does not exist.
        ValueError: If the output audio format cannot be determined or is
            unsupported, or the instrumental file is shorter than the input.
    """
    # Rebind the mutable class-level defaults so that nothing loaded into them
    # (swears, transcript words, filter lists) leaks between Plugger instances.
    self.inputCodecs = {}
    self.swearsMap = {}
    self.wordList = []
    self.naughtyWordList = []
    self.muteTimeList = []
    self.sineTimeList = []
    self.beepDelayList = []
    self.instrumentalSegments = []

    self.padSecPre = padMsecPre / 1000.0
    self.padSecPost = padMsecPost / 1000.0
    self.beep = beep
    self.beepHertz = beepHertz
    self.beepMixNormalize = beepMixNormalize
    self.beepAudioWeight = beepAudioWeight
    self.beepSineWeight = beepSineWeight
    self.beepDropTransition = beepDropTransition
    self.forceDespiteTag = force
    self.debug = dbug
    self.outputJson = outputJson
    self.inputTranscript = inputTranscript
    self.saveTranscript = saveTranscript

    # determine input file name, or download and save file
    if (iFileSpec is not None) and os.path.isfile(iFileSpec):
        self.inputFileSpec = iFileSpec
    elif isinstance(iFileSpec, str) and iFileSpec.lower().startswith("http"):
        self.tmpDownloadedFileSpec = DownloadToFile(iFileSpec)
        if (self.tmpDownloadedFileSpec is not None) and os.path.isfile(self.tmpDownloadedFileSpec):
            self.inputFileSpec = self.tmpDownloadedFileSpec
        else:
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), iFileSpec)
    else:
        # also reached for iFileSpec=None, which must not crash on .lower()
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), iFileSpec)

    # input file should exist locally by now
    if os.path.isfile(self.inputFileSpec):
        self.inputFileParts = os.path.splitext(self.inputFileSpec)
        self.inputCodecs = GetCodecs(self.inputFileSpec)
        # 'format' may be missing or None when ffprobe reports no container format
        inputFormat = next(
            iter([x for x in (self.inputCodecs.get('format') or []) if x in AUDIO_DEFAULT_PARAMS_BY_FORMAT]), None
        )
    else:
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), self.inputFileSpec)

    # determine output file name (either specified or based on input filename)
    self.outputFileSpec = oFileSpec if oFileSpec else self.inputFileParts[0] + "_clean"
    if self.outputFileSpec:
        outParts = os.path.splitext(self.outputFileSpec)
        if (
            ((not oAudioFileFormat) or (str(oAudioFileFormat).upper() == AUDIO_MATCH_FORMAT))
            and oFileSpec
            and (len(outParts) > 1)
            and outParts[1]
        ):
            oAudioFileFormat = outParts[1]

    if str(oAudioFileFormat).upper() == AUDIO_MATCH_FORMAT:
        # output format not specified, base on input filename matching extension (or codec)
        # splitext() keeps the leading dot, while the format table is keyed without it
        if self.inputFileParts[1].lower().lstrip('.') in AUDIO_DEFAULT_PARAMS_BY_FORMAT:
            self.outputFileSpec = self.outputFileSpec + self.inputFileParts[1]
        elif str(inputFormat).lower() in AUDIO_DEFAULT_PARAMS_BY_FORMAT:
            self.outputFileSpec = self.outputFileSpec + '.' + inputFormat.lower()
        else:
            for codec in mmguero.get_iterable(self.inputCodecs.get('audio', [])):
                if codec.lower() in AUDIO_CODEC_TO_FORMAT:
                    self.outputFileSpec = self.outputFileSpec + '.' + AUDIO_CODEC_TO_FORMAT[codec.lower()]
                    break

    elif oAudioFileFormat:
        # output filename not specified, base on input filename with specified format
        newSuffix = '.' + oAudioFileFormat.lower().lstrip('.')
        self.outputFileSpec = mmguero.remove_suffix(self.outputFileSpec, newSuffix) + newSuffix

    else:
        # can't determine what output file audio format should be
        raise ValueError("Output file audio format unspecified")

    # determine output file extension if it's not already obvious
    outParts = os.path.splitext(self.outputFileSpec)
    self.outputAudioFileFormat = outParts[1].lower().lstrip('.')

    if (not self.outputAudioFileFormat) or (
        (not aParams) and (self.outputAudioFileFormat not in AUDIO_DEFAULT_PARAMS_BY_FORMAT)
    ):
        raise ValueError("Output file audio format unspecified or unsupported")
    elif not aParams:
        # we're using ffmpeg encoding params based on output file format
        self.aParams = AUDIO_DEFAULT_PARAMS_BY_FORMAT[self.outputAudioFileFormat]
    else:
        # they specified custom ffmpeg encoding params
        self.aParams = aParams
        if self.aParams.startswith("base64:"):
            self.aParams = base64.b64decode(self.aParams[7:]).decode("utf-8")
        self.aParams = self.aParams.split(' ')

    # substitute the placeholder tokens; this builds a new list, so the shared
    # default parameter templates are never modified
    replacements = {
        CHANNELS_REPLACER: str(aChannels),
        SAMPLE_RATE_REPLACER: str(aSampleRate),
        BIT_RATE_REPLACER: str(aBitRate),
        VORBIS_QSCALE_REPLACER: str(aVorbisQscale),
    }
    self.aParams = [replacements.get(aParam, aParam) for aParam in self.aParams]

    # if we're actually just replacing the audio stream(s) inside a video file, the actual output file is still a video file
    self.outputVideoFileFormat = (
        self.inputFileParts[1]
        if (
            (len(mmguero.get_iterable(self.inputCodecs.get('video', []))) > 0)
            and (str(oAudioFileFormat).upper() == AUDIO_MATCH_FORMAT)
        )
        else ''
    )
    if self.outputVideoFileFormat:
        self.outputFileSpec = outParts[0] + self.outputVideoFileFormat

    # create output directory if it doesn't exist
    self._ensure_directory_exists(self.outputFileSpec, "output directory")

    # if output file already exists, remove as we'll be overwriting it anyway
    if os.path.isfile(self.outputFileSpec):
        if self.debug:
            mmguero.eprint(f'Removing existing destination file {self.outputFileSpec}')
        os.remove(self.outputFileSpec)

    # If save-transcript is enabled and no explicit JSON output path, auto-generate one
    if self.saveTranscript and not self.outputJson:
        outputBaseName = os.path.splitext(self.outputFileSpec)[0]
        self.outputJson = outputBaseName + '_transcript.json'
        if self.debug:
            mmguero.eprint(f'Auto-generated transcript output: {self.outputJson}')

    # Auto-detect existing transcript for reuse (unless force flag set or explicit input provided)
    if self.saveTranscript and not self.inputTranscript and self.outputJson and not forceRetranscribe:
        if os.path.exists(self.outputJson):
            self.inputTranscript = self.outputJson
            if self.debug:
                mmguero.eprint(f'Found existing transcript, reusing: {self.inputTranscript}')

    # If JSON output is specified, ensure its directory exists too
    if self.outputJson:
        self._ensure_directory_exists(self.outputJson, "JSON output directory")

    # load the swears file (not actually mapping right now, but who knows, speech synthesis maybe someday?)
    self.swearsFileSpec = iSwearsFileSpec if (iSwearsFileSpec is not None) and os.path.isfile(iSwearsFileSpec) else None

    self._load_swears_file()

    # validate instrumental file if provided
    if instrumentalFileSpec:
        if not os.path.isfile(instrumentalFileSpec):
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), instrumentalFileSpec)

        # Check duration of instrumental vs original
        # Need to get duration directly from ffprobe since GetCodecs doesn't extract it
        instrumentalDuration = self._get_file_duration(instrumentalFileSpec)
        originalDuration = self._get_file_duration(self.inputFileSpec)

        if instrumentalDuration > 0 and originalDuration > 0:
            if instrumentalDuration < originalDuration:
                raise ValueError(
                    f"Instrumental file duration ({instrumentalDuration}s) is shorter than "
                    f"original file duration ({originalDuration}s)"
                )
        elif self.debug:
            mmguero.eprint('Warning: Could not verify file durations')

        self.instrumentalFileSpec = instrumentalFileSpec
        self.instrumentalMode = True
    else:
        self.instrumentalMode = False

    if self.debug:
        mmguero.eprint(f'Input: {self.inputFileSpec}')
        mmguero.eprint(f'Input codec: {self.inputCodecs}')
        mmguero.eprint(f'Output: {self.outputFileSpec}')
        mmguero.eprint(f'Output audio format: {self.outputAudioFileFormat}')
        mmguero.eprint(f'Encode parameters: {self.aParams}')
        mmguero.eprint(f'Profanity file: {self.swearsFileSpec if self.swearsFileSpec else "built-in"}')
        mmguero.eprint(f'Intermediate downloaded file: {self.tmpDownloadedFileSpec}')
        if self.outputJson:
            mmguero.eprint(f'Transcript output: {self.outputJson}')
        if self.inputTranscript:
            mmguero.eprint(f'Input transcript: {self.inputTranscript}')
        mmguero.eprint(f'Beep instead of mute: {self.beep}')
        if self.beep:
            mmguero.eprint(f'Beep hertz: {self.beepHertz}')
            mmguero.eprint(f'Beep mix normalization: {self.beepMixNormalize}')
            mmguero.eprint(f'Beep audio weight: {self.beepAudioWeight}')
            mmguero.eprint(f'Beep sine weight: {self.beepSineWeight}')
            mmguero.eprint(f'Beep dropout transition: {self.beepDropTransition}')
        mmguero.eprint(f'Force despite tags: {self.forceDespiteTag}')
        mmguero.eprint(f'Instrumental mode: {self.instrumentalMode}')
        if self.instrumentalMode:
            mmguero.eprint(f'Instrumental file: {self.instrumentalFileSpec}')
|
|
474
|
+
|
|
475
|
+
######## del ##################################################################
|
|
476
|
+
def __del__(self):
    """Best-effort cleanup of temporary files when the object is finalized.

    Removes the downloaded input file (if any) and the separation cache
    directory (if one was set). Errors are never raised from here: an
    exception escaping a finalizer cannot be handled by the caller.
    """
    # if we downloaded the input file, remove it as well
    # (the attribute is None when the download itself failed, and
    # os.path.isfile(None) would raise TypeError)
    downloaded = getattr(self, 'tmpDownloadedFileSpec', None)
    if downloaded and os.path.isfile(downloaded):
        try:
            os.remove(downloaded)
        except OSError as e:
            if self.debug:
                mmguero.eprint(f'Warning: Failed to remove downloaded file {downloaded}: {e}')

    # Clean up temporary separation files
    cacheDir = getattr(self, 'separationCacheDir', None)
    if cacheDir:
        import shutil

        try:
            if os.path.exists(cacheDir):
                shutil.rmtree(cacheDir)
                if self.debug:
                    mmguero.eprint(f'Cleaned up separation cache: {cacheDir}')
        except Exception as e:
            if self.debug:
                mmguero.eprint(f'Warning: Failed to cleanup separation cache: {e}')
|
|
492
|
+
|
|
493
|
+
######## _ensure_directory_exists #############################################
|
|
494
|
+
def _ensure_directory_exists(self, filepath, description="directory"):
    """Create the parent directory of ``filepath`` when it is missing.

    Args:
        filepath: Path of a file whose containing directory is needed.
        description: Label used in the debug message.

    Returns:
        The directory portion of ``filepath`` (an empty string when the path
        has no directory component).
    """
    directory = os.path.dirname(filepath)

    # nothing to do for a bare file name or a directory that is already there
    if not directory or os.path.exists(directory):
        return directory

    if self.debug:
        mmguero.eprint(f'Creating {description}: {directory}')
    os.makedirs(directory, exist_ok=True)
    return directory
|
|
502
|
+
|
|
503
|
+
######## _get_file_duration ###################################################
|
|
504
|
+
def _get_file_duration(self, filepath):
    """Return the duration ffprobe reports for ``filepath`` as a float.

    Returns 0.0 when ffprobe exits with a non-zero status or when anything
    goes wrong while running it or parsing its output (the error is printed
    only in debug mode).
    """
    try:
        ffprobeCmd = ['ffprobe', '-v', 'quiet', '-print_format', 'json']
        ffprobeCmd += ['-show_entries', 'format=duration', filepath]
        exitCode, outputLines = mmguero.run_process(ffprobeCmd, stdout=True, stderr=False, debug=False)
        if exitCode != 0:
            return 0.0

        probeInfo = mmguero.load_str_if_json(' '.join(outputLines))
        # a missing duration falls back to the string '0'
        return float(mmguero.deep_get(probeInfo, ['format', 'duration'], '0'))
    except Exception as e:
        if self.debug:
            mmguero.eprint(f'Error getting duration for {filepath}: {e}')
        return 0.0
|
|
528
|
+
|
|
529
|
+
######## LoadTranscriptFromFile ##############################################
|
|
530
|
+
def LoadTranscriptFromFile(self):
    """Load a pre-generated transcript from the JSON file in inputTranscript.

    The loaded word list replaces ``self.wordList`` and every entry's
    'scrub' flag is recomputed against the current swears map.

    Returns:
        False when no input transcript is configured, True after loading.

    Raises:
        IOError: If the configured transcript file does not exist.
    """
    transcriptPath = self.inputTranscript
    if not transcriptPath:
        return False

    if not os.path.isfile(transcriptPath):
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), transcriptPath)

    if self.debug:
        mmguero.eprint(f'Loading transcript from: {self.inputTranscript}')

    with open(transcriptPath, 'r') as fh:
        self.wordList = json.load(fh)

    # Recalculate scrub flags with current swears list
    for entry in self.wordList:
        entry['scrub'] = scrubword(entry.get('word', '')) in self.swearsMap

    if self.debug:
        mmguero.eprint(f'Loaded {len(self.wordList)} words from transcript')
        scrubbed_count = len([entry for entry in self.wordList if entry.get('scrub', False)])
        mmguero.eprint(f'Words to censor with current swear list: {scrubbed_count}')

    return True
|
|
554
|
+
|
|
555
|
+
######## _load_swears_file ####################################################
|
|
556
|
+
def _load_swears_file(self):
    """Populate the swears map from the built-in list, then the custom file.

    The custom file (``self.swearsFileSpec``) is treated as JSON when its
    name ends in '.json' or when its whole content parses as JSON; otherwise
    it is read as the legacy pipe-delimited text format.
    """
    # Load built-in profanity list first
    self._load_builtin_swears()

    if not self.swearsFileSpec:
        if self.debug:
            mmguero.eprint(f'Loaded {len(self.swearsMap)} profanity entries from built-in list')
        return

    # Decide the format: trust a .json extension, otherwise try parsing the content
    treatAsJson = self.swearsFileSpec.lower().endswith('.json')
    if not treatAsJson:
        try:
            with open(self.swearsFileSpec, 'r') as fh:
                json.loads(fh.read())
        except (json.JSONDecodeError, ValueError):
            pass
        else:
            treatAsJson = True

    loadCustom = self._load_swears_from_json if treatAsJson else self._load_swears_from_text
    loadCustom()

    if self.debug:
        mmguero.eprint(f'Loaded {len(self.swearsMap)} profanity entries (built-in + custom from {self.swearsFileSpec})')
|
|
587
|
+
|
|
588
|
+
def _load_builtin_swears(self):
    """Load the packaged profanity list into the swears map.

    Three lookups are tried in order until one yields data:
    importlib.resources, pkg_resources, and a path relative to this module.
    If none works the method returns without raising (a custom swears file
    might still be provided); the collected errors are printed only in
    debug mode.
    """
    data = None
    failures = []

    # Method 1: Try importlib.resources.files (Python 3.9+)
    try:
        import importlib.resources as resources

        packagedList = resources.files('monkeyplug.data').joinpath('profanity_list.json')
        with packagedList.open('r') as fh:
            data = json.load(fh)
        if self.debug:
            mmguero.eprint('Loaded profanity list using importlib.resources.files')
    except Exception as e:
        failures.append(f"importlib.resources.files failed: {e}")

    # Method 2: Fallback for older Python versions using pkg_resources
    if data is None:
        try:
            import pkg_resources

            rawBytes = pkg_resources.resource_string('monkeyplug', '/'.join(('data', 'profanity_list.json')))
            data = json.loads(rawBytes.decode('UTF-8'))
            if self.debug:
                mmguero.eprint('Loaded profanity list using pkg_resources')
        except Exception as e:
            failures.append(f"pkg_resources failed: {e}")

    # Method 3: Last resort - try to find the file relative to this module
    if data is None:
        try:
            data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'profanity_list.json')
            if not os.path.exists(data_file):
                failures.append(f"File not found at {data_file}")
            else:
                with open(data_file, 'r') as fh:
                    data = json.load(fh)
                if self.debug:
                    mmguero.eprint(f'Loaded profanity list from file path: {data_file}')
        except Exception as e:
            failures.append(f"File path fallback failed: {e}")

    # If all methods failed, warn but continue (custom swears file might be provided)
    if data is None:
        if self.debug:
            mmguero.eprint('Could not load built-in profanity list:')
            for msg in failures:
                mmguero.eprint(f' {msg}')
        return

    if not isinstance(data, list):
        if self.debug:
            mmguero.eprint('Built-in profanity list has unexpected format')
        return

    # only non-blank strings become entries
    for item in data:
        if isinstance(item, str) and item.strip():
            self.swearsMap[scrubword(item)] = "*****"
|
|
644
|
+
|
|
645
|
+
def _load_swears_from_json(self):
    """Load swears from a JSON file holding a simple array of strings.

    Format: ["word1", "word2", "word3", ...]
    Example: https://github.com/zautumnz/profane-words/blob/master/words.json

    Entries that are not strings, or are blank, are skipped.

    Raises:
        ValueError: If the top-level JSON value is not an array.
    """
    with open(self.swearsFileSpec, 'r') as fh:
        data = json.load(fh)

    if not isinstance(data, list):
        raise ValueError(f"JSON swears file must contain an array of strings, got {type(data).__name__}")

    usableWords = (entry for entry in data if isinstance(entry, str) and entry.strip())
    for entry in usableWords:
        self.swearsMap[scrubword(entry)] = "*****"
|
|
660
|
+
|
|
661
|
+
def _load_swears_from_text(self):
    """Load swears from pipe-delimited text format (legacy).

    Each line is ``word`` or ``word|replacement``; a missing replacement
    defaults to "*****". Blank lines, and lines whose word scrubs down to
    nothing (e.g. only punctuation), are skipped. Otherwise they would
    register an empty-string key, and every transcript token that scrubs to
    an empty string would then be treated as profanity.
    """
    with open(self.swearsFileSpec) as f:
        for rawLine in f:
            line = rawLine.rstrip("\n")
            if not line.strip():
                continue
            lineMap = line.split("|")
            swear = scrubword(lineMap[0])
            if not swear:
                continue
            self.swearsMap[swear] = lineMap[1] if len(lineMap) > 1 else "*****"
|
|
669
|
+
|
|
670
|
+
######## CreateCleanMuteList #################################################
|
|
671
|
+
def CreateCleanMuteList(self):
    """Collect the words to censor and prepare the data used to censor them.

    A saved transcript is used when ``LoadTranscriptFromFile`` succeeds;
    otherwise ``RecognizeSpeech`` is run. ``self.naughtyWordList`` is then
    rebuilt from the entries of ``self.wordList`` whose "scrub" flag is True.

    Returns:
        The list produced by ``_create_mute_beep_list`` in mute/beep mode, or
        an empty list when instrumental splicing is used.
    """
    # Prefer an existing transcript; only transcribe when none could be loaded
    transcript_loaded = self.LoadTranscriptFromFile()
    if not transcript_loaded:
        self.RecognizeSpeech()

    self.naughtyWordList = [entry for entry in self.wordList if entry["scrub"] is True]

    # Auto-generation mode: build an instrumental from the profanity segments
    auto_generate = hasattr(self, 'autoGenerateMode') and self.autoGenerateMode
    if auto_generate and self.naughtyWordList:
        # Merge the profanity into padded segments
        self._create_instrumental_splice_list()

        if not self.instrumentalSegments:
            return []

        try:
            # Extract, separate, and obtain the instrumental file
            self.instrumentalFileSpec = self._create_combined_profanity_file()
            if self.instrumentalFileSpec:
                self.instrumentalMode = True
                self._build_instrumental_filters()
                # muteTimeList is not used in instrumental mode
                return []
        except Exception as e:
            # Generation failed: fall back to plain mute/beep
            if self.debug:
                mmguero.eprint(f"Generation failed: {e}, falling back to mute mode")
            self.instrumentalMode = False
            return self._create_mute_beep_list()

    # User-supplied instrumental file, or traditional mute/beep
    if self.instrumentalMode:
        return self._create_instrumental_splice_list()
    return self._create_mute_beep_list()
|
|
705
|
+
|
|
706
|
+
def _create_instrumental_splice_list(self):
|
|
707
|
+
"""Create list of profanity segments for instrumental splicing"""
|
|
708
|
+
if len(self.naughtyWordList) == 0:
|
|
709
|
+
self.instrumentalSegments = []
|
|
710
|
+
return []
|
|
711
|
+
|
|
712
|
+
# Sort by start time
|
|
713
|
+
sorted_naughty = sorted(self.naughtyWordList, key=lambda x: x['start'])
|
|
714
|
+
|
|
715
|
+
# Merge consecutive profanity segments (gap < 100ms)
|
|
716
|
+
merged_segments = []
|
|
717
|
+
if sorted_naughty:
|
|
718
|
+
current_start = max(0, sorted_naughty[0]['start'] - self.padSecPre)
|
|
719
|
+
current_end = sorted_naughty[0]['end'] + self.padSecPost
|
|
720
|
+
|
|
721
|
+
for word in sorted_naughty[1:]:
|
|
722
|
+
word_start = max(0, word['start'] - self.padSecPre)
|
|
723
|
+
word_end = word['end'] + self.padSecPost
|
|
724
|
+
|
|
725
|
+
# If gap between segments is less than 100ms, merge them
|
|
726
|
+
if word_start - current_end < 0.1:
|
|
727
|
+
current_end = max(current_end, word_end)
|
|
728
|
+
else:
|
|
729
|
+
merged_segments.append((current_start, current_end))
|
|
730
|
+
current_start = word_start
|
|
731
|
+
current_end = word_end
|
|
732
|
+
|
|
733
|
+
# Add the last segment
|
|
734
|
+
merged_segments.append((current_start, current_end))
|
|
735
|
+
|
|
736
|
+
self.instrumentalSegments = merged_segments
|
|
737
|
+
|
|
738
|
+
if self.debug:
|
|
739
|
+
mmguero.eprint(f'Instrumental segments: {self.instrumentalSegments}')
|
|
740
|
+
|
|
741
|
+
# Return empty list for muteTimeList (not used in instrumental mode)
|
|
742
|
+
return []
|
|
743
|
+
|
|
744
|
+
def _create_mute_beep_list(self):
|
|
745
|
+
"""Create traditional mute or beep filter list"""
|
|
746
|
+
if len(self.naughtyWordList) > 0:
|
|
747
|
+
# append a dummy word at the very end so that pairwise can peek then ignore it
|
|
748
|
+
self.naughtyWordList.extend(
|
|
749
|
+
[
|
|
750
|
+
{
|
|
751
|
+
"conf": 1,
|
|
752
|
+
"end": self.naughtyWordList[-1]["end"] + 2.0,
|
|
753
|
+
"start": self.naughtyWordList[-1]["end"] + 1.0,
|
|
754
|
+
"word": "mothaflippin",
|
|
755
|
+
"scrub": True,
|
|
756
|
+
}
|
|
757
|
+
]
|
|
758
|
+
)
|
|
759
|
+
if self.debug:
|
|
760
|
+
mmguero.eprint(self.naughtyWordList)
|
|
761
|
+
|
|
762
|
+
self.muteTimeList = []
|
|
763
|
+
self.sineTimeList = []
|
|
764
|
+
self.beepDelayList = []
|
|
765
|
+
for word, wordPeek in pairwise(self.naughtyWordList):
|
|
766
|
+
wordStart = format(word["start"] - self.padSecPre, ".3f")
|
|
767
|
+
wordEnd = format(word["end"] + self.padSecPost, ".3f")
|
|
768
|
+
wordDuration = format(float(wordEnd) - float(wordStart), ".3f")
|
|
769
|
+
wordPeekStart = format(wordPeek["start"] - self.padSecPre, ".3f")
|
|
770
|
+
if self.beep:
|
|
771
|
+
self.muteTimeList.append(f"volume=enable='between(t,{wordStart},{wordEnd})':volume=0")
|
|
772
|
+
self.sineTimeList.append(f"sine=f={self.beepHertz}:duration={wordDuration}")
|
|
773
|
+
self.beepDelayList.append(
|
|
774
|
+
f"atrim=0:{wordDuration},adelay={'|'.join([str(int(float(wordStart) * 1000))] * 2)}"
|
|
775
|
+
)
|
|
776
|
+
else:
|
|
777
|
+
self.muteTimeList.append(
|
|
778
|
+
"afade=enable='between(t," + wordStart + "," + wordEnd + ")':t=out:st=" + wordStart + ":d=5ms"
|
|
779
|
+
)
|
|
780
|
+
self.muteTimeList.append(
|
|
781
|
+
"afade=enable='between(t," + wordEnd + "," + wordPeekStart + ")':t=in:st=" + wordEnd + ":d=5ms"
|
|
782
|
+
)
|
|
783
|
+
|
|
784
|
+
if self.debug:
|
|
785
|
+
mmguero.eprint(self.muteTimeList)
|
|
786
|
+
if self.beep:
|
|
787
|
+
mmguero.eprint(self.sineTimeList)
|
|
788
|
+
mmguero.eprint(self.beepDelayList)
|
|
789
|
+
|
|
790
|
+
return self.muteTimeList
|
|
791
|
+
|
|
792
|
+
def _build_instrumental_filters(self):
|
|
793
|
+
"""Build FFmpeg filter complex for instrumental splicing
|
|
794
|
+
|
|
795
|
+
Supports both:
|
|
796
|
+
- Traditional instrumental file (instrumentalFileSpec provided by user)
|
|
797
|
+
- Auto-generated combined file (autoGenerateMode with segMapping)
|
|
798
|
+
"""
|
|
799
|
+
if not self.instrumentalSegments:
|
|
800
|
+
return []
|
|
801
|
+
|
|
802
|
+
duration = self._get_file_duration(self.inputFileSpec)
|
|
803
|
+
filter_parts = []
|
|
804
|
+
seg_index = 0
|
|
805
|
+
last_end = 0.0
|
|
806
|
+
|
|
807
|
+
if hasattr(self, 'autoGenerateMode') and self.autoGenerateMode and hasattr(self, 'segMapping') and self.segMapping:
|
|
808
|
+
# AUTO-SEPARATION MODE: Use segMapping to translate timestamps
|
|
809
|
+
for idx, (orig_start, orig_end) in enumerate(self.instrumentalSegments):
|
|
810
|
+
# Get the mapping for this segment
|
|
811
|
+
if idx < len(self.segMapping):
|
|
812
|
+
profanity_start, profanity_end, combined_start, combined_end, padded_start, padded_end = self.segMapping[idx]
|
|
813
|
+
else:
|
|
814
|
+
# Fallback: shouldn't happen
|
|
815
|
+
if orig_start > last_end:
|
|
816
|
+
filter_parts.append(f"[0:a]atrim={last_end:.2f}:{orig_start:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]")
|
|
817
|
+
seg_index += 1
|
|
818
|
+
filter_parts.append(f"[0:a]atrim={orig_start:.2f}:{orig_end:.2f},volume=0[seg{seg_index}]")
|
|
819
|
+
seg_index += 1
|
|
820
|
+
last_end = orig_end
|
|
821
|
+
continue
|
|
822
|
+
|
|
823
|
+
# Original audio before profanity
|
|
824
|
+
if orig_start > last_end:
|
|
825
|
+
filter_parts.append(f"[0:a]atrim={last_end:.2f}:{orig_start:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]")
|
|
826
|
+
seg_index += 1
|
|
827
|
+
|
|
828
|
+
# Extract the profanity portion from the combined instrumental file
|
|
829
|
+
# Calculate the position in the combined file where profanity starts
|
|
830
|
+
# combined_start = where this padded segment is in combined file
|
|
831
|
+
# (profanity_start - padded_start) = offset of profanity within the padded segment
|
|
832
|
+
position_in_combined = combined_start + (profanity_start - padded_start)
|
|
833
|
+
profanity_duration = profanity_end - profanity_start
|
|
834
|
+
|
|
835
|
+
filter_parts.append(
|
|
836
|
+
f"[1:a]atrim={position_in_combined:.2f}:{position_in_combined + profanity_duration:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]"
|
|
837
|
+
)
|
|
838
|
+
seg_index += 1
|
|
839
|
+
|
|
840
|
+
last_end = orig_end
|
|
841
|
+
|
|
842
|
+
# Final original audio segment
|
|
843
|
+
if last_end < duration:
|
|
844
|
+
filter_parts.append(f"[0:a]atrim={last_end:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]")
|
|
845
|
+
seg_index += 1
|
|
846
|
+
|
|
847
|
+
# Concatenate all segments
|
|
848
|
+
concat_input = ''.join([f'[seg{i}]' for i in range(seg_index)])
|
|
849
|
+
filter_parts.append(f"{concat_input}concat=n={seg_index}:v=0:a=1[outa]")
|
|
850
|
+
|
|
851
|
+
else:
|
|
852
|
+
# TRADITIONAL MODE: Use provided instrumental file
|
|
853
|
+
# Original logic works fine here
|
|
854
|
+
filter_parts.append("[0:a]asplit=2[orig][inst]")
|
|
855
|
+
|
|
856
|
+
seg_index = 0
|
|
857
|
+
last_end = 0.0
|
|
858
|
+
|
|
859
|
+
for start, end in self.instrumentalSegments:
|
|
860
|
+
# Add original audio segment before profanity
|
|
861
|
+
if start > last_end:
|
|
862
|
+
filter_parts.append(
|
|
863
|
+
f"[orig]atrim={last_end:.2f}:{start:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]"
|
|
864
|
+
)
|
|
865
|
+
seg_index += 1
|
|
866
|
+
|
|
867
|
+
# Add instrumental audio segment for profanity
|
|
868
|
+
filter_parts.append(
|
|
869
|
+
f"[inst]atrim={start:.2f}:{end:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]"
|
|
870
|
+
)
|
|
871
|
+
seg_index += 1
|
|
872
|
+
|
|
873
|
+
last_end = end
|
|
874
|
+
|
|
875
|
+
# Add final original audio segment after last profanity
|
|
876
|
+
filter_parts.append(
|
|
877
|
+
f"[orig]atrim={last_end:.2f},asetpts=PTS-STARTPTS[seg{seg_index}]"
|
|
878
|
+
)
|
|
879
|
+
seg_index += 1
|
|
880
|
+
|
|
881
|
+
# Concatenate all segments
|
|
882
|
+
concat_input = ''.join([f'[seg{i}]' for i in range(seg_index)])
|
|
883
|
+
filter_parts.append(
|
|
884
|
+
f"{concat_input}concat=n={seg_index}:v=0:a=1[outa]"
|
|
885
|
+
)
|
|
886
|
+
|
|
887
|
+
filter_complex = ';'.join(filter_parts)
|
|
888
|
+
|
|
889
|
+
if self.debug:
|
|
890
|
+
if hasattr(self, 'verbose_level') and self.verbose_level == "full":
|
|
891
|
+
mmguero.eprint(f'Filter complex: {filter_complex}')
|
|
892
|
+
else:
|
|
893
|
+
# Concise mode: just show segment count
|
|
894
|
+
mode = "auto-separation" if (hasattr(self, 'autoGenerateMode') and self.autoGenerateMode) else "traditional"
|
|
895
|
+
mmguero.eprint(f'Building FFmpeg filter with {len(self.instrumentalSegments)} instrumental segment(s) ({mode} mode)')
|
|
896
|
+
|
|
897
|
+
return ['-filter_complex', filter_complex, '-map', '[outa]']
|
|
898
|
+
|
|
899
|
+
######## EncodeCleanAudio ####################################################
|
|
900
|
+
def EncodeCleanAudio(self):
    """Write the censored output file and return its path.

    When the input is already tagged by monkeyplug (and ``forceDespiteTag``
    is not set) the input is copied to the output unchanged. Otherwise
    ``CreateCleanMuteList`` is run, an FFmpeg command is assembled for the
    active mode (instrumental splice, beep, or mute) and executed, and the
    output is tagged with ``SetMonkeyplugTag``.

    Changes over the previous version:
    - The instrumental file is only added as a second input when a filter
      graph was actually built; otherwise FFmpeg's automatic stream selection
      could pick a stream from the instrumental file.
    - When writing a video file in instrumental mode, the video stream of the
      input is mapped explicitly, because the ``-map [outa]`` produced by the
      instrumental filter disables automatic stream selection.

    Returns:
        ``self.outputFileSpec``.

    Raises:
        ValueError: if FFmpeg fails or the output file was not created.
    """
    if not ((self.forceDespiteTag is True) or (GetMonkeyplugTagged(self.inputFileSpec, debug=self.debug) is False)):
        # already processed: pass the file through untouched
        shutil.copyfile(self.inputFileSpec, self.outputFileSpec)
        return self.outputFileSpec

    self.CreateCleanMuteList()

    if self.instrumentalMode:
        # Use instrumental splicing
        audioArgs = self._build_instrumental_filters()
    elif len(self.muteTimeList) > 0:
        if self.beep:
            muteTimeListStr = ','.join(self.muteTimeList)
            sineTimeListStr = ';'.join([f'{val}[beep{i+1}]' for i, val in enumerate(self.sineTimeList)])
            beepDelayList = ';'.join(
                [f'[beep{i+1}]{val}[beep{i+1}_delayed]' for i, val in enumerate(self.beepDelayList)]
            )
            beepMixList = ''.join([f'[beep{i+1}_delayed]' for i in range(len(self.beepDelayList))])
            filterStr = f"[0:a]{muteTimeListStr}[mute];{sineTimeListStr};{beepDelayList};[mute]{beepMixList}amix=inputs={len(self.beepDelayList)+1}:normalize={str(self.beepMixNormalize).lower()}:dropout_transition={self.beepDropTransition}:weights={self.beepAudioWeight} {' '.join([str(self.beepSineWeight)] * len(self.beepDelayList))}"
            audioArgs = ['-filter_complex', filterStr]
        else:
            audioArgs = ['-af', ",".join(self.muteTimeList)]
    else:
        audioArgs = []

    # The second input is only referenced by the instrumental filter graph
    useInstrumentalInput = bool(self.instrumentalMode and audioArgs)

    ffmpegCmd = [
        'ffmpeg',
        '-nostdin',
        '-hide_banner',
        '-nostats',
        '-loglevel',
        'error',
        '-y',
        '-i',
        self.inputFileSpec,
    ]
    if useInstrumentalInput:
        ffmpegCmd.extend(['-i', self.instrumentalFileSpec])

    if self.outputVideoFileFormat:
        # replace existing audio stream in video file with -copy
        if useInstrumentalInput:
            # keep the video: the explicit -map [outa] would otherwise drop it
            ffmpegCmd.extend(['-map', '0:v?'])
        ffmpegCmd.extend(['-c:v', 'copy', '-sn', '-dn'])
    else:
        ffmpegCmd.extend(['-vn', '-sn', '-dn'])

    ffmpegCmd.extend(audioArgs)
    ffmpegCmd.extend(self.aParams)
    ffmpegCmd.append(self.outputFileSpec)

    ffmpegResult, ffmpegOutput = mmguero.run_process(ffmpegCmd, stdout=True, stderr=True, debug=self.debug)
    if (ffmpegResult != 0) or (not os.path.isfile(self.outputFileSpec)):
        mmguero.eprint(' '.join(mmguero.flatten(ffmpegCmd)))
        mmguero.eprint(ffmpegResult)
        mmguero.eprint(ffmpegOutput)
        raise ValueError(f"Could not process {self.inputFileSpec}")

    SetMonkeyplugTag(self.outputFileSpec, debug=self.debug)

    return self.outputFileSpec
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
#################################################################################
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
#################################################################################
|
|
994
|
+
class VoskPlugger(Plugger):
    """Plugger that obtains word timestamps with the VOSK speech recognizer.

    The input is first converted by FFmpeg to an intermediate WAV file
    (``tmpWavFileSpec``), which is then fed to a ``KaldiRecognizer`` in chunks
    of ``wavReadFramesChunk`` frames. The VOSK module and model directory are
    only required when no input transcript was supplied.
    """

    tmpWavFileSpec = ""
    modelPath = ""
    wavReadFramesChunk = AUDIO_DEFAULT_WAV_FRAMES_CHUNK
    vosk = None
    # True once this instance has asked FFmpeg to write tmpWavFileSpec; only
    # then is the file ours to delete in __del__
    _tmpWavCreated = False

    def __init__(
        self,
        iFileSpec,
        oFileSpec,
        oAudioFileFormat,
        iSwearsFileSpec,
        mDir,
        outputJson,
        inputTranscript=None,
        saveTranscript=False,
        forceRetranscribe=False,
        aParams=None,
        aChannels=AUDIO_DEFAULT_CHANNELS,
        aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
        aBitRate=AUDIO_DEFAULT_BIT_RATE,
        aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
        wChunk=AUDIO_DEFAULT_WAV_FRAMES_CHUNK,
        padMsecPre=0,
        padMsecPost=0,
        beep=False,
        beepHertz=BEEP_HERTZ_DEFAULT,
        beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
        beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
        beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
        beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
        force=False,
        dbug=False,
    ):
        """Validate the model directory, import VOSK, then initialise the base Plugger.

        Raises:
            IOError: if transcription is needed and ``mDir`` is not an existing directory.
            Exception: if the ``vosk`` module could not be imported.
        """
        self.wavReadFramesChunk = wChunk
        self.modelPath = None
        self.vosk = None
        self._tmpWavCreated = False

        # Only load model if we're actually going to transcribe
        if not inputTranscript:
            # make sure the VOSK model path exists
            if (mDir is not None) and os.path.isdir(mDir):
                self.modelPath = mDir
            else:
                raise IOError(
                    errno.ENOENT,
                    os.strerror(errno.ENOENT) + " (see https://alphacephei.com/vosk/models)",
                    mDir,
                )

            self.vosk = mmguero.dynamic_import("vosk", "vosk", debug=dbug)
            if not self.vosk:
                raise Exception("Unable to initialize VOSK API")
            if not dbug:
                self.vosk.SetLogLevel(-1)

        super().__init__(
            iFileSpec=iFileSpec,
            oFileSpec=oFileSpec,
            oAudioFileFormat=oAudioFileFormat,
            iSwearsFileSpec=iSwearsFileSpec,
            outputJson=outputJson,
            inputTranscript=inputTranscript,
            saveTranscript=saveTranscript,
            forceRetranscribe=forceRetranscribe,
            aParams=aParams,
            aChannels=aChannels,
            aSampleRate=aSampleRate,
            aBitRate=aBitRate,
            aVorbisQscale=aVorbisQscale,
            padMsecPre=padMsecPre,
            padMsecPost=padMsecPost,
            beep=beep,
            beepHertz=beepHertz,
            beepMixNormalize=beepMixNormalize,
            beepAudioWeight=beepAudioWeight,
            beepSineWeight=beepSineWeight,
            beepDropTransition=beepDropTransition,
            force=force,
            dbug=dbug,
        )

        self.tmpWavFileSpec = self.inputFileParts[0] + ".wav"
        # If the computed name is the input file itself, use a different name
        # so the input is neither overwritten by FFmpeg nor removed in __del__.
        try:
            collidesWithInput = os.path.samefile(self.tmpWavFileSpec, self.inputFileSpec)
        except (OSError, TypeError, ValueError):
            # one of the paths does not exist (or is not a usable path): no collision
            collidesWithInput = False
        if collidesWithInput:
            self.tmpWavFileSpec = self.inputFileParts[0] + ".monkeyplug.wav"

        if self.debug:
            if inputTranscript:
                mmguero.eprint(f'Using input transcript (skipping speech recognition)')
            else:
                mmguero.eprint(f'Model directory: {self.modelPath}')
                mmguero.eprint(f'Intermediate audio file: {self.tmpWavFileSpec}')
                mmguero.eprint(f'Read frames: {self.wavReadFramesChunk}')

    def __del__(self):
        super().__del__()
        # clean up intermediate WAV file used for speech recognition, but only
        # if this instance is the one that wrote it
        if self._tmpWavCreated and os.path.isfile(self.tmpWavFileSpec):
            os.remove(self.tmpWavFileSpec)

    def CreateIntermediateWAV(self):
        """Convert the input to the intermediate WAV file with FFmpeg.

        Returns:
            ``self.inputFileSpec`` (unchanged from the previous behaviour).

        Raises:
            ValueError: if FFmpeg fails or the WAV file was not created.
        """
        ffmpegCmd = [
            'ffmpeg',
            '-nostdin',
            '-hide_banner',
            '-nostats',
            '-loglevel',
            'error',
            '-y',
            '-i',
            self.inputFileSpec,
            '-vn',
            '-sn',
            '-dn',
            AUDIO_INTERMEDIATE_PARAMS,
            self.tmpWavFileSpec,
        ]
        # from here on the file at tmpWavFileSpec (even a partial one) is ours
        self._tmpWavCreated = True
        ffmpegResult, ffmpegOutput = mmguero.run_process(ffmpegCmd, stdout=True, stderr=True, debug=self.debug)
        if (ffmpegResult != 0) or (not os.path.isfile(self.tmpWavFileSpec)):
            mmguero.eprint(' '.join(mmguero.flatten(ffmpegCmd)))
            mmguero.eprint(ffmpegResult)
            mmguero.eprint(ffmpegOutput)
            raise ValueError(
                f"Could not convert {self.inputFileSpec} to {self.tmpWavFileSpec} (16 kHz, mono, s16 PCM WAV)"
            )

        return self.inputFileSpec

    def _extend_word_list(self, res):
        """Append the words of one recognizer result to ``self.wordList``, flagging swears."""
        if "result" in res:
            self.wordList.extend(
                [
                    dict(r, **{'scrub': scrubword(mmguero.deep_get(r, ["word"])) in self.swearsMap})
                    for r in res["result"]
                ]
            )

    def RecognizeSpeech(self):
        """Transcribe the input with VOSK and rebuild ``self.wordList``.

        Each recognized word dict gets a boolean "scrub" key telling whether
        its scrubbed form is in ``self.swearsMap``. The word list is also
        written to ``self.outputJson`` when that is set.

        Returns:
            ``self.wordList``.

        Raises:
            Exception: if the intermediate WAV is not 16 kHz, mono, s16 PCM.
        """
        self.CreateIntermediateWAV()
        self.wordList.clear()
        with wave.open(self.tmpWavFileSpec, "rb") as wf:
            if (
                (wf.getnchannels() != 1)
                or (wf.getframerate() != 16000)
                or (wf.getsampwidth() != 2)
                or (wf.getcomptype() != "NONE")
            ):
                raise Exception(f"Audio file ({self.tmpWavFileSpec}) must be 16 kHz, mono, s16 PCM WAV")

            rec = self.vosk.KaldiRecognizer(self.vosk.Model(self.modelPath), wf.getframerate())
            rec.SetWords(True)
            while True:
                data = wf.readframes(self.wavReadFramesChunk)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    self._extend_word_list(json.loads(rec.Result()))
            # collect whatever the recognizer still holds after the last chunk
            self._extend_word_list(json.loads(rec.FinalResult()))

        if self.debug:
            if hasattr(self, 'verbose_level') and self.verbose_level == "full":
                mmguero.eprint(json.dumps(self.wordList))
            else:
                # Concise mode: just show summary
                profanity_count = sum(1 for word in self.wordList if word.get('scrub', False))
                mmguero.eprint(f'Transcribed {len(self.wordList)} words, {profanity_count} profanity instances detected')

        if self.outputJson:
            with open(self.outputJson, "w") as f:
                f.write(json.dumps(self.wordList))

        return self.wordList
|
|
1169
|
+
|
|
1170
|
+
|
|
1171
|
+
#################################################################################
|
|
1172
|
+
|
|
1173
|
+
|
|
1174
|
+
#################################################################################
|
|
1175
|
+
class WhisperPlugger(Plugger):
    """Plugger that obtains word timestamps from a locally loaded Whisper model.

    The ``whisper`` module (and ``torch``, when a thread count is requested)
    is imported dynamically, and only when no input transcript was supplied.
    """

    debug = False
    model = None
    torch = None
    whisper = None
    transcript = None

    def __init__(
        self,
        iFileSpec,
        oFileSpec,
        oAudioFileFormat,
        iSwearsFileSpec,
        mDir,
        mName,
        torchThreads,
        outputJson,
        inputTranscript=None,
        saveTranscript=False,
        forceRetranscribe=False,
        aParams=None,
        aChannels=AUDIO_DEFAULT_CHANNELS,
        aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
        aBitRate=AUDIO_DEFAULT_BIT_RATE,
        aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
        padMsecPre=0,
        padMsecPost=0,
        beep=False,
        beepHertz=BEEP_HERTZ_DEFAULT,
        beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
        beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
        beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
        beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
        force=False,
        dbug=False,
    ):
        """Load the Whisper model when transcription is needed, then initialise the base Plugger.

        Raises:
            Exception: if the ``whisper`` module cannot be imported or the
                requested model cannot be loaded.
        """
        self.whisper = None
        self.model = None
        self.torch = None

        # With an input transcript there is nothing to transcribe, so the
        # (expensive) model load is skipped entirely
        needs_model = not inputTranscript
        if needs_model:
            self._load_whisper_model(mDir, mName, torchThreads, dbug)

        super().__init__(
            iFileSpec=iFileSpec,
            oFileSpec=oFileSpec,
            oAudioFileFormat=oAudioFileFormat,
            iSwearsFileSpec=iSwearsFileSpec,
            outputJson=outputJson,
            inputTranscript=inputTranscript,
            saveTranscript=saveTranscript,
            forceRetranscribe=forceRetranscribe,
            aParams=aParams,
            aChannels=aChannels,
            aSampleRate=aSampleRate,
            aBitRate=aBitRate,
            aVorbisQscale=aVorbisQscale,
            padMsecPre=padMsecPre,
            padMsecPost=padMsecPost,
            beep=beep,
            beepHertz=beepHertz,
            beepMixNormalize=beepMixNormalize,
            beepAudioWeight=beepAudioWeight,
            beepSineWeight=beepSineWeight,
            beepDropTransition=beepDropTransition,
            force=force,
            dbug=dbug,
        )

        if not self.debug:
            return
        if inputTranscript:
            mmguero.eprint(f'Using input transcript (skipping speech recognition)')
        else:
            mmguero.eprint(f'Model directory: {mDir}')
            mmguero.eprint(f'Model name: {mName}')

    def _load_whisper_model(self, mDir, mName, torchThreads, dbug):
        """Import torch (optionally) and whisper, then load model ``mName`` from ``mDir``."""
        if torchThreads > 0:
            self.torch = mmguero.dynamic_import("torch", "torch", debug=dbug)
            if self.torch:
                self.torch.set_num_threads(torchThreads)

        self.whisper = mmguero.dynamic_import("whisper", "openai-whisper", debug=dbug)
        if not self.whisper:
            raise Exception("Unable to initialize Whisper API")

        self.model = self.whisper.load_model(mName, download_root=mDir)
        if not self.model:
            raise Exception(f"Unable to load Whisper model {mName} in {mDir}")

    def __del__(self):
        super().__del__()

    def RecognizeSpeech(self):
        """Transcribe the input with Whisper and rebuild ``self.wordList``.

        Every word dict returned by the model has its text stripped and gets
        a boolean "scrub" key telling whether its scrubbed form is in
        ``self.swearsMap``. The word list is also written to
        ``self.outputJson`` when that is set.

        Returns:
            ``self.wordList``.
        """
        self.wordList.clear()

        self.transcript = self.model.transcribe(word_timestamps=True, audio=self.inputFileSpec)

        segments = []
        if self.transcript and ('segments' in self.transcript):
            segments = self.transcript['segments']

        for segment in segments:
            if 'words' not in segment:
                continue
            for entry in segment['words']:
                entry['word'] = entry['word'].strip()
                entry['scrub'] = scrubword(entry['word']) in self.swearsMap
                self.wordList.append(entry)

        if self.debug:
            show_everything = hasattr(self, 'verbose_level') and self.verbose_level == "full"
            if show_everything:
                mmguero.eprint(json.dumps(self.wordList))
            else:
                # Concise mode: word and profanity totals only
                profanity_count = len([w for w in self.wordList if w.get('scrub', False)])
                mmguero.eprint(f'Transcribed {len(self.wordList)} words, {profanity_count} profanity instances detected')

        if self.outputJson:
            serialized = json.dumps(self.wordList)
            with open(self.outputJson, "w") as f:
                f.write(serialized)

        return self.wordList
|
|
1291
|
+
|
|
1292
|
+
|
|
1293
|
+
#################################################################################
|
|
1294
|
+
class GroqPlugger(Plugger):
|
|
1295
|
+
GROQ_API_ENDPOINT = "https://api.groq.com/openai/v1/audio/transcriptions"
|
|
1296
|
+
debug = False
|
|
1297
|
+
api_key = None
|
|
1298
|
+
groq_model = "whisper-large-v3"
|
|
1299
|
+
transcript = None
|
|
1300
|
+
VOCAL_DETECTION_SAMPLE_DURATION = 10 # Seconds to sample for vocal detection
|
|
1301
|
+
# Filler words that indicate silence (including common hallucinations)
|
|
1302
|
+
VOCAL_DETECTION_FILLER_WORDS = {
|
|
1303
|
+
'thank', 'thanks', 'please', 'you', 'hey', 'yeah', 'oh', 'wow',
|
|
1304
|
+
'¶', '¶¶', # Common hallucinations/artifacts
|
|
1305
|
+
'', # Empty strings
|
|
1306
|
+
} # Filler words that indicate silence
|
|
1307
|
+
|
|
1308
|
+
def __init__(
    self,
    iFileSpec,
    oFileSpec,
    oAudioFileFormat,
    iSwearsFileSpec,
    groq_api_key,
    groq_model,
    outputJson,
    inputTranscript=None,
    saveTranscript=False,
    forceRetranscribe=False,
    aParams=None,
    aChannels=AUDIO_DEFAULT_CHANNELS,
    aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
    aBitRate=AUDIO_DEFAULT_BIT_RATE,
    aVorbisQscale=AUDIO_DEFAULT_VORBIS_QSCALE,
    padMsecPre=0,
    padMsecPost=0,
    beep=False,
    beepHertz=BEEP_HERTZ_DEFAULT,
    beepMixNormalize=BEEP_MIX_NORMALIZE_DEFAULT,
    beepAudioWeight=BEEP_AUDIO_WEIGHT_DEFAULT,
    beepSineWeight=BEEP_SINE_WEIGHT_DEFAULT,
    beepDropTransition=BEEP_DROPOUT_TRANSITION_DEFAULT,
    force=False,
    dbug=False,
    instrumentalFileSpec=None,
    verbose_level="",
    auto_generate=False,
    separation_padding=1.0,
):
    """Resolve the Groq API key, initialise the base Plugger and set up auto-separation.

    When ``auto_generate`` is true a ``SourceSeparator`` is created and a
    temporary cache directory is made with ``tempfile.mkdtemp``.

    Raises:
        ValueError: if ``load_groq_api_key`` does not return a key.
    """
    # groq_config may be importable relatively (package) or absolutely
    try:
        from .groq_config import load_groq_api_key
    except ImportError:
        from monkeyplug.groq_config import load_groq_api_key

    resolved_key = load_groq_api_key(groq_api_key, debug=dbug)
    self.api_key = resolved_key
    if not resolved_key:
        raise ValueError(
            "Groq API key not found. Please provide it via --groq-api-key parameter, "
            "GROQ_API_KEY environment variable, ~/.groq/config.json file, or ./.groq_key file"
        )

    self.groq_model = groq_model
    self.debug = dbug
    self.verbose_level = verbose_level

    super().__init__(
        iFileSpec=iFileSpec,
        oFileSpec=oFileSpec,
        oAudioFileFormat=oAudioFileFormat,
        iSwearsFileSpec=iSwearsFileSpec,
        outputJson=outputJson,
        inputTranscript=inputTranscript,
        saveTranscript=saveTranscript,
        forceRetranscribe=forceRetranscribe,
        aParams=aParams,
        aChannels=aChannels,
        aSampleRate=aSampleRate,
        aBitRate=aBitRate,
        aVorbisQscale=aVorbisQscale,
        padMsecPre=padMsecPre,
        padMsecPost=padMsecPost,
        beep=beep,
        beepHertz=beepHertz,
        beepMixNormalize=beepMixNormalize,
        beepAudioWeight=beepAudioWeight,
        beepSineWeight=beepSineWeight,
        beepDropTransition=beepDropTransition,
        force=force,
        dbug=dbug,
        instrumentalFileSpec=instrumentalFileSpec,
    )

    # Auto-separation state (the separator and cache stay unset unless enabled)
    self.autoGenerateMode = auto_generate
    self.separationPadding = separation_padding
    self.separationCacheDir = None
    self.segMapping = []  # Timestamp mapping for combined file
    self.separator = None

    if self.autoGenerateMode:
        try:
            from .separation import SourceSeparator
        except ImportError:
            from monkeyplug.separation import SourceSeparator

        import tempfile

        self.separator = SourceSeparator(debug=self.debug)
        self.separationCacheDir = tempfile.mkdtemp(prefix="monkeyplug_separation_")
        if self.debug:
            mmguero.eprint(f'Auto-separation mode enabled (padding: {self.separationPadding}s)')
            mmguero.eprint(f'Cache directory: {self.separationCacheDir}')

    if self.debug:
        status_line = (
            'Using input transcript (skipping speech recognition)'
            if inputTranscript
            else f'Using Groq API with model: {self.groq_model}'
        )
        mmguero.eprint(status_line)
|
|
1409
|
+
|
|
1410
|
+
def RecognizeSpeech(self):
    """Transcribe the input file through the Groq API and flag profanity.

    The audio at ``self.inputFileSpec`` is uploaded to
    ``self.GROQ_API_ENDPOINT`` with word-level timestamps requested. Up to
    three attempts are made; HTTP 429 responses, timeouts and other
    ``requests`` errors are retried after a delay that starts at one second
    and doubles each time.

    Returns:
        list: ``self.wordList`` — every word entry of the response with its
        ``word`` text stripped and a boolean ``scrub`` flag added.

    Raises:
        Exception: On HTTP 401, or when the last attempt is rate limited,
            times out, or fails with another ``requests`` error.
    """
    import requests
    import time

    self.wordList.clear()

    # Request pieces that stay the same across attempts
    headers = {"Authorization": f"Bearer {self.api_key}"}
    data = {
        "model": self.groq_model,
        "response_format": "verbose_json",
        "timestamp_granularities[]": "word",
    }

    max_retries = 3
    backoff = 1  # seconds; doubled after every retried failure

    for attempt in range(max_retries):
        out_of_retries = attempt >= max_retries - 1
        try:
            filename = os.path.basename(self.inputFileSpec)
            # The upload handle is reopened for each attempt and closed
            # again no matter how the attempt ends.
            with open(self.inputFileSpec, 'rb') as audio:
                files = {"file": (filename, audio, "audio/mpeg")}

                if self.debug:
                    mmguero.eprint(f"Sending request to Groq API (attempt {attempt + 1}/{max_retries})...")

                response = requests.post(
                    self.GROQ_API_ENDPOINT,
                    headers=headers,
                    files=files,
                    data=data,
                    timeout=120,  # 2 minute timeout
                )

                # Rate limited (HTTP 429): back off and retry, or give up
                if response.status_code == 429:
                    if out_of_retries:
                        raise Exception("Rate limit exceeded. Please try again later.")
                    if self.debug:
                        mmguero.eprint(f"Rate limit hit, retrying in {backoff} seconds...")
                    time.sleep(backoff)
                    backoff *= 2
                    continue

                # Bad credentials (HTTP 401) are never retried
                if response.status_code == 401:
                    raise Exception(
                        "Invalid Groq API key. Please check your API key configuration."
                    )

                # Any other HTTP error surfaces as a requests exception
                response.raise_for_status()

                self.transcript = response.json()

                if self.transcript and 'words' in self.transcript:
                    for entry in self.transcript['words']:
                        entry['word'] = entry['word'].strip()
                        entry['scrub'] = scrubword(entry['word']) in self.swearsMap
                        self.wordList.append(entry)

                if self.debug:
                    if hasattr(self, 'verbose_level') and self.verbose_level == "full":
                        mmguero.eprint(json.dumps(self.wordList))
                    else:
                        # Concise mode: only a one-line summary
                        profanity_count = len([w for w in self.wordList if w.get('scrub', False)])
                        mmguero.eprint(f'Transcribed {len(self.wordList)} words, {profanity_count} profanity instances detected')

                if self.outputJson:
                    with open(self.outputJson, "w") as f:
                        f.write(json.dumps(self.wordList))

                return self.wordList

        except requests.exceptions.Timeout:
            if out_of_retries:
                raise Exception("Request timed out. Please check your internet connection and try again.")
            if self.debug:
                mmguero.eprint(f"Request timed out, retrying (attempt {attempt + 1}/{max_retries})...")
            time.sleep(backoff)
            backoff *= 2

        except requests.exceptions.RequestException as e:
            if out_of_retries:
                raise Exception(f"Failed to connect to Groq API: {e}")
            if self.debug:
                mmguero.eprint(f"Request failed: {e}, retrying (attempt {attempt + 1}/{max_retries})...")
            time.sleep(backoff)
            backoff *= 2

    raise Exception("Failed to complete speech recognition after maximum retries")
|
|
1519
|
+
|
|
1520
|
+
def DetectVocals(self, filepath):
    """Decide whether a file contains vocals by transcribing a short sample.

    A clip of ``self.VOCAL_DETECTION_SAMPLE_DURATION`` seconds is cut from
    the middle of the file with ffmpeg and sent to the Groq API. The file is
    reported as instrumental only when the transcription contains no words,
    or when every word is one of ``self.VOCAL_DETECTION_FILLER_WORDS``.
    Files shorter than the sample length, and every failure along the way,
    are reported as having vocals.

    Args:
        filepath: Path to audio file to check

    Returns:
        bool: True if vocals detected, False if instrumental (no speech)
    """
    import requests
    import tempfile

    duration = self._get_file_duration(filepath)
    sample_seconds = self.VOCAL_DETECTION_SAMPLE_DURATION
    if duration < sample_seconds:
        # Too short to sample: treat as vocal
        return True

    # Centre the sample window within the file
    start_time = (duration - sample_seconds) / 2

    # Reserve a temp file name; ffmpeg overwrites the file (-y) below
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
        sample_path = tmp.name

    try:
        extract_cmd = [
            'ffmpeg', '-nostdin', '-hide_banner', '-nostats', '-loglevel', 'error',
            '-i', filepath,
            '-ss', str(start_time),
            '-t', str(sample_seconds),
            '-acodec', 'libmp3lame', '-b:a', '128K',
            '-y', sample_path
        ]
        exit_code, _ = mmguero.run_process(extract_cmd, stdout=False, stderr=False, debug=False)

        if exit_code != 0:
            # Sample extraction failed: assume vocal
            if self.debug:
                mmguero.eprint(f'Warning: Failed to extract sample from {os.path.basename(filepath)}, assuming vocals')
            return True

        with open(sample_path, 'rb') as sample:
            files = {"file": (os.path.basename(filepath), sample, "audio/mpeg")}
            data = {
                "model": self.groq_model,
                "response_format": "verbose_json",
                "timestamp_granularities[]": "word"
            }
            headers = {"Authorization": f"Bearer {self.api_key}"}

            response = requests.post(
                self.GROQ_API_ENDPOINT,
                headers=headers,
                files=files,
                data=data,
                timeout=30
            )

            if response.status_code != 200:
                # Non-200 answer: assume vocal
                if self.debug:
                    mmguero.eprint(f'Warning: API error during vocal detection, assuming vocals')
                return True

            words = response.json().get('words', [])

            if len(words) == 0:
                # Nothing transcribed at all
                if self.debug:
                    mmguero.eprint(f'Vocal detection: 0 words detected → instrumental')
                return False

            # Normalise each word: lowercase, punctuation/symbols trimmed from both ends
            trim_chars = '.,!?;:"\'()[]{}©®™¶§†‡•—–'
            detected_words = {entry['word'].lower().strip(trim_chars) for entry in words}
            all_words_text = ', '.join(entry['word'] for entry in words)

            # A transcript made up solely of filler words is treated as silence
            if detected_words.issubset(self.VOCAL_DETECTION_FILLER_WORDS):
                if self.debug:
                    mmguero.eprint(f'Vocal detection: Only filler words detected ({all_words_text}) → instrumental (silence)')
                return False

            if self.debug:
                mmguero.eprint(f'Vocal detection: {len(words)} words detected → vocals')
                mmguero.eprint(f' Words: {all_words_text}')

            return True

    except Exception as e:
        # Any failure: assume vocal
        if self.debug:
            mmguero.eprint(f'Warning: Exception during vocal detection: {e}, assuming vocals')
        return True

    finally:
        # The sample file is removed on every exit path
        if os.path.exists(sample_path):
            os.remove(sample_path)
|
|
1638
|
+
|
|
1639
|
+
def _extract_combined_segments(self, output_file):
    """
    Extract all profanity segments with padding and concatenate into one file

    Each ``(start, end)`` pair in ``self.instrumentalSegments`` is widened by
    ``self.separationPadding`` seconds on both sides (clamped to the input
    file), cut with an ffmpeg ``atrim`` filter and joined with ``concat``.
    One tuple per segment is appended to ``self.segMapping``:
    (original start, original end, combined-file start, combined-file end,
    padded start, padded end).

    Args:
        output_file: Path of the WAV file to write.

    Returns:
        float: Duration of the combined file as reported by
        ``self._get_file_duration``, or 0.0 when there are no segments.

    Raises:
        IOError: If ffmpeg exits with a non-zero status.
    """
    if not self.instrumentalSegments:
        return 0.0

    duration = self._get_file_duration(self.inputFileSpec)

    filter_parts = []
    combined_time = 0.0  # Current write position in the combined file

    for start, end in self.instrumentalSegments:
        padded_start = max(0, start - self.separationPadding)
        padded_end = min(duration, end + self.separationPadding)
        segment_duration = padded_end - padded_start

        # Microsecond precision keeps the audio that ffmpeg cuts in step with
        # the unrounded positions recorded in segMapping; coarser rounding
        # lets the two drift apart as segments accumulate.
        seg_index = len(filter_parts)
        filter_parts.append(
            f"[0:a]atrim={padded_start:.6f}:{padded_end:.6f},asetpts=PTS-STARTPTS[seg{seg_index}]"
        )

        self.segMapping.append((
            start,  # Original profanity start
            end,  # Original profanity end
            combined_time,  # Start position in combined file
            combined_time + segment_duration,  # End position in combined file
            padded_start,  # Padded segment start (for offset calculation)
            padded_end,  # Padded segment end (for offset calculation)
        ))

        combined_time += segment_duration

    # Join every trimmed segment, in order, into a single audio stream
    seg_count = len(filter_parts)
    concat_input = ''.join(f'[seg{i}]' for i in range(seg_count))
    filter_parts.append(f"{concat_input}concat=n={seg_count}:v=0:a=1[outa]")

    filter_complex = ';'.join(filter_parts)

    ffmpegCmd = [
        'ffmpeg', '-nostdin', '-hide_banner', '-nostats', '-loglevel', 'error',
        '-y',
        '-i', self.inputFileSpec,
        '-filter_complex', filter_complex,
        '-map', '[outa]',
        '-acodec', 'pcm_s16le',  # WAV for sherpa-onnx
        '-ar', '44100',
        '-ac', '2',
        output_file
    ]

    result, _ = mmguero.run_process(ffmpegCmd, stdout=False, stderr=False, debug=self.debug)

    if result != 0:
        raise IOError("Failed to extract combined profanity segments")

    return self._get_file_duration(output_file)
|
|
1713
|
+
|
|
1714
|
+
def _create_combined_profanity_file(self):
    """
    Build one continuous file from all padded profanity segments and run
    source separation on it.

    ``self.segMapping`` is reset and then refilled by
    ``_extract_combined_segments`` with the position of every original
    segment inside the combined file.

    Returns:
        str: Path to the combined instrumental file, or None when there are
        no segments or the combined file has no duration.
    """
    if not self.instrumentalSegments:
        return None

    # Step 1: gather every padded segment into a single WAV in the cache dir
    combined_file = os.path.join(self.separationCacheDir, "combined_profanity.wav")
    self.segMapping = []

    segment_duration = self._extract_combined_segments(combined_file)
    if not segment_duration:
        return None

    if self.debug:
        mmguero.eprint(f'Extracted {len(self.instrumentalSegments)} profanity segment(s) into combined file ({segment_duration:.2f}s)')

    # Step 2: separate the combined file; only the instrumental path is used
    instrumental_path, _vocals_path = self.separator.separate_audio_file(
        combined_file,
        self.separationCacheDir,
    )
    return instrumental_path
|
|
1747
|
+
|
|
1748
|
+
|
|
1749
|
+
#################################################################################
|
|
1750
|
+
|
|
1751
|
+
|
|
1752
|
+
###################################################################################################
|
|
1753
|
+
# Wildcard and batch processing helpers
|
|
1754
|
+
def apply_output_pattern(input_file, output_pattern):
    """Generate output filename from pattern.

    Every ``*`` in the pattern is replaced by the input file's base name
    without its extension. When the pattern itself carries no extension, the
    input file's extension is appended. The result is placed in the input
    file's directory when the input path has one.

    Args:
        input_file: Path to input file
        output_pattern: Output pattern (e.g., '*_clean.mp3')

    Returns:
        str: Generated output filepath
    """
    input_dir, input_basename = os.path.split(input_file)
    input_name, input_ext = os.path.splitext(input_basename)

    output_name = output_pattern.replace('*', input_name)

    # Check the pattern, not the substituted name: a dot inside the input
    # name (e.g. "01. Title") would otherwise be mistaken for an extension
    # and the real one would be dropped.
    if not os.path.splitext(output_pattern)[1]:
        output_name += input_ext

    if input_dir:
        return os.path.join(input_dir, output_name)
    return output_name
|
|
1778
|
+
|
|
1779
|
+
|
|
1780
|
+
def expand_and_detect_vocals(input_pattern, output_pattern, args, skip_detection=False):
    """Expand an input wildcard and sort the matches into vocal/instrumental.

    Matches whose base name fits the output pattern (case-insensitive) are
    dropped as already-processed output. Unless ``skip_detection`` is set,
    each remaining file is checked with ``GroqPlugger.DetectVocals``.

    Args:
        input_pattern: Input file pattern (e.g., '*.mp3')
        output_pattern: Output file pattern (e.g., '*_clean.mp3')
        args: Parsed command-line arguments
        skip_detection: If True, assume all files have vocals (used with --instrumental generate)

    Returns:
        tuple: (vocal_files, instrumental_files, output_files)

    Raises:
        IOError: If the input pattern matches nothing.
        ValueError: If detection is needed but no Groq API key is available.
    """
    import glob
    import re

    input_files = glob.glob(input_pattern)
    if not input_files:
        raise IOError(f"No files found matching pattern: {input_pattern}")

    # A plain path (no '*') that resolved to one file skips filtering and detection
    if '*' not in input_pattern and len(input_files) == 1:
        only_file = input_files[0]
        return [only_file], [], [apply_output_pattern(only_file, output_pattern)]

    # Turn the output wildcard into an anchored regex: everything is escaped,
    # then each escaped '*' becomes '.*'
    output_regex = '^' + re.escape(output_pattern).replace(r'\*', '.*') + '$'
    output_matcher = re.compile(output_regex, re.IGNORECASE)

    skipped_output_files = []
    remaining = []
    for filepath in input_files:
        basename = os.path.basename(filepath)
        if output_matcher.match(basename) is None:
            remaining.append(filepath)
            continue
        skipped_output_files.append(filepath)
        if args.debug:
            mmguero.eprint(f'Skipping output file: {basename} (matches output pattern)')

    input_files = remaining

    if not input_files:
        mmguero.eprint('No files to process after filtering out already-processed output files.')
        return [], [], []

    if args.debug:
        mmguero.eprint(f'Expanded wildcard to {len(input_files)} file(s) (skipped {len(skipped_output_files)} output files)')

    if skip_detection:
        if args.debug:
            mmguero.eprint('Skipping vocal detection (generate mode — assuming all files have vocals)')
        return input_files, [], [apply_output_pattern(f, output_pattern) for f in input_files]

    # Package-relative import first, absolute import as the fallback
    try:
        from .groq_config import load_groq_api_key
    except ImportError:
        from monkeyplug.groq_config import load_groq_api_key

    api_key = load_groq_api_key(args.groqApiKey, debug=args.debug)
    if not api_key:
        raise ValueError("Groq API key required for wildcard vocal detection")

    # This plugger exists only to call DetectVocals; its input/output specs are placeholders
    detector = GroqPlugger(
        iFileSpec=input_files[0],
        oFileSpec="dummy.mp3",
        oAudioFileFormat="MATCH",
        iSwearsFileSpec=args.swears,
        groq_api_key=api_key,
        groq_model=args.groqModel,
        outputJson=None,
        dbug=args.debug,
        verbose_level=getattr(args, 'verbose_level', ""),
    )

    vocal_files, instrumental_files, output_files = [], [], []

    for filepath in input_files:
        basename = os.path.basename(filepath)

        if args.debug:
            mmguero.eprint(f'Detecting vocals in: {basename}')

        if not detector.DetectVocals(filepath):
            instrumental_files.append(filepath)
            if args.debug:
                mmguero.eprint(f' ✗ No vocals → skipping (likely instrumental)')
            continue

        output_file = apply_output_pattern(filepath, output_pattern)
        vocal_files.append(filepath)
        output_files.append(output_file)
        if args.debug:
            mmguero.eprint(f' ✓ Vocals detected → will process')

    if args.debug:
        mmguero.eprint(f'\nVocal detection complete: {len(vocal_files)} vocal, {len(instrumental_files)} instrumental, {len(skipped_output_files)} already processed')

    return vocal_files, instrumental_files, output_files
|
|
1898
|
+
|
|
1899
|
+
|
|
1900
|
+
###################################################################################################
|
|
1901
|
+
# Config file loading
|
|
1902
|
+
# Per-user cache directory and the JSON config file kept inside it
MONKEYPLUG_CACHE_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'monkeyplug')
MONKEYPLUG_CONFIG_PATH = os.path.join(MONKEYPLUG_CACHE_DIR, 'config.json')

# Settings written to a newly created config file and used when none loads
DEFAULT_CONFIG = dict(
    pad_milliseconds=10,
    pad_milliseconds_pre=10,
    pad_milliseconds_post=10,
    separation_padding=1.0,
    beep_hertz=BEEP_HERTZ_DEFAULT,
)
|
|
1912
|
+
|
|
1913
|
+
|
|
1914
|
+
def load_config_settings(debug=False):
    """
    Load settings from JSON config file.

    Config file search order (first one that loads wins):
    1. ./.monkeyplug.json (current directory, project-specific)
    2. ~/.cache/monkeyplug/config.json (user-specific, alongside models)

    A candidate is skipped when it cannot be read, is not valid JSON, or
    does not hold a JSON object at the top level.

    If nothing was loaded and ~/.cache/monkeyplug/config.json does not exist,
    a default one is created there so the user can find and edit it. An
    existing file is never overwritten, even when it failed to load.

    Args:
        debug: When true, report on stderr which file was loaded, skipped or
            created.

    Returns:
        dict: The settings from the first usable config file, otherwise a
        copy of DEFAULT_CONFIG.
    """
    config_paths = [
        os.path.join(os.getcwd(), '.monkeyplug.json'),
        MONKEYPLUG_CONFIG_PATH,
    ]

    for config_path in config_paths:
        if not os.path.isfile(config_path):
            continue

        try:
            with open(config_path, 'r') as f:
                config = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both malformed JSON and undecodable bytes
            if debug:
                mmguero.eprint(f"Warning: Failed to load config from {config_path}: {e}")
            continue

        # Callers look settings up with .get(), so only a JSON object is usable
        if not isinstance(config, dict):
            if debug:
                mmguero.eprint(f"Warning: Ignoring config {config_path}: top-level JSON value is not an object")
            continue

        if debug:
            mmguero.eprint(f"Loaded config from: {config_path}")
        return config

    # Create the default file only when it is missing; a config that exists
    # but failed to load is left untouched so the user's edits are not lost.
    if not os.path.exists(MONKEYPLUG_CONFIG_PATH):
        try:
            os.makedirs(MONKEYPLUG_CACHE_DIR, exist_ok=True)
            with open(MONKEYPLUG_CONFIG_PATH, 'w') as f:
                json.dump(DEFAULT_CONFIG, f, indent=2)
                f.write('\n')
            if debug:
                mmguero.eprint(f"Created default config at: {MONKEYPLUG_CONFIG_PATH}")
        except OSError as e:
            if debug:
                mmguero.eprint(f"Warning: Could not create default config: {e}")

    return dict(DEFAULT_CONFIG)
|
|
1961
|
+
|
|
1962
|
+
|
|
1963
|
+
###################################################################################################
|
|
1964
|
+
# RunMonkeyPlug
|
|
1965
|
+
def RunMonkeyPlug():
|
|
1966
|
+
|
|
1967
|
+
package_name = __package__ or "monkeyplug"
|
|
1968
|
+
try:
|
|
1969
|
+
metadata = importlib.metadata.metadata(package_name)
|
|
1970
|
+
version = metadata.get("Version", "unknown")
|
|
1971
|
+
except importlib.metadata.PackageNotFoundError:
|
|
1972
|
+
version = "source"
|
|
1973
|
+
|
|
1974
|
+
# Load config file for default values (can be overridden by CLI args)
|
|
1975
|
+
config = load_config_settings(debug=False)
|
|
1976
|
+
|
|
1977
|
+
parser = argparse.ArgumentParser(
|
|
1978
|
+
description=f"{package_name} (v{version})",
|
|
1979
|
+
add_help=True,
|
|
1980
|
+
usage=f"{package_name} <arguments>",
|
|
1981
|
+
)
|
|
1982
|
+
parser.add_argument(
|
|
1983
|
+
"-v",
|
|
1984
|
+
"--verbose",
|
|
1985
|
+
dest="verbose",
|
|
1986
|
+
type=str,
|
|
1987
|
+
nargs="?",
|
|
1988
|
+
const="concise",
|
|
1989
|
+
default="",
|
|
1990
|
+
metavar="[concise|full]",
|
|
1991
|
+
help="Verbose output level: -v for concise, -v full for detailed debug output",
|
|
1992
|
+
)
|
|
1993
|
+
parser.add_argument(
|
|
1994
|
+
"-m",
|
|
1995
|
+
"--mode",
|
|
1996
|
+
dest="speechRecMode",
|
|
1997
|
+
metavar="<string>",
|
|
1998
|
+
type=str,
|
|
1999
|
+
default=DEFAULT_SPEECH_REC_MODE,
|
|
2000
|
+
help=f"Speech recognition engine ({SPEECH_REC_MODE_GROQ}|{SPEECH_REC_MODE_WHISPER}|{SPEECH_REC_MODE_VOSK}) (default: {DEFAULT_SPEECH_REC_MODE})",
|
|
2001
|
+
)
|
|
2002
|
+
parser.add_argument(
|
|
2003
|
+
"-i",
|
|
2004
|
+
"--input",
|
|
2005
|
+
dest="input",
|
|
2006
|
+
type=str,
|
|
2007
|
+
default=None,
|
|
2008
|
+
required=True,
|
|
2009
|
+
metavar="<string>",
|
|
2010
|
+
help="Input file (or URL)",
|
|
2011
|
+
)
|
|
2012
|
+
parser.add_argument(
|
|
2013
|
+
"-o",
|
|
2014
|
+
"--output",
|
|
2015
|
+
dest="output",
|
|
2016
|
+
type=str,
|
|
2017
|
+
default=None,
|
|
2018
|
+
required=False,
|
|
2019
|
+
metavar="<string>",
|
|
2020
|
+
help="Output file",
|
|
2021
|
+
)
|
|
2022
|
+
parser.add_argument(
|
|
2023
|
+
"--output-json",
|
|
2024
|
+
dest="outputJson",
|
|
2025
|
+
type=str,
|
|
2026
|
+
default=None,
|
|
2027
|
+
required=False,
|
|
2028
|
+
metavar="<string>",
|
|
2029
|
+
help="Output file to store transcript JSON",
|
|
2030
|
+
)
|
|
2031
|
+
parser.add_argument(
|
|
2032
|
+
"-w",
|
|
2033
|
+
"--swears",
|
|
2034
|
+
help=f"text file containing profanity (default: \"{SWEARS_FILENAME_DEFAULT}\")",
|
|
2035
|
+
default=os.path.join(script_path, SWEARS_FILENAME_DEFAULT),
|
|
2036
|
+
metavar="<profanity file>",
|
|
2037
|
+
)
|
|
2038
|
+
parser.add_argument(
|
|
2039
|
+
"--input-transcript",
|
|
2040
|
+
dest="inputTranscript",
|
|
2041
|
+
type=str,
|
|
2042
|
+
default=None,
|
|
2043
|
+
required=False,
|
|
2044
|
+
metavar="<string>",
|
|
2045
|
+
help="Load existing transcript JSON instead of performing speech recognition",
|
|
2046
|
+
)
|
|
2047
|
+
parser.add_argument(
|
|
2048
|
+
"--save-transcript",
|
|
2049
|
+
dest="saveTranscript",
|
|
2050
|
+
action="store_true",
|
|
2051
|
+
default=False,
|
|
2052
|
+
help="Automatically save transcript JSON alongside output audio file",
|
|
2053
|
+
)
|
|
2054
|
+
parser.add_argument(
|
|
2055
|
+
"--force-retranscribe",
|
|
2056
|
+
dest="forceRetranscribe",
|
|
2057
|
+
action="store_true",
|
|
2058
|
+
default=False,
|
|
2059
|
+
help="Force new transcription even if transcript file exists (overrides automatic reuse)",
|
|
2060
|
+
)
|
|
2061
|
+
parser.add_argument(
|
|
2062
|
+
"--instrumental",
|
|
2063
|
+
dest="instrumentalFile",
|
|
2064
|
+
type=str,
|
|
2065
|
+
default=None,
|
|
2066
|
+
required=False,
|
|
2067
|
+
metavar="<mode|file>",
|
|
2068
|
+
help="Instrumental mode: 'auto' (default, try prefix search then generate), 'generate' (AI generation), 'prefix' (search with --instrumental-prefix), or file path",
|
|
2069
|
+
)
|
|
2070
|
+
parser.add_argument(
|
|
2071
|
+
"--instrumental-prefix",
|
|
2072
|
+
dest="instrumentalPrefix",
|
|
2073
|
+
type=str,
|
|
2074
|
+
default="AUTO",
|
|
2075
|
+
required=False,
|
|
2076
|
+
metavar="<string>",
|
|
2077
|
+
help="Prefix/suffix to search for instrumental file, or 'AUTO' for fuzzy matching (default)",
|
|
2078
|
+
)
|
|
2079
|
+
parser.add_argument(
|
|
2080
|
+
"--instrumental-auto-candidates",
|
|
2081
|
+
dest="instrumentalAutoCandidates",
|
|
2082
|
+
type=int,
|
|
2083
|
+
default=5,
|
|
2084
|
+
required=False,
|
|
2085
|
+
metavar="<int>",
|
|
2086
|
+
help="Number of top candidates to validate in AUTO mode (default: 5)",
|
|
2087
|
+
)
|
|
2088
|
+
parser.add_argument(
|
|
2089
|
+
"--separation-padding",
|
|
2090
|
+
dest="separationPadding",
|
|
2091
|
+
type=float,
|
|
2092
|
+
default=config.get("separation_padding", 1.0),
|
|
2093
|
+
metavar="<seconds>",
|
|
2094
|
+
help=f"Context padding for AI generation (default: {config.get('separation_padding', 1.0)} seconds)",
|
|
2095
|
+
)
|
|
2096
|
+
parser.add_argument(
|
|
2097
|
+
"--filter-instrumentals",
|
|
2098
|
+
dest="filterInstrumentals",
|
|
2099
|
+
action="store_true",
|
|
2100
|
+
default=False,
|
|
2101
|
+
help="In wildcard mode with --instrumental generate, filter out files detected as instrumentals (default: process all files)",
|
|
2102
|
+
)
|
|
2103
|
+
parser.add_argument(
|
|
2104
|
+
"--mute",
|
|
2105
|
+
dest="mute",
|
|
2106
|
+
action="store_true",
|
|
2107
|
+
default=False,
|
|
2108
|
+
help="Force mute mode (disable instrumental processing)",
|
|
2109
|
+
)
|
|
2110
|
+
parser.add_argument(
|
|
2111
|
+
"-a",
|
|
2112
|
+
"--audio-params",
|
|
2113
|
+
help="Audio parameters for ffmpeg (default depends on output audio codec)",
|
|
2114
|
+
dest="aParams",
|
|
2115
|
+
metavar="<str>",
|
|
2116
|
+
default=None,
|
|
2117
|
+
)
|
|
2118
|
+
parser.add_argument(
|
|
2119
|
+
"-c",
|
|
2120
|
+
"--channels",
|
|
2121
|
+
dest="aChannels",
|
|
2122
|
+
metavar="<int>",
|
|
2123
|
+
type=int,
|
|
2124
|
+
default=AUDIO_DEFAULT_CHANNELS,
|
|
2125
|
+
help=f"Audio output channels (default: {AUDIO_DEFAULT_CHANNELS})",
|
|
2126
|
+
)
|
|
2127
|
+
parser.add_argument(
|
|
2128
|
+
"-s",
|
|
2129
|
+
"--sample-rate",
|
|
2130
|
+
dest="aSampleRate",
|
|
2131
|
+
metavar="<int>",
|
|
2132
|
+
type=int,
|
|
2133
|
+
default=AUDIO_DEFAULT_SAMPLE_RATE,
|
|
2134
|
+
help=f"Audio output sample rate (default: {AUDIO_DEFAULT_SAMPLE_RATE})",
|
|
2135
|
+
)
|
|
2136
|
+
parser.add_argument(
|
|
2137
|
+
"-r",
|
|
2138
|
+
"--bitrate",
|
|
2139
|
+
dest="aBitRate",
|
|
2140
|
+
metavar="<str>",
|
|
2141
|
+
default=AUDIO_DEFAULT_BIT_RATE,
|
|
2142
|
+
help=f"Audio output bitrate (default: {AUDIO_DEFAULT_BIT_RATE})",
|
|
2143
|
+
)
|
|
2144
|
+
parser.add_argument(
|
|
2145
|
+
"-q",
|
|
2146
|
+
"--vorbis-qscale",
|
|
2147
|
+
dest="aVorbisQscale",
|
|
2148
|
+
metavar="<int>",
|
|
2149
|
+
type=int,
|
|
2150
|
+
default=AUDIO_DEFAULT_VORBIS_QSCALE,
|
|
2151
|
+
help=f"qscale for libvorbis output (default: {AUDIO_DEFAULT_VORBIS_QSCALE})",
|
|
2152
|
+
)
|
|
2153
|
+
parser.add_argument(
|
|
2154
|
+
"-f",
|
|
2155
|
+
"--format",
|
|
2156
|
+
dest="outputFormat",
|
|
2157
|
+
type=str,
|
|
2158
|
+
default=AUDIO_MATCH_FORMAT,
|
|
2159
|
+
required=False,
|
|
2160
|
+
metavar="<string>",
|
|
2161
|
+
help=f"Output file format (default: inferred from extension of --output, or \"{AUDIO_MATCH_FORMAT}\")",
|
|
2162
|
+
)
|
|
2163
|
+
parser.add_argument(
|
|
2164
|
+
"--pad-milliseconds",
|
|
2165
|
+
dest="padMsec",
|
|
2166
|
+
metavar="<int>",
|
|
2167
|
+
type=int,
|
|
2168
|
+
default=config.get("pad_milliseconds", 10),
|
|
2169
|
+
help=f"Milliseconds to pad on either side of muted segments (default: {config.get('pad_milliseconds', 10)})",
|
|
2170
|
+
)
|
|
2171
|
+
parser.add_argument(
|
|
2172
|
+
"--pad-milliseconds-pre",
|
|
2173
|
+
dest="padMsecPre",
|
|
2174
|
+
metavar="<int>",
|
|
2175
|
+
type=int,
|
|
2176
|
+
default=config.get("pad_milliseconds_pre", 10),
|
|
2177
|
+
help=f"Milliseconds to pad before muted segments (default: {config.get('pad_milliseconds_pre', 10)})",
|
|
2178
|
+
)
|
|
2179
|
+
parser.add_argument(
|
|
2180
|
+
"--pad-milliseconds-post",
|
|
2181
|
+
dest="padMsecPost",
|
|
2182
|
+
metavar="<int>",
|
|
2183
|
+
type=int,
|
|
2184
|
+
default=config.get("pad_milliseconds_post", 10),
|
|
2185
|
+
help=f"Milliseconds to pad after muted segments (default: {config.get('pad_milliseconds_post', 10)})",
|
|
2186
|
+
)
|
|
2187
|
+
parser.add_argument(
|
|
2188
|
+
"-b",
|
|
2189
|
+
"--beep",
|
|
2190
|
+
dest="beep",
|
|
2191
|
+
type=mmguero.str2bool,
|
|
2192
|
+
nargs="?",
|
|
2193
|
+
const=True,
|
|
2194
|
+
default=False,
|
|
2195
|
+
metavar="true|false",
|
|
2196
|
+
help="Beep instead of silence",
|
|
2197
|
+
)
|
|
2198
|
+
parser.add_argument(
|
|
2199
|
+
"-z",
|
|
2200
|
+
"--beep-hertz",
|
|
2201
|
+
dest="beepHertz",
|
|
2202
|
+
metavar="<int>",
|
|
2203
|
+
type=int,
|
|
2204
|
+
default=config.get("beep_hertz", BEEP_HERTZ_DEFAULT),
|
|
2205
|
+
help=f"Beep frequency hertz (default: {config.get('beep_hertz', BEEP_HERTZ_DEFAULT)})",
|
|
2206
|
+
)
|
|
2207
|
+
parser.add_argument(
|
|
2208
|
+
"--beep-mix-normalize",
|
|
2209
|
+
dest="beepMixNormalize",
|
|
2210
|
+
type=mmguero.str2bool,
|
|
2211
|
+
nargs="?",
|
|
2212
|
+
const=True,
|
|
2213
|
+
default=BEEP_MIX_NORMALIZE_DEFAULT,
|
|
2214
|
+
metavar="true|false",
|
|
2215
|
+
help=f"Normalize mix of audio and beeps (default: {BEEP_MIX_NORMALIZE_DEFAULT})",
|
|
2216
|
+
)
|
|
2217
|
+
parser.add_argument(
|
|
2218
|
+
"--beep-audio-weight",
|
|
2219
|
+
dest="beepAudioWeight",
|
|
2220
|
+
metavar="<int>",
|
|
2221
|
+
type=int,
|
|
2222
|
+
default=BEEP_AUDIO_WEIGHT_DEFAULT,
|
|
2223
|
+
help=f"Mix weight for non-beeped audio (default: {BEEP_AUDIO_WEIGHT_DEFAULT})",
|
|
2224
|
+
)
|
|
2225
|
+
parser.add_argument(
|
|
2226
|
+
"--beep-sine-weight",
|
|
2227
|
+
dest="beepSineWeight",
|
|
2228
|
+
metavar="<int>",
|
|
2229
|
+
type=int,
|
|
2230
|
+
default=BEEP_SINE_WEIGHT_DEFAULT,
|
|
2231
|
+
help=f"Mix weight for beep (default: {BEEP_SINE_WEIGHT_DEFAULT})",
|
|
2232
|
+
)
|
|
2233
|
+
parser.add_argument(
|
|
2234
|
+
"--beep-dropout-transition",
|
|
2235
|
+
dest="beepDropTransition",
|
|
2236
|
+
metavar="<int>",
|
|
2237
|
+
type=int,
|
|
2238
|
+
default=BEEP_DROPOUT_TRANSITION_DEFAULT,
|
|
2239
|
+
help=f"Dropout transition for beep (default: {BEEP_DROPOUT_TRANSITION_DEFAULT})",
|
|
2240
|
+
)
|
|
2241
|
+
|
|
2242
|
+
parser.add_argument(
|
|
2243
|
+
"--force",
|
|
2244
|
+
dest="forceDespiteTag",
|
|
2245
|
+
type=mmguero.str2bool,
|
|
2246
|
+
nargs="?",
|
|
2247
|
+
const=True,
|
|
2248
|
+
default=False,
|
|
2249
|
+
metavar="true|false",
|
|
2250
|
+
help="Process file despite existence of embedded tag",
|
|
2251
|
+
)
|
|
2252
|
+
|
|
2253
|
+
parser.add_argument(
|
|
2254
|
+
"--clean-cache",
|
|
2255
|
+
dest="cleanCache",
|
|
2256
|
+
action="store_true",
|
|
2257
|
+
default=False,
|
|
2258
|
+
help=f"Delete all cached data (models, config) at {MONKEYPLUG_CACHE_DIR} and exit",
|
|
2259
|
+
)
|
|
2260
|
+
|
|
2261
|
+
voskArgGroup = parser.add_argument_group('VOSK Options')
|
|
2262
|
+
voskArgGroup.add_argument(
|
|
2263
|
+
"--vosk-model-dir",
|
|
2264
|
+
dest="voskModelDir",
|
|
2265
|
+
metavar="<string>",
|
|
2266
|
+
type=str,
|
|
2267
|
+
default=DEFAULT_VOSK_MODEL_DIR,
|
|
2268
|
+
help=f"VOSK model directory (default: {DEFAULT_VOSK_MODEL_DIR})",
|
|
2269
|
+
)
|
|
2270
|
+
voskArgGroup.add_argument(
|
|
2271
|
+
"--vosk-read-frames-chunk",
|
|
2272
|
+
dest="voskReadFramesChunk",
|
|
2273
|
+
metavar="<int>",
|
|
2274
|
+
type=int,
|
|
2275
|
+
default=os.getenv("VOSK_READ_FRAMES", AUDIO_DEFAULT_WAV_FRAMES_CHUNK),
|
|
2276
|
+
help=f"WAV frame chunk (default: {AUDIO_DEFAULT_WAV_FRAMES_CHUNK})",
|
|
2277
|
+
)
|
|
2278
|
+
|
|
2279
|
+
whisperArgGroup = parser.add_argument_group('Whisper Options')
|
|
2280
|
+
whisperArgGroup.add_argument(
|
|
2281
|
+
"--whisper-model-dir",
|
|
2282
|
+
dest="whisperModelDir",
|
|
2283
|
+
metavar="<string>",
|
|
2284
|
+
type=str,
|
|
2285
|
+
default=DEFAULT_WHISPER_MODEL_DIR,
|
|
2286
|
+
help=f"Whisper model directory ({DEFAULT_WHISPER_MODEL_DIR})",
|
|
2287
|
+
)
|
|
2288
|
+
whisperArgGroup.add_argument(
|
|
2289
|
+
"--whisper-model-name",
|
|
2290
|
+
dest="whisperModelName",
|
|
2291
|
+
metavar="<string>",
|
|
2292
|
+
type=str,
|
|
2293
|
+
default=DEFAULT_WHISPER_MODEL_NAME,
|
|
2294
|
+
help=f"Whisper model name ({DEFAULT_WHISPER_MODEL_NAME})",
|
|
2295
|
+
)
|
|
2296
|
+
whisperArgGroup.add_argument(
|
|
2297
|
+
"--torch-threads",
|
|
2298
|
+
dest="torchThreads",
|
|
2299
|
+
metavar="<int>",
|
|
2300
|
+
type=int,
|
|
2301
|
+
default=DEFAULT_TORCH_THREADS,
|
|
2302
|
+
help=f"Number of threads used by torch for CPU inference ({DEFAULT_TORCH_THREADS})",
|
|
2303
|
+
)
|
|
2304
|
+
|
|
2305
|
+
groqArgGroup = parser.add_argument_group('Groq Options')
|
|
2306
|
+
groqArgGroup.add_argument(
|
|
2307
|
+
"--groq-api-key",
|
|
2308
|
+
dest="groqApiKey",
|
|
2309
|
+
metavar="<string>",
|
|
2310
|
+
type=str,
|
|
2311
|
+
default=None,
|
|
2312
|
+
help="Groq API key (default: GROQ_API_KEY env var, ~/.groq/config.json, or ./.groq_key)",
|
|
2313
|
+
)
|
|
2314
|
+
groqArgGroup.add_argument(
|
|
2315
|
+
"--groq-model",
|
|
2316
|
+
dest="groqModel",
|
|
2317
|
+
metavar="<string>",
|
|
2318
|
+
type=str,
|
|
2319
|
+
default="whisper-large-v3",
|
|
2320
|
+
help="Groq Whisper model (default: whisper-large-v3)",
|
|
2321
|
+
)
|
|
2322
|
+
|
|
2323
|
+
try:
|
|
2324
|
+
parser.error = parser.exit
|
|
2325
|
+
args = parser.parse_args()
|
|
2326
|
+
except SystemExit as se:
|
|
2327
|
+
mmguero.eprint(se)
|
|
2328
|
+
exit(2)
|
|
2329
|
+
|
|
2330
|
+
# Handle --clean-cache early and exit
|
|
2331
|
+
if args.cleanCache:
|
|
2332
|
+
import shutil
|
|
2333
|
+
if os.path.isdir(MONKEYPLUG_CACHE_DIR):
|
|
2334
|
+
shutil.rmtree(MONKEYPLUG_CACHE_DIR)
|
|
2335
|
+
print(f"Deleted cache directory: {MONKEYPLUG_CACHE_DIR}")
|
|
2336
|
+
else:
|
|
2337
|
+
print(f"No cache directory found at: {MONKEYPLUG_CACHE_DIR}")
|
|
2338
|
+
return
|
|
2339
|
+
|
|
2340
|
+
# Set debug flag based on verbose level for backward compatibility
|
|
2341
|
+
if args.verbose == "full":
|
|
2342
|
+
args.debug = True
|
|
2343
|
+
args.verbose_level = "full"
|
|
2344
|
+
elif args.verbose == "concise":
|
|
2345
|
+
args.debug = True
|
|
2346
|
+
args.verbose_level = "concise"
|
|
2347
|
+
else:
|
|
2348
|
+
args.debug = False
|
|
2349
|
+
args.verbose_level = ""
|
|
2350
|
+
|
|
2351
|
+
if args.debug:
|
|
2352
|
+
mmguero.eprint(os.path.join(script_path, script_name))
|
|
2353
|
+
mmguero.eprint(f"Arguments: {sys.argv[1:]}")
|
|
2354
|
+
if args.verbose_level == "full":
|
|
2355
|
+
mmguero.eprint(f"Arguments: {args}")
|
|
2356
|
+
else:
|
|
2357
|
+
sys.tracebacklimit = 0
|
|
2358
|
+
|
|
2359
|
+
# Check if wildcards are present in input or output
|
|
2360
|
+
has_wildcards = '*' in args.input or '*' in args.output
|
|
2361
|
+
|
|
2362
|
+
# Process instrumental mode arguments
|
|
2363
|
+
auto_generate = False
|
|
2364
|
+
auto_mode_requested = False # Track if --instrumental auto was used
|
|
2365
|
+
skip_detection = False # Skip vocal detection in wildcard mode (--instrumental generate)
|
|
2366
|
+
|
|
2367
|
+
# Mode priority: mute > beep > instrumental
|
|
2368
|
+
if args.mute:
|
|
2369
|
+
# Mute mode: disable all instrumental processing
|
|
2370
|
+
if args.debug:
|
|
2371
|
+
mmguero.eprint('Mute mode - disabling instrumental processing')
|
|
2372
|
+
args.instrumentalPrefix = None
|
|
2373
|
+
args.instrumentalFile = None
|
|
2374
|
+
auto_generate = False
|
|
2375
|
+
|
|
2376
|
+
elif args.beep:
|
|
2377
|
+
# Beep mode: disable all instrumental processing (beep takes precedence)
|
|
2378
|
+
if args.debug:
|
|
2379
|
+
mmguero.eprint('Beep mode enabled - disabling instrumental mode')
|
|
2380
|
+
args.instrumentalPrefix = None
|
|
2381
|
+
args.instrumentalFile = None
|
|
2382
|
+
auto_generate = False
|
|
2383
|
+
|
|
2384
|
+
# Process instrumental mode arguments
|
|
2385
|
+
# Default to auto mode if no instrumental flag provided or instrumentalPrefix is default "AUTO"
|
|
2386
|
+
elif args.instrumentalFile is None and (args.instrumentalPrefix is None or args.instrumentalPrefix == "AUTO"):
|
|
2387
|
+
# No --instrumental flag provided, default to auto mode
|
|
2388
|
+
auto_mode_requested = True
|
|
2389
|
+
args.instrumentalPrefix = "AUTO"
|
|
2390
|
+
if args.debug:
|
|
2391
|
+
mmguero.eprint('Default: Auto mode (try prefix search → if not found, generate)')
|
|
2392
|
+
|
|
2393
|
+
elif args.instrumentalFile:
|
|
2394
|
+
# If --instrumental was provided with a value
|
|
2395
|
+
instrumental_mode = args.instrumentalFile.lower()
|
|
2396
|
+
|
|
2397
|
+
if instrumental_mode == "auto":
|
|
2398
|
+
# Auto mode: try prefix search first, if not found, generate
|
|
2399
|
+
auto_mode_requested = True # Track that auto mode was requested
|
|
2400
|
+
args.instrumentalFile = None # Clear mode keyword so it's not treated as filename
|
|
2401
|
+
if not args.instrumentalPrefix:
|
|
2402
|
+
args.instrumentalPrefix = "AUTO" # Set default for auto mode
|
|
2403
|
+
|
|
2404
|
+
# The search will be done later; if not found, we'll set auto_generate
|
|
2405
|
+
if args.debug:
|
|
2406
|
+
mmguero.eprint('Auto mode: Will try prefix search first, then generate if needed')
|
|
2407
|
+
|
|
2408
|
+
elif instrumental_mode == "generate":
|
|
2409
|
+
# Generate mode: force AI generation, skip instrumental file search
|
|
2410
|
+
auto_generate = True
|
|
2411
|
+
skip_detection = True
|
|
2412
|
+
args.instrumentalFile = None # Clear mode keyword so it's not treated as filename
|
|
2413
|
+
args.instrumentalPrefix = None # Skip instrumental file search entirely
|
|
2414
|
+
if args.debug:
|
|
2415
|
+
mmguero.eprint('Generate mode: Will use AI to generate instrumental')
|
|
2416
|
+
|
|
2417
|
+
elif instrumental_mode == "prefix":
|
|
2418
|
+
# Prefix mode: search with --instrumental-prefix value
|
|
2419
|
+
args.instrumentalFile = None # Clear mode keyword so it's not treated as filename
|
|
2420
|
+
if not args.instrumentalPrefix:
|
|
2421
|
+
args.instrumentalPrefix = "AUTO" # Default to AUTO if not specified
|
|
2422
|
+
if args.debug:
|
|
2423
|
+
mmguero.eprint(f'Prefix mode: Searching for instrumental with prefix "{args.instrumentalPrefix}"')
|
|
2424
|
+
|
|
2425
|
+
else:
|
|
2426
|
+
# Treat as filename - already set in args.instrumentalFile
|
|
2427
|
+
if args.debug:
|
|
2428
|
+
mmguero.eprint(f'Using specified instrumental file: {args.instrumentalFile}')
|
|
2429
|
+
|
|
2430
|
+
# --filter-instrumentals overrides generate mode's skip_detection
|
|
2431
|
+
if args.filterInstrumentals:
|
|
2432
|
+
skip_detection = False
|
|
2433
|
+
|
|
2434
|
+
if has_wildcards and args.speechRecMode == SPEECH_REC_MODE_GROQ:
|
|
2435
|
+
# Wildcard mode with vocal detection
|
|
2436
|
+
vocal_files, instrumental_files, output_files = expand_and_detect_vocals(
|
|
2437
|
+
args.input, args.output, args, skip_detection=skip_detection
|
|
2438
|
+
)
|
|
2439
|
+
|
|
2440
|
+
if not vocal_files:
|
|
2441
|
+
mmguero.eprint('No vocal files found to process. All files appear to be instrumentals.')
|
|
2442
|
+
sys.exit(0)
|
|
2443
|
+
|
|
2444
|
+
mmguero.eprint(f'\nProcessing {len(vocal_files)} file(s) with vocals...\n')
|
|
2445
|
+
|
|
2446
|
+
# Process each vocal file
|
|
2447
|
+
for idx, (input_file, output_file) in enumerate(zip(vocal_files, output_files), 1):
|
|
2448
|
+
mmguero.eprint(f'\n[{idx}/{len(vocal_files)}] Processing: {os.path.basename(input_file)}')
|
|
2449
|
+
|
|
2450
|
+
# Create a copy of args and modify input/output
|
|
2451
|
+
args_copy = argparse.Namespace(**vars(args))
|
|
2452
|
+
args_copy.input = input_file
|
|
2453
|
+
args_copy.output = output_file
|
|
2454
|
+
|
|
2455
|
+
# Find instrumental file for this specific file if using AUTO/prefix mode
|
|
2456
|
+
if args_copy.instrumentalPrefix and not args_copy.instrumentalFile:
|
|
2457
|
+
import glob
|
|
2458
|
+
from difflib import SequenceMatcher
|
|
2459
|
+
|
|
2460
|
+
input_dir = os.path.dirname(input_file)
|
|
2461
|
+
if not input_dir:
|
|
2462
|
+
input_dir = '.'
|
|
2463
|
+
|
|
2464
|
+
input_basename = os.path.basename(input_file)
|
|
2465
|
+
input_name, input_ext = os.path.splitext(input_basename)
|
|
2466
|
+
|
|
2467
|
+
# AUTO mode - fuzzy matching
|
|
2468
|
+
if args_copy.instrumentalPrefix.upper() == 'AUTO':
|
|
2469
|
+
if args_copy.debug:
|
|
2470
|
+
mmguero.eprint(f'AUTO mode: Searching for instrumental file using fuzzy matching')
|
|
2471
|
+
|
|
2472
|
+
# Get all audio files in the directory
|
|
2473
|
+
audio_extensions = ['.mp3', '.mp4', '.m4a', '.wav', '.flac', '.ogg', '.aac', '.wma']
|
|
2474
|
+
all_files = []
|
|
2475
|
+
|
|
2476
|
+
for ext in audio_extensions:
|
|
2477
|
+
all_files.extend(glob.glob(os.path.join(input_dir, f'*{ext}')))
|
|
2478
|
+
|
|
2479
|
+
# Filter out the input file itself and any files matching output pattern
|
|
2480
|
+
def pattern_to_regex(pattern):
    """Convert a ``*`` wildcard pattern into an anchored regex string.

    Only ``*`` is treated as a wildcard. Every other character (including
    ``?``, ``[``, ``]`` and ``.``) is escaped and therefore matched literally.

    Args:
        pattern: Wildcard pattern, e.g. ``"*_clean.mp3"``.

    Returns:
        Regular-expression source text that matches the whole input string.
    """
    import re  # kept function-local, as in the original helper

    # Escape first so literal metacharacters stay literal, then turn each
    # escaped '*' back into a wildcard. The scoped (?s:...) flag lets the
    # wildcard span newlines, which are legal inside file names.
    body = re.escape(pattern).replace(r'\*', '(?s:.*)')

    # Anchor the end with \Z rather than $: '$' also matches just before a
    # trailing newline, which would accept "name\n" as a full match.
    return f'^{body}\\Z'
|
|
2486
|
+
|
|
2487
|
+
# If output file is specified, get its pattern to exclude matches
|
|
2488
|
+
output_pattern_to_exclude = None
|
|
2489
|
+
if output_file:
|
|
2490
|
+
# For single file, check exact basename match
|
|
2491
|
+
output_basename = os.path.basename(output_file)
|
|
2492
|
+
else:
|
|
2493
|
+
output_basename = None
|
|
2494
|
+
|
|
2495
|
+
other_files = []
|
|
2496
|
+
for f in all_files:
|
|
2497
|
+
basename = os.path.basename(f)
|
|
2498
|
+
# Skip input file
|
|
2499
|
+
if basename == input_basename:
|
|
2500
|
+
continue
|
|
2501
|
+
# Skip exact output file match if specified
|
|
2502
|
+
if output_basename and basename == output_basename:
|
|
2503
|
+
continue
|
|
2504
|
+
other_files.append(f)
|
|
2505
|
+
|
|
2506
|
+
# Two-way fuzzy matching with validation
|
|
2507
|
+
candidates_with_scores = []
|
|
2508
|
+
for candidate in other_files:
|
|
2509
|
+
candidate_basename = os.path.basename(candidate)
|
|
2510
|
+
candidate_name, _ = os.path.splitext(candidate_basename)
|
|
2511
|
+
|
|
2512
|
+
ratio = SequenceMatcher(None, input_name.lower(), candidate_name.lower()).ratio()
|
|
2513
|
+
|
|
2514
|
+
if args_copy.debug:
|
|
2515
|
+
mmguero.eprint(f' {candidate_basename}: similarity={ratio:.3f}')
|
|
2516
|
+
|
|
2517
|
+
if ratio < 1.0:
|
|
2518
|
+
candidates_with_scores.append((candidate, ratio))
|
|
2519
|
+
|
|
2520
|
+
candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
|
|
2521
|
+
top_candidates = candidates_with_scores[:args_copy.instrumentalAutoCandidates]
|
|
2522
|
+
|
|
2523
|
+
validated_candidates = []
|
|
2524
|
+
for candidate, candidate_to_input_score in top_candidates:
|
|
2525
|
+
candidate_basename = os.path.basename(candidate)
|
|
2526
|
+
candidate_name, _ = os.path.splitext(candidate_basename)
|
|
2527
|
+
|
|
2528
|
+
best_other_score = 0.0
|
|
2529
|
+
best_other_match = None
|
|
2530
|
+
|
|
2531
|
+
for other_file in all_files:
|
|
2532
|
+
other_basename = os.path.basename(other_file)
|
|
2533
|
+
if other_basename != input_basename and other_basename != candidate_basename:
|
|
2534
|
+
other_name, _ = os.path.splitext(other_basename)
|
|
2535
|
+
other_score = SequenceMatcher(None, candidate_name.lower(), other_name.lower()).ratio()
|
|
2536
|
+
|
|
2537
|
+
if other_score > best_other_score:
|
|
2538
|
+
best_other_score = other_score
|
|
2539
|
+
best_other_match = other_basename
|
|
2540
|
+
|
|
2541
|
+
if args_copy.debug:
|
|
2542
|
+
mmguero.eprint(f' Validating {candidate_basename}:')
|
|
2543
|
+
mmguero.eprint(f' to input: {candidate_to_input_score:.3f}')
|
|
2544
|
+
mmguero.eprint(f' to best other ({best_other_match}): {best_other_score:.3f}')
|
|
2545
|
+
|
|
2546
|
+
if candidate_to_input_score > best_other_score:
|
|
2547
|
+
validated_candidates.append((candidate, candidate_to_input_score))
|
|
2548
|
+
if args_copy.debug:
|
|
2549
|
+
mmguero.eprint(f' ✓ PASSED validation')
|
|
2550
|
+
else:
|
|
2551
|
+
if args_copy.debug:
|
|
2552
|
+
mmguero.eprint(f' ✗ FAILED validation')
|
|
2553
|
+
|
|
2554
|
+
if validated_candidates:
|
|
2555
|
+
best_match, best_ratio = validated_candidates[0]
|
|
2556
|
+
if best_ratio >= 0.3:
|
|
2557
|
+
args_copy.instrumentalFile = best_match
|
|
2558
|
+
if args_copy.debug:
|
|
2559
|
+
mmguero.eprint(f'AUTO mode matched: {os.path.basename(best_match)} (similarity: {best_ratio:.3f})')
|
|
2560
|
+
else:
|
|
2561
|
+
# Auto mode: no valid match found, enable AI generation
|
|
2562
|
+
if auto_mode_requested:
|
|
2563
|
+
if args_copy.debug:
|
|
2564
|
+
mmguero.eprint(f' Auto mode: No validated match above threshold, will use AI generation')
|
|
2565
|
+
else:
|
|
2566
|
+
mmguero.eprint(f' No validated match above threshold, will use AI generation')
|
|
2567
|
+
else:
|
|
2568
|
+
# Auto mode: all candidates failed validation, enable AI generation
|
|
2569
|
+
if auto_mode_requested:
|
|
2570
|
+
if args_copy.debug:
|
|
2571
|
+
mmguero.eprint(f' Auto mode: All candidates failed validation, will use AI generation')
|
|
2572
|
+
else:
|
|
2573
|
+
mmguero.eprint(f' All candidates failed validation, will use AI generation')
|
|
2574
|
+
|
|
2575
|
+
# Process this file
|
|
2576
|
+
# Determine if AI generation should be used for this specific file
|
|
2577
|
+
file_auto_generate = auto_generate
|
|
2578
|
+
if auto_mode_requested and not args_copy.instrumentalFile:
|
|
2579
|
+
file_auto_generate = True
|
|
2580
|
+
|
|
2581
|
+
plug = GroqPlugger(
|
|
2582
|
+
args_copy.input,
|
|
2583
|
+
args_copy.output,
|
|
2584
|
+
args_copy.outputFormat,
|
|
2585
|
+
args_copy.swears,
|
|
2586
|
+
args_copy.groqApiKey,
|
|
2587
|
+
args_copy.groqModel,
|
|
2588
|
+
args_copy.outputJson,
|
|
2589
|
+
inputTranscript=args_copy.inputTranscript,
|
|
2590
|
+
saveTranscript=args_copy.saveTranscript,
|
|
2591
|
+
forceRetranscribe=args_copy.forceRetranscribe,
|
|
2592
|
+
aParams=args_copy.aParams,
|
|
2593
|
+
aChannels=args_copy.aChannels,
|
|
2594
|
+
aSampleRate=args_copy.aSampleRate,
|
|
2595
|
+
aBitRate=args_copy.aBitRate,
|
|
2596
|
+
aVorbisQscale=args_copy.aVorbisQscale,
|
|
2597
|
+
padMsecPre=args_copy.padMsecPre if args_copy.padMsecPre > 0 else args_copy.padMsec,
|
|
2598
|
+
padMsecPost=args_copy.padMsecPost if args_copy.padMsecPost > 0 else args_copy.padMsec,
|
|
2599
|
+
beep=args_copy.beep,
|
|
2600
|
+
beepHertz=args_copy.beepHertz,
|
|
2601
|
+
beepMixNormalize=args_copy.beepMixNormalize,
|
|
2602
|
+
beepAudioWeight=args_copy.beepAudioWeight,
|
|
2603
|
+
beepSineWeight=args_copy.beepSineWeight,
|
|
2604
|
+
beepDropTransition=args_copy.beepDropTransition,
|
|
2605
|
+
force=args_copy.forceDespiteTag,
|
|
2606
|
+
dbug=args_copy.debug,
|
|
2607
|
+
instrumentalFileSpec=args_copy.instrumentalFile,
|
|
2608
|
+
verbose_level=args_copy.verbose_level if hasattr(args_copy, 'verbose_level') else "",
|
|
2609
|
+
auto_generate=file_auto_generate,
|
|
2610
|
+
separation_padding=args_copy.separationPadding,
|
|
2611
|
+
)
|
|
2612
|
+
|
|
2613
|
+
print(plug.EncodeCleanAudio())
|
|
2614
|
+
|
|
2615
|
+
mmguero.eprint(f'\n✓ Completed processing {len(vocal_files)} file(s)')
|
|
2616
|
+
mmguero.eprint(f'Skipped {len(instrumental_files)} instrumental file(s)')
|
|
2617
|
+
sys.exit(0)
|
|
2618
|
+
|
|
2619
|
+
# Single file mode (no wildcards or not using Groq mode)
|
|
2620
|
+
# Find instrumental file if prefix is specified
|
|
2621
|
+
if args.instrumentalPrefix and not args.instrumentalFile:
|
|
2622
|
+
import glob
|
|
2623
|
+
from difflib import SequenceMatcher
|
|
2624
|
+
|
|
2625
|
+
input_dir = os.path.dirname(args.input)
|
|
2626
|
+
if not input_dir:
|
|
2627
|
+
input_dir = '.'
|
|
2628
|
+
|
|
2629
|
+
input_basename = os.path.basename(args.input)
|
|
2630
|
+
input_name, input_ext = os.path.splitext(input_basename)
|
|
2631
|
+
|
|
2632
|
+
# AUTO mode - fuzzy matching
|
|
2633
|
+
if args.instrumentalPrefix.upper() == 'AUTO':
|
|
2634
|
+
if args.debug:
|
|
2635
|
+
mmguero.eprint(f'AUTO mode: Searching for instrumental file using fuzzy matching')
|
|
2636
|
+
|
|
2637
|
+
# Get all audio files in the directory
|
|
2638
|
+
audio_extensions = ['.mp3', '.mp4', '.m4a', '.wav', '.flac', '.ogg', '.aac', '.wma']
|
|
2639
|
+
all_files = []
|
|
2640
|
+
|
|
2641
|
+
for ext in audio_extensions:
|
|
2642
|
+
all_files.extend(glob.glob(os.path.join(input_dir, f'*{ext}')))
|
|
2643
|
+
|
|
2644
|
+
# Filter out the input file itself and the output file
|
|
2645
|
+
output_basename = os.path.basename(args.output) if args.output else None
|
|
2646
|
+
other_files = []
|
|
2647
|
+
for f in all_files:
|
|
2648
|
+
basename = os.path.basename(f)
|
|
2649
|
+
# Skip input file
|
|
2650
|
+
if basename == input_basename:
|
|
2651
|
+
continue
|
|
2652
|
+
# Skip exact output file match if specified
|
|
2653
|
+
if output_basename and basename == output_basename:
|
|
2654
|
+
continue
|
|
2655
|
+
other_files.append(f)
|
|
2656
|
+
|
|
2657
|
+
if not other_files:
|
|
2658
|
+
mmguero.eprint(f'Warning: AUTO mode found no other audio files in directory')
|
|
2659
|
+
else:
|
|
2660
|
+
# Two-way fuzzy matching with validation
|
|
2661
|
+
# Step 1: Find top N candidates by similarity to input
|
|
2662
|
+
candidates_with_scores = []
|
|
2663
|
+
for candidate in other_files:
|
|
2664
|
+
candidate_basename = os.path.basename(candidate)
|
|
2665
|
+
candidate_name, _ = os.path.splitext(candidate_basename)
|
|
2666
|
+
|
|
2667
|
+
# Calculate similarity ratio (0 to 1)
|
|
2668
|
+
ratio = SequenceMatcher(None, input_name.lower(), candidate_name.lower()).ratio()
|
|
2669
|
+
|
|
2670
|
+
if args.debug:
|
|
2671
|
+
mmguero.eprint(f' {candidate_basename}: similarity={ratio:.3f}')
|
|
2672
|
+
|
|
2673
|
+
if ratio < 1.0: # Don't match the exact same file
|
|
2674
|
+
candidates_with_scores.append((candidate, ratio))
|
|
2675
|
+
|
|
2676
|
+
# Sort by score descending, take top N
|
|
2677
|
+
candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
|
|
2678
|
+
top_candidates = candidates_with_scores[:args.instrumentalAutoCandidates]
|
|
2679
|
+
|
|
2680
|
+
if args.debug and top_candidates:
|
|
2681
|
+
mmguero.eprint(f'Top {len(top_candidates)} candidates: {[os.path.basename(c[0]) for c in top_candidates]}')
|
|
2682
|
+
|
|
2683
|
+
# Step 2: Validate each candidate with two-way check
|
|
2684
|
+
validated_candidates = []
|
|
2685
|
+
for candidate, candidate_to_input_score in top_candidates:
|
|
2686
|
+
candidate_basename = os.path.basename(candidate)
|
|
2687
|
+
candidate_name, _ = os.path.splitext(candidate_basename)
|
|
2688
|
+
|
|
2689
|
+
# Find candidate's best match among ALL files (except input and itself)
|
|
2690
|
+
best_other_score = 0.0
|
|
2691
|
+
best_other_match = None
|
|
2692
|
+
|
|
2693
|
+
for other_file in all_files:
|
|
2694
|
+
other_basename = os.path.basename(other_file)
|
|
2695
|
+
if other_basename != input_basename and other_basename != candidate_basename:
|
|
2696
|
+
other_name, _ = os.path.splitext(other_basename)
|
|
2697
|
+
|
|
2698
|
+
# Calculate similarity between candidate and this other file
|
|
2699
|
+
other_score = SequenceMatcher(None, candidate_name.lower(), other_name.lower()).ratio()
|
|
2700
|
+
|
|
2701
|
+
if other_score > best_other_score:
|
|
2702
|
+
best_other_score = other_score
|
|
2703
|
+
best_other_match = other_basename
|
|
2704
|
+
|
|
2705
|
+
# Validation: candidate must be more similar to input than to any other file
|
|
2706
|
+
if args.debug:
|
|
2707
|
+
mmguero.eprint(f' Validating {candidate_basename}:')
|
|
2708
|
+
mmguero.eprint(f' to input: {candidate_to_input_score:.3f}')
|
|
2709
|
+
mmguero.eprint(f' to best other ({best_other_match}): {best_other_score:.3f}')
|
|
2710
|
+
|
|
2711
|
+
if candidate_to_input_score > best_other_score:
|
|
2712
|
+
validated_candidates.append((candidate, candidate_to_input_score))
|
|
2713
|
+
if args.debug:
|
|
2714
|
+
mmguero.eprint(f' ✓ PASSED validation')
|
|
2715
|
+
else:
|
|
2716
|
+
if args.debug:
|
|
2717
|
+
mmguero.eprint(f' ✗ FAILED validation (better match with {best_other_match})')
|
|
2718
|
+
|
|
2719
|
+
# Step 3: Use best validated candidate
|
|
2720
|
+
if validated_candidates:
|
|
2721
|
+
best_match, best_ratio = validated_candidates[0] # Already sorted by score
|
|
2722
|
+
if best_ratio >= 0.3: # 30% similarity threshold
|
|
2723
|
+
args.instrumentalFile = best_match
|
|
2724
|
+
if args.debug:
|
|
2725
|
+
mmguero.eprint(f'AUTO mode matched: {os.path.basename(best_match)} (similarity: {best_ratio:.3f})')
|
|
2726
|
+
else:
|
|
2727
|
+
# Auto mode: no valid match found, will use AI generation
|
|
2728
|
+
if auto_mode_requested:
|
|
2729
|
+
mmguero.eprint(f'Warning: AUTO mode found candidates but all below 30% threshold')
|
|
2730
|
+
mmguero.eprint(f'Best validated match was {os.path.basename(best_match)} with similarity {best_ratio:.3f}')
|
|
2731
|
+
mmguero.eprint(f'Auto mode: Will use AI to generate instrumental')
|
|
2732
|
+
else:
|
|
2733
|
+
mmguero.eprint(f'Warning: AUTO mode found candidates but all below 30% threshold')
|
|
2734
|
+
mmguero.eprint(f'Best validated match was {os.path.basename(best_match)} with similarity {best_ratio:.3f}')
|
|
2735
|
+
mmguero.eprint(f'No instrumental file found, will use AI generation')
|
|
2736
|
+
else:
|
|
2737
|
+
# Auto mode: all candidates failed validation, will use AI generation
|
|
2738
|
+
if auto_mode_requested:
|
|
2739
|
+
mmguero.eprint(f'Warning: AUTO mode could not find a validated instrumental file')
|
|
2740
|
+
mmguero.eprint(f'All top candidates failed two-way validation (likely belong to other songs)')
|
|
2741
|
+
mmguero.eprint(f'Auto mode: Will use AI to generate instrumental')
|
|
2742
|
+
else:
|
|
2743
|
+
mmguero.eprint(f'Warning: AUTO mode could not find a validated instrumental file')
|
|
2744
|
+
mmguero.eprint(f'All top candidates failed two-way validation (likely belong to other songs)')
|
|
2745
|
+
mmguero.eprint(f'No instrumental file found, will use AI generation')
|
|
2746
|
+
else:
|
|
2747
|
+
# Pattern-based search with specified prefix
|
|
2748
|
+
# Common patterns to search for
|
|
2749
|
+
patterns = [
|
|
2750
|
+
f"{input_name}_{args.instrumentalPrefix}{input_ext}", # song_instrumental.mp3
|
|
2751
|
+
f"{input_name}-{args.instrumentalPrefix}{input_ext}", # song-instrumental.mp3
|
|
2752
|
+
f"{input_name}{args.instrumentalPrefix}{input_ext}", # songinstrumental.mp3
|
|
2753
|
+
f"{args.instrumentalPrefix}_{input_name}{input_ext}", # instrumental_song.mp3
|
|
2754
|
+
f"{args.instrumentalPrefix}-{input_name}{input_ext}", # instrumental-song.mp3
|
|
2755
|
+
]
|
|
2756
|
+
|
|
2757
|
+
if args.debug:
|
|
2758
|
+
mmguero.eprint(f'Searching for instrumental file with prefix: {args.instrumentalPrefix}')
|
|
2759
|
+
mmguero.eprint(f'Patterns: {patterns}')
|
|
2760
|
+
|
|
2761
|
+
found = False
|
|
2762
|
+
for pattern in patterns:
|
|
2763
|
+
search_path = os.path.join(input_dir, pattern)
|
|
2764
|
+
matches = glob.glob(search_path)
|
|
2765
|
+
if matches:
|
|
2766
|
+
args.instrumentalFile = matches[0]
|
|
2767
|
+
found = True
|
|
2768
|
+
if args.debug:
|
|
2769
|
+
mmguero.eprint(f'Found instrumental file: {args.instrumentalFile}')
|
|
2770
|
+
break
|
|
2771
|
+
|
|
2772
|
+
if not found:
|
|
2773
|
+
mmguero.eprint(f'Warning: Could not find instrumental file matching prefix "{args.instrumentalPrefix}"')
|
|
2774
|
+
mmguero.eprint(f'Searched for patterns: {patterns}')
|
|
2775
|
+
# If auto mode was requested, enable AI generation
|
|
2776
|
+
if auto_mode_requested:
|
|
2777
|
+
auto_generate = True
|
|
2778
|
+
mmguero.eprint(f'Auto mode: No instrumental found, will use AI to generate instrumental')
|
|
2779
|
+
else:
|
|
2780
|
+
mmguero.eprint(f'Will use AI to generate instrumental instead')
|
|
2781
|
+
|
|
2782
|
+
# Single file mode: check if we should enable auto_generate after search
|
|
2783
|
+
# If auto mode was requested and no file was found, enable generation
|
|
2784
|
+
if auto_mode_requested and not args.instrumentalFile and not auto_generate:
|
|
2785
|
+
auto_generate = True
|
|
2786
|
+
if args.debug:
|
|
2787
|
+
mmguero.eprint('Auto mode: No instrumental file found, enabling AI generation')
|
|
2788
|
+
|
|
2789
|
+
if args.speechRecMode == SPEECH_REC_MODE_VOSK:
|
|
2790
|
+
pathlib.Path(args.voskModelDir).mkdir(parents=True, exist_ok=True)
|
|
2791
|
+
plug = VoskPlugger(
|
|
2792
|
+
args.input,
|
|
2793
|
+
args.output,
|
|
2794
|
+
args.outputFormat,
|
|
2795
|
+
args.swears,
|
|
2796
|
+
args.voskModelDir,
|
|
2797
|
+
args.outputJson,
|
|
2798
|
+
inputTranscript=args.inputTranscript,
|
|
2799
|
+
saveTranscript=args.saveTranscript,
|
|
2800
|
+
forceRetranscribe=args.forceRetranscribe,
|
|
2801
|
+
aParams=args.aParams,
|
|
2802
|
+
aChannels=args.aChannels,
|
|
2803
|
+
aSampleRate=args.aSampleRate,
|
|
2804
|
+
aBitRate=args.aBitRate,
|
|
2805
|
+
aVorbisQscale=args.aVorbisQscale,
|
|
2806
|
+
wChunk=args.voskReadFramesChunk,
|
|
2807
|
+
padMsecPre=args.padMsecPre if args.padMsecPre > 0 else args.padMsec,
|
|
2808
|
+
padMsecPost=args.padMsecPost if args.padMsecPost > 0 else args.padMsec,
|
|
2809
|
+
beep=args.beep,
|
|
2810
|
+
beepHertz=args.beepHertz,
|
|
2811
|
+
beepMixNormalize=args.beepMixNormalize,
|
|
2812
|
+
beepAudioWeight=args.beepAudioWeight,
|
|
2813
|
+
beepSineWeight=args.beepSineWeight,
|
|
2814
|
+
beepDropTransition=args.beepDropTransition,
|
|
2815
|
+
force=args.forceDespiteTag,
|
|
2816
|
+
dbug=args.debug,
|
|
2817
|
+
)
|
|
2818
|
+
|
|
2819
|
+
elif args.speechRecMode == SPEECH_REC_MODE_WHISPER:
|
|
2820
|
+
pathlib.Path(args.whisperModelDir).mkdir(parents=True, exist_ok=True)
|
|
2821
|
+
plug = WhisperPlugger(
|
|
2822
|
+
args.input,
|
|
2823
|
+
args.output,
|
|
2824
|
+
args.outputFormat,
|
|
2825
|
+
args.swears,
|
|
2826
|
+
args.whisperModelDir,
|
|
2827
|
+
args.whisperModelName,
|
|
2828
|
+
args.torchThreads,
|
|
2829
|
+
args.outputJson,
|
|
2830
|
+
inputTranscript=args.inputTranscript,
|
|
2831
|
+
saveTranscript=args.saveTranscript,
|
|
2832
|
+
forceRetranscribe=args.forceRetranscribe,
|
|
2833
|
+
aParams=args.aParams,
|
|
2834
|
+
aChannels=args.aChannels,
|
|
2835
|
+
aSampleRate=args.aSampleRate,
|
|
2836
|
+
aBitRate=args.aBitRate,
|
|
2837
|
+
aVorbisQscale=args.aVorbisQscale,
|
|
2838
|
+
padMsecPre=args.padMsecPre if args.padMsecPre > 0 else args.padMsec,
|
|
2839
|
+
padMsecPost=args.padMsecPost if args.padMsecPost > 0 else args.padMsec,
|
|
2840
|
+
beep=args.beep,
|
|
2841
|
+
beepHertz=args.beepHertz,
|
|
2842
|
+
beepMixNormalize=args.beepMixNormalize,
|
|
2843
|
+
beepAudioWeight=args.beepAudioWeight,
|
|
2844
|
+
beepSineWeight=args.beepSineWeight,
|
|
2845
|
+
beepDropTransition=args.beepDropTransition,
|
|
2846
|
+
force=args.forceDespiteTag,
|
|
2847
|
+
dbug=args.debug,
|
|
2848
|
+
)
|
|
2849
|
+
|
|
2850
|
+
elif args.speechRecMode == SPEECH_REC_MODE_GROQ:
|
|
2851
|
+
plug = GroqPlugger(
|
|
2852
|
+
args.input,
|
|
2853
|
+
args.output,
|
|
2854
|
+
args.outputFormat,
|
|
2855
|
+
args.swears,
|
|
2856
|
+
args.groqApiKey,
|
|
2857
|
+
args.groqModel,
|
|
2858
|
+
args.outputJson,
|
|
2859
|
+
inputTranscript=args.inputTranscript,
|
|
2860
|
+
saveTranscript=args.saveTranscript,
|
|
2861
|
+
forceRetranscribe=args.forceRetranscribe,
|
|
2862
|
+
aParams=args.aParams,
|
|
2863
|
+
aChannels=args.aChannels,
|
|
2864
|
+
aSampleRate=args.aSampleRate,
|
|
2865
|
+
aBitRate=args.aBitRate,
|
|
2866
|
+
aVorbisQscale=args.aVorbisQscale,
|
|
2867
|
+
padMsecPre=args.padMsecPre if args.padMsecPre > 0 else args.padMsec,
|
|
2868
|
+
padMsecPost=args.padMsecPost if args.padMsecPost > 0 else args.padMsec,
|
|
2869
|
+
beep=args.beep,
|
|
2870
|
+
beepHertz=args.beepHertz,
|
|
2871
|
+
beepMixNormalize=args.beepMixNormalize,
|
|
2872
|
+
beepAudioWeight=args.beepAudioWeight,
|
|
2873
|
+
beepSineWeight=args.beepSineWeight,
|
|
2874
|
+
beepDropTransition=args.beepDropTransition,
|
|
2875
|
+
force=args.forceDespiteTag,
|
|
2876
|
+
dbug=args.debug,
|
|
2877
|
+
instrumentalFileSpec=args.instrumentalFile,
|
|
2878
|
+
verbose_level=args.verbose_level if hasattr(args, 'verbose_level') else "",
|
|
2879
|
+
auto_generate=auto_generate,
|
|
2880
|
+
separation_padding=args.separationPadding,
|
|
2881
|
+
)
|
|
2882
|
+
else:
|
|
2883
|
+
raise ValueError(f"Unsupported speech recognition engine {args.speechRecMode}")
|
|
2884
|
+
|
|
2885
|
+
print(plug.EncodeCleanAudio())
|
|
2886
|
+
|
|
2887
|
+
sys.exit(0)
|
|
2888
|
+
|
|
2889
|
+
|
|
2890
|
+
###################################################################################################
|
|
2891
|
+
# Script entry point: run the command-line driver only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    RunMonkeyPlug()
|