GameSentenceMiner 2.9.0__py3-none-any.whl → 2.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
GameSentenceMiner/gsm.py CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+import shutil
 import sys

 from GameSentenceMiner.vad.result import VADResult
@@ -59,39 +60,28 @@ root = None


 class VideoToAudioHandler(FileSystemEventHandler):
+    def __init__(self):
+        super().__init__()
+
+
     def on_created(self, event):
         if event.is_directory or ("Replay" not in event.src_path and "GSM" not in event.src_path):
             return
         if event.src_path.endswith(".mkv") or event.src_path.endswith(".mp4"):  # Adjust based on your OBS output format
             logger.info(f"MKV {event.src_path} FOUND, RUNNING LOGIC")
             wait_for_stable_file(event.src_path)
-            self.convert_to_audio(event.src_path)
+            self.process_replay(event.src_path)

-    @staticmethod
-    def convert_to_audio(video_path):
+    def process_replay(self, video_path):
         vad_trimmed_audio = ''
-        try:
-            if texthooking_page.event_manager.line_for_audio:
-                line: GameLine = texthooking_page.event_manager.line_for_audio
-                texthooking_page.event_manager.line_for_audio = None
-                if get_config().advanced.audio_player_path:
-                    audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path, temporary=True)
-                    play_audio_in_external(audio)
-                    os.remove(video_path)
-                elif get_config().advanced.video_player_path:
-                    play_video_in_external(line, video_path)
-                return
-            if texthooking_page.event_manager.line_for_screenshot:
-                line: GameLine = texthooking_page.event_manager.line_for_screenshot
-                texthooking_page.event_manager.line_for_screenshot = None
-                screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
-                os.startfile(screenshot)
-                os.remove(video_path)
-                return
-        except Exception as e:
-            logger.error(f"Error Playing Audio/Video: {e}")
-            logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
+        print(video_path)
+        if "previous.mkv" in video_path:
             os.remove(video_path)
+            video_path = gsm_state.previous_replay
+        else:
+            gsm_state.previous_replay = video_path
+        if gsm_state.line_for_audio or gsm_state.line_for_screenshot:
+            self.handle_texthooker_button(video_path)
             return
         try:
             if anki.card_queue and len(anki.card_queue) > 0:
@@ -148,7 +138,7 @@ class VideoToAudioHandler(FileSystemEventHandler):
                                                     mined_line=mined_line)
             else:
                 final_audio_output = ""
-                vad_result = VADResult(False, 0, 0)
+                vad_result = VADResult(False, 0, 0, '')
                 vad_trimmed_audio = ""
                 if not get_config().audio.enabled:
                     logger.info("Audio is disabled in config, skipping audio processing!")
@@ -184,6 +174,43 @@ class VideoToAudioHandler(FileSystemEventHandler):
         if vad_trimmed_audio and get_config().paths.remove_audio and os.path.exists(vad_trimmed_audio):
             os.remove(vad_trimmed_audio)  # Optionally remove the screenshot after conversion

+    def handle_texthooker_button(self, video_path):
+        try:
+            if gsm_state.line_for_audio:
+                line: GameLine = gsm_state.line_for_audio
+                gsm_state.line_for_audio = None
+                if line == gsm_state.previous_line_for_audio:
+                    logger.info("Line is the same as the last one, skipping processing.")
+                    if get_config().advanced.audio_player_path:
+                        play_audio_in_external(gsm_state.previous_audio)
+                    elif get_config().advanced.video_player_path:
+                        play_video_in_external(line, gsm_state.previous_audio)
+                    return
+                gsm_state.previous_line_for_audio = line
+                if get_config().advanced.audio_player_path:
+                    audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path,
+                                                          temporary=True)
+                    play_audio_in_external(audio)
+                    gsm_state.previous_audio = audio
+                elif get_config().advanced.video_player_path:
+                    new_video_path = play_video_in_external(line, video_path)
+                    gsm_state.previous_audio = new_video_path
+                    gsm_state.previous_replay = new_video_path
+                return
+            if gsm_state.line_for_screenshot:
+                line: GameLine = gsm_state.line_for_screenshot
+                gsm_state.line_for_screenshot = None
+                gsm_state.previous_line_for_screenshot = line
+                screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
+                os.startfile(screenshot)
+                return
+        except Exception as e:
+            logger.error(f"Error Playing Audio/Video: {e}")
+            logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
+            return
+        finally:
+            if video_path and get_config().paths.remove_video and os.path.exists(video_path):
+                os.remove(video_path)

     @staticmethod
     def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False, mined_line=None):
@@ -195,9 +222,8 @@ class VideoToAudioHandler(FileSystemEventHandler):
             f"{os.path.abspath(configuration.get_temporary_directory())}/{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}")
         final_audio_output = make_unique_file_name(os.path.join(get_config().paths.audio_destination,
                                                                 f"{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}"))
-        result = VADResult(False, 0, 0)
+        result = VADResult(False, 0, 0, "")
         if get_config().vad.do_vad_postprocessing:
-            logger.info("Trimming audio with Voice Detection...")
             result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio, game_line=mined_line)
             if not result.success:
                 result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
@@ -209,6 +235,8 @@ class VideoToAudioHandler(FileSystemEventHandler):
                 else:
                     logger.info("No voice activity detected.")
                     return None, result, None
+        else:
+            logger.info(result.trim_successful_string())
         if timing_only:
             return result
         if get_config().audio.ffmpeg_reencode_options and os.path.exists(vad_trimmed_audio):
@@ -223,6 +251,9 @@ def do_vad_processing(model, trimmed_audio, vad_trimmed_audio, game_line=None, s
     match model:
         case configuration.OFF:
             pass
+        case configuration.GROQ:
+            from GameSentenceMiner.vad import groq_trim
+            return groq_trim.process_audio_with_groq(trimmed_audio, vad_trimmed_audio, game_line)
         case configuration.SILERO:
             from GameSentenceMiner.vad import silero_trim
             return silero_trim.process_audio_with_silero(trimmed_audio, vad_trimmed_audio, game_line)
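The new GROQ case slots into the existing match-based dispatch in do_vad_processing. A minimal sketch of driving it directly, not part of the package; the audio paths are placeholders and it assumes a Groq API key is configured:

from GameSentenceMiner import configuration
from GameSentenceMiner.gsm import do_vad_processing

# Dispatches to groq_trim.process_audio_with_groq via the GROQ case above.
result = do_vad_processing(configuration.GROQ, "line.opus", "line_trimmed.opus", game_line=None)
if result.success:
    # e.g. "Trimmed audio from 0.52 to 3.10 seconds using <model>."
    print(result.trim_successful_string())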
@@ -239,7 +270,7 @@ def play_audio_in_external(filepath):

     filepath = os.path.normpath(filepath)

-    command = [exe, filepath]
+    command = [exe, "--no-video", filepath]

     try:
         subprocess.Popen(command)
@@ -248,10 +279,13 @@ def play_audio_in_external(filepath):
         print(f"An error occurred: {e}")

 def play_video_in_external(line, filepath):
-    def remove_video_when_closed(p, fp):
+    def move_video_when_closed(p, fp):
         p.wait()
         os.remove(fp)

+    shutil.move(filepath, get_temporary_directory())
+    new_filepath = os.path.join(get_temporary_directory(), os.path.basename(filepath))
+
     command = [get_config().advanced.video_player_path]

     start, _, _ = get_video_timings(filepath, line)
@@ -265,14 +299,17 @@ def play_video_in_external(line, filepath):

     logger.info(" ".join(command))

+
+
     try:
         proc = subprocess.Popen(command)
         print(f"Opened {filepath} in {get_config().advanced.video_player_path}.")
-        threading.Thread(target=remove_video_when_closed, args=(proc, filepath)).start()
+        threading.Thread(target=move_video_when_closed, args=(proc, filepath)).start()
     except FileNotFoundError:
         print("VLC not found. Make sure it's installed and in your PATH.")
     except Exception as e:
         print(f"An error occurred: {e}")
+    return new_filepath

 def convert_to_vlc_seconds(time_str):
     """Converts HH:MM:SS.milliseconds to VLC-compatible seconds."""
@@ -326,21 +363,25 @@ def get_screenshot():
         logger.error(f"Failed to get Screenshot: {e}")


-def create_image():
-    """Create a simple pickaxe icon."""
-    width, height = 64, 64
-    image = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # Transparent background
-    draw = ImageDraw.Draw(image)
-
-    # Handle (rectangle)
-    handle_color = (139, 69, 19)  # Brown color
-    draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
+# def create_image():
+#     """Create a simple pickaxe icon."""
+#     width, height = 64, 64
+#     image = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # Transparent background
+#     draw = ImageDraw.Draw(image)
+#
+#     # Handle (rectangle)
+#     handle_color = (139, 69, 19)  # Brown color
+#     draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
+#
+#     # Blade (triangle-like shape)
+#     blade_color = (192, 192, 192)  # Silver color
+#     draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
+#
+#     return image

-    # Blade (triangle-like shape)
-    blade_color = (192, 192, 192)  # Silver color
-    draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
-
-    return image
+def create_image():
+    image_path = os.path.join(os.path.dirname(__file__), "assets", "pickaxe.png")
+    return Image.open(image_path)


 def open_settings():
@@ -351,7 +392,7 @@ def open_settings():
 def play_most_recent_audio():
     if get_config().advanced.audio_player_path or get_config().advanced.video_player_path and len(
             get_all_lines()) > 0:
-        texthooking_page.event_manager.line_for_audio = get_all_lines()[-1]
+        gsm_state.line_for_audio = get_all_lines()[-1]
         obs.save_replay_buffer()
     else:
         logger.error("Feature Disabled. No audio or video player path set in config!")
@@ -405,7 +446,7 @@ def update_icon(profile=None):
     )

     menu = Menu(
-        MenuItem("Open Settings", open_settings),
+        MenuItem("Open Settings", open_settings, default=True),
         MenuItem("Open Multi-Mine GUI", open_multimine),
         MenuItem("Open Log", open_log),
         MenuItem("Toggle Replay Buffer", play_pause),
@@ -442,7 +483,7 @@ def run_tray():
     )

     menu = Menu(
-        MenuItem("Open Settings", open_settings),
+        MenuItem("Open Settings", open_settings, default=True),
         MenuItem("Open Texthooker", texthooking_page.open_texthooker),
         MenuItem("Open Log", open_log),
         MenuItem("Toggle Replay Buffer", play_pause),
@@ -451,7 +492,7 @@ def run_tray():
         MenuItem("Exit", exit_program)
     )

-    icon = Icon("TrayApp", create_image(), "Game Sentence Miner", menu)
+    icon = Icon("TrayApp", create_image(), "GameSentenceMiner", menu)
     icon.run()


@@ -575,6 +616,18 @@ def handle_websocket_message(message: Message):
             close_obs()
         case FunctionName.START_OBS:
             obs.start_obs()
+        case FunctionName.OPEN_SETTINGS:
+            open_settings()
+        case FunctionName.OPEN_TEXTHOOKER:
+            texthooking_page.open_texthooker()
+        case FunctionName.OPEN_LOG:
+            open_log()
+        case FunctionName.TOGGLE_REPLAY_BUFFER:
+            play_pause(None, None)
+        case FunctionName.RESTART_OBS:
+            restart_obs()
+        case FunctionName.EXIT:
+            exit_program(None, None)
         case _:
             logger.debug(f"unknown message from electron websocket: {message.to_json()}")

@@ -627,7 +680,6 @@ async def register_scene_switcher_callback():
         settings_window.reload_settings()
         update_icon()

-    logger.info("Registering scene switcher callback")
     await obs.register_scene_change_callback(scene_switcher_callback)

 async def main(reloading=False):
@@ -655,8 +707,8 @@ async def main(reloading=False):


     try:
-        # if get_config().general.open_config_on_startup:
-        #     root.after(0, settings_window.show)
+        if get_config().general.open_config_on_startup:
+            root.after(50, settings_window.show)
         settings_window.add_save_hook(update_icon)
         settings_window.on_exit = exit_program
         root.mainloop()
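The re-enabled startup block uses Tkinter's after rather than calling settings_window.show directly, so the window opens once the main loop is processing events. A standalone illustration of that scheduling pattern (plain Tkinter, outside GSM):

import tkinter as tk

root = tk.Tk()
# after(ms, callback) queues the callback on the Tk event loop;
# it fires roughly 50 ms after mainloop() starts pumping events.
root.after(50, lambda: print("settings window would be shown here"))
root.mainloop()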
GameSentenceMiner/obs.py CHANGED
@@ -99,12 +99,12 @@ async def check_obs_folder_is_correct():
     obs_record_directory = get_record_directory()
     if obs_record_directory and os.path.normpath(obs_record_directory) != os.path.normpath(
             get_config().paths.folder_to_watch):
-        logger.info("OBS Path Setting wrong, OBS Recording folder in GSM Config")
+        logger.info("OBS Path wrong, Setting OBS Recording folder in GSM Config...")
         get_config().paths.folder_to_watch = os.path.normpath(obs_record_directory)
         get_master_config().sync_shared_fields()
         save_full_config(get_master_config())
     else:
-        logger.info("OBS Recording path looks correct")
+        logger.debug("OBS Recording path looks correct")


 def get_obs_websocket_config_values():
@@ -194,8 +194,8 @@ class WebsocketServerThread(threading.Thread):
         self._stop_event = stop_event = asyncio.Event()
         self._event.set()
         self.server = start_server = websockets.serve(self.server_handler,
-                                                      get_config().general.websocket_uri.split(":")[0],
-                                                      get_config().general.websocket_uri.split(":")[1],
+                                                      "0.0.0.0",
+                                                      get_config().advanced.ocr_websocket_port,
                                                       max_size=1000000000)
         async with start_server:
             await stop_event.wait()
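The OCR websocket server now binds to all interfaces on a dedicated advanced.ocr_websocket_port instead of parsing a host and port out of general.websocket_uri. A hypothetical client sketch; 9002 is a placeholder for whatever the port option is set to:

import asyncio
import websockets

async def listen():
    # Connect to the OCR websocket server started by WebsocketServerThread above.
    async with websockets.connect("ws://localhost:9002") as ws:
        async for message in ws:
            print(message)

asyncio.run(listen())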
@@ -313,20 +313,15 @@ def text_callback(text, orig_text, time, img=None, came_from_ss=False, filtering
 done = False


-def run_oneocr(ocr_config: OCRConfig, area=False):
+def run_oneocr(ocr_config: OCRConfig, rectangles):
     global done
     print("Running OneOCR")
     screen_area = None
     screen_areas = []
+    exclusions = []
     if not ssonly:
-        for rect_config in ocr_config.rectangles:
-            if not rect_config.is_excluded:
-                coords = rect_config.coordinates
-                monitor_config = rect_config.monitor
-                screen_area = ",".join(str(c) for c in coords) if area else None
-                if screen_area:
-                    screen_areas.append(screen_area)
-        exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, ocr_config.rectangles)))
+        screen_areas = [",".join(str(c) for c in rect_config.coordinates) for rect_config in rectangles if not rect_config.is_excluded]
+        exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, rectangles)))

     run.init_config(False)
     run.run(read_from="screencapture" if not ssonly else "clipboard",
@@ -334,13 +329,13 @@ def run_oneocr(ocr_config: OCRConfig, area=False):
             write_to="callback",
             screen_capture_area=screen_area,
             # screen_capture_monitor=monitor_config['index'],
-            screen_capture_window=ocr_config.window,
+            screen_capture_window=ocr_config.window if ocr_config and ocr_config.window else None,
             screen_capture_only_active_windows=get_requires_open_window(),
             screen_capture_delay_secs=get_ocr_scan_rate(), engine=ocr1,
             text_callback=text_callback,
             screen_capture_exclusions=exclusions,
             language=language,
-            monitor_index=ocr_config.window,
+            monitor_index=None,
             ocr1=ocr1,
             ocr2=ocr2,
             gsm_ocr_config=ocr_config,
@@ -380,7 +375,7 @@ if __name__ == "__main__":
     import sys

     args = sys.argv[1:]
-    if len(args) == 4:
+    if len(args) >= 4:
         language = args[0]
         ocr1 = args[1]
         ocr2 = args[2]
@@ -418,22 +413,19 @@ if __name__ == "__main__":
     else:
         logger.error(f"Window '{ocr_config.window}' not found within 30 seconds.")
         sys.exit(1)
-    logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
-    if ocr_config:
-        rectangles = list(filter(lambda rect: not rect.is_excluded, ocr_config.rectangles))
+    logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
+    if ocr_config or ssonly:
+        rectangles = ocr_config.rectangles if ocr_config and ocr_config.rectangles else []
         oneocr_threads = []
-        single_ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config, ocr_config.rectangles), daemon=True)
-        oneocr_threads.append(single_ocr_thread)
-        single_ocr_thread.start()
-        websocket_server_thread = WebsocketServerThread(read=True)
-        websocket_server_thread.start()
+        ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config, rectangles), daemon=True)
+        ocr_thread.start()
+        if not ssonly:
+            websocket_server_thread = WebsocketServerThread(read=True)
+            websocket_server_thread.start()
         try:
             while not done:
                 time.sleep(1)
         except KeyboardInterrupt as e:
             pass
-        for thread in oneocr_threads:
-            thread.join()
-        # asyncio.run(websocket_client())
     else:
         print("Failed to load OCR configuration. Please check the logs.")
@@ -1043,7 +1043,7 @@ class GeminiOCR:
                     }
                 },
                 {
-                    'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
+                    'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
                 }
             ]
         }
@@ -1096,13 +1096,14 @@ class GroqOCR:
             return (False, 'Error processing image for Groq.')

         prompt = (
-            "Analyze this image and extract text from it"
+            "Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
+            # "Analyze this i#mage and extract text from it"
             # "(speech bubbles or panels containing character dialogue). From the extracted dialogue text, "
             # "filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, "
             # "including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. "
             # "If no text is found within dialogue boxes after applying filters, return an empty string. "
            # "OR, if there are no text bubbles or dialogue boxes found, return everything."
-            "Do not include any other output, formatting markers, or commentary, only the text from the image."
+            # "Do not include any other output, formatting markers, or commentary, only the text from the image."
         )

         response = self.client.chat.completions.create(
@@ -125,7 +125,7 @@ def get_text_event(last_note) -> GameLine:
         if lines_match(line.text, remove_html_and_cloze_tags(sentence)):
             return line

-    logger.debug("Couldn't find a match in history, using last event")
+    logger.info("Could not find matching sentence from GSM's history. Using the latest line.")
     return lines[-1]

@@ -0,0 +1,82 @@
+import os
+import tempfile
+import time
+
+from groq import Groq
+
+# Assuming these are available from GameSentenceMiner
+from GameSentenceMiner import configuration, ffmpeg
+from GameSentenceMiner.configuration import get_config, logger, GROQ  # Import specific functions/objects
+from GameSentenceMiner.vad.result import VADResult
+from GameSentenceMiner.vad.vad_utils import get_audio_length
+
+# Initialize Groq Client
+client = Groq(api_key=get_config().ai.groq_api_key)
+
+def detect_voice_with_groq(input_audio_path):
+    """
+    Detects voice activity and extracts speech timestamps using the Groq Whisper API.
+    """
+    try:
+        with open(input_audio_path, "rb") as file:
+            transcription = client.audio.transcriptions.create(
+                file=(os.path.basename(input_audio_path), file.read()),
+                model="whisper-large-v3-turbo",
+                response_format="verbose_json",
+                language=get_config().vad.language,
+                temperature=0.0,
+                timestamp_granularities=["segment"],
+                prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
+            )
+
+        logger.debug(transcription)
+
+        # print(transcription)
+
+        speech_segments = transcription.segments if hasattr(transcription, 'segments') else []
+        # print(f"Groq speech segments: {speech_segments}")
+
+        audio_length = get_audio_length(input_audio_path)
+        # print(f"FFPROBE Length of input audio: {audio_length}")
+
+        return speech_segments, audio_length
+    except Exception as e:
+        logger.error(f"Error detecting voice with Groq: {e}")
+        return [], 0.0
+
+def process_audio_with_groq(input_audio, output_audio, game_line):
+    """
+    Processes an audio file by detecting voice activity using Groq Whisper API,
+    trimming the audio based on detected speech timestamps, and saving the trimmed audio.
+    """
+    start = time.time()
+    voice_activity, audio_length = detect_voice_with_groq(input_audio)
+    logger.info(f"Processing time for Groq: {time.time() - start:.2f} seconds")
+
+    if not voice_activity:
+        logger.info(f"No voice activity detected in {input_audio}")
+        return VADResult(False, 0, 0, GROQ)
+
+    start_time = voice_activity[0]['start']
+    end_time = voice_activity[-1]['end']
+
+    # Logic to potentially use the second-to-last timestamp if a next game line is expected
+    # and there's a significant pause before the very last segment.
+    if (game_line and hasattr(game_line, 'next') and game_line.next and
+            len(voice_activity) > 1 and
+            (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
+        end_time = voice_activity[-2]['end']
+        logger.info("Using the second last timestamp for trimming due to game_line.next and significant pause.")
+
+    # Apply offsets from configuration, ensuring times are within valid bounds
+    final_start_time = max(0, start_time + get_config().vad.beginning_offset)
+    final_end_time = min(audio_length, end_time + get_config().audio.end_offset)
+
+    logger.debug(f"Trimming {input_audio} from {final_start_time:.2f}s to {final_end_time:.2f}s into {output_audio}")
+
+    ffmpeg.trim_audio(input_audio, final_start_time, final_end_time, output_audio)
+
+    return VADResult(True, final_start_time, final_end_time, GROQ)
+
+# Example usage (uncomment and modify with your actual file paths for testing)
+# process_audio_with_groq("tmp6x81cy27.opus", "tmp6x81cy27_trimmed_groq.opus", None)
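If the API call fails, detect_voice_with_groq returns ([], 0.0), so process_audio_with_groq yields a failed VADResult and GSM falls back to a second do_vad_processing attempt, as the get_audio hunk above shows. Calling the module directly looks roughly like this (a sketch; paths are placeholders):

from GameSentenceMiner.vad import groq_trim

# Trims line.opus to the detected speech span and writes line_trimmed.opus.
result = groq_trim.process_audio_with_groq("line.opus", "line_trimmed.opus", None)
print(result)  # VADResult(success=..., start=..., end=..., model=...)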
@@ -1,8 +1,21 @@
+from GameSentenceMiner.configuration import get_config
+
+
 class VADResult:
-    def __init__(self, success: bool, start: float, end: float):
+    def __init__(self, success: bool, start: float, end: float, model: str):
         self.success = success
         self.start = start
         self.end = end
+        self.model = model

     def __repr__(self):
-        return f"VADResult(success={self.success}, start={self.start}, end={self.end})"
+        return f"VADResult(success={self.success}, start={self.start}, end={self.end}, model={self.model})"
+
+    def trim_successful_string(self):
+        if self.success:
+            if get_config().vad.trim_beginning:
+                return f"Trimmed audio from {self.start:.2f} to {self.end:.2f} seconds using {self.model}."
+            else:
+                return f"Trimmed end of audio to {self.end:.2f} seconds using {self.model}."
+        else:
+            return f"Failed to trim audio using {self.model}."
@@ -5,6 +5,7 @@ from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
 from GameSentenceMiner import configuration, ffmpeg
 from GameSentenceMiner.configuration import *
 from GameSentenceMiner.vad.result import VADResult
+from GameSentenceMiner.vad.vad_utils import get_audio_length

 # Silero VAD setup
 vad_model = load_silero_vad()
@@ -17,32 +18,35 @@ def detect_voice_with_silero(input_audio):
     ffmpeg.convert_audio_to_wav(input_audio, temp_wav)

     # Load the audio and detect speech timestamps
-    wav = read_audio(temp_wav, sampling_rate=16000)
+    wav = read_audio(temp_wav)
     speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True)

     logger.debug(speech_timestamps)

     # Return the speech timestamps (start and end in seconds)
-    return speech_timestamps
+    return speech_timestamps, len(wav) / 16000


 # Example usage of Silero with trimming
 def process_audio_with_silero(input_audio, output_audio, game_line):
-    voice_activity = detect_voice_with_silero(input_audio)
+    voice_activity, audio_length = detect_voice_with_silero(input_audio)

     if not voice_activity:
-        return VADResult(False, 0, 0)
+        return VADResult(False, 0, 0, SILERO)

     # Trim based on the first and last speech detected
     start_time = voice_activity[0]['start'] if voice_activity else 0
-    if (game_line.next and len(voice_activity) > 1
-            and voice_activity[-1]['end'] - get_config().audio.beginning_offset > len(input_audio) / 16000
-            and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
-        end_time = voice_activity[-2]['end']
-        logger.info("Using the second last timestamp for trimming")
+    if game_line and game_line.next and len(voice_activity) > 1 and 0 > audio_length - voice_activity[-1]['start'] + get_config().audio.beginning_offset:
+        # and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
+        end_time = voice_activity[-2]['end']
+        logger.info("Using the second last timestamp for trimming")
     else:
         end_time = voice_activity[-1]['end'] if voice_activity else 0

     # Trim the audio using FFmpeg
     ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
-    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
+    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, SILERO)
+
+
+# process_audio_with_silero("tmp6x81cy27.opus", "tmp6x81cy27_trimmed.opus", None)
+# print(detect_voice_with_silero("tmp6x81cy27.opus"))
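Note the changed contract: detect_voice_with_silero now returns (timestamps, audio_length_seconds), with the length derived from the decoded 16 kHz sample count. The old guard's len(input_audio) / 16000 appears to have measured the length of the *path string*, which this release drops. Consuming the new tuple (a sketch; the path is a placeholder):

timestamps, duration = detect_voice_with_silero("clip.opus")
if timestamps:
    print(f"speech from {timestamps[0]['start']:.2f}s to {timestamps[-1]['end']:.2f}s of {duration:.2f}s total")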
@@ -128,7 +128,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):

     if not voice_activity:
         logger.info("No voice activity detected in the audio.")
-        return VADResult(False, 0, 0)
+        return VADResult(False, 0, 0, VOSK)

     # Trim based on the first and last speech detected
     start_time = voice_activity[0]['start'] if voice_activity else 0
@@ -148,7 +148,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):

     # Trim the audio using FFmpeg
     ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
-    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
+    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, VOSK)


 def get_vosk_model():