GameSentenceMiner 2.8.54__py3-none-any.whl → 2.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
GameSentenceMiner/gsm.py CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+import shutil
 import sys
 
 from GameSentenceMiner.vad.result import VADResult
@@ -59,39 +60,28 @@ root = None
 
 
 class VideoToAudioHandler(FileSystemEventHandler):
+    def __init__(self):
+        super().__init__()
+
+
     def on_created(self, event):
         if event.is_directory or ("Replay" not in event.src_path and "GSM" not in event.src_path):
             return
         if event.src_path.endswith(".mkv") or event.src_path.endswith(".mp4"): # Adjust based on your OBS output format
             logger.info(f"MKV {event.src_path} FOUND, RUNNING LOGIC")
             wait_for_stable_file(event.src_path)
-            self.convert_to_audio(event.src_path)
+            self.process_replay(event.src_path)
 
-    @staticmethod
-    def convert_to_audio(video_path):
+    def process_replay(self, video_path):
         vad_trimmed_audio = ''
-        try:
-            if texthooking_page.event_manager.line_for_audio:
-                line: GameLine = texthooking_page.event_manager.line_for_audio
-                texthooking_page.event_manager.line_for_audio = None
-                if get_config().advanced.audio_player_path:
-                    audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path, temporary=True)
-                    play_audio_in_external(audio)
-                    os.remove(video_path)
-                elif get_config().advanced.video_player_path:
-                    play_video_in_external(line, video_path)
-                return
-            if texthooking_page.event_manager.line_for_screenshot:
-                line: GameLine = texthooking_page.event_manager.line_for_screenshot
-                texthooking_page.event_manager.line_for_screenshot = None
-                screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
-                os.startfile(screenshot)
-                os.remove(video_path)
-                return
-        except Exception as e:
-            logger.error(f"Error Playing Audio/Video: {e}")
-            logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
+        print(video_path)
+        if "previous.mkv" in video_path:
             os.remove(video_path)
+            video_path = gsm_state.previous_replay
+        else:
+            gsm_state.previous_replay = video_path
+        if gsm_state.line_for_audio or gsm_state.line_for_screenshot:
+            self.handle_texthooker_button(video_path)
             return
         try:
             if anki.card_queue and len(anki.card_queue) > 0:
@@ -144,10 +134,11 @@ class VideoToAudioHandler(FileSystemEventHandler):
                     start_line,
                     line_cutoff,
                     video_path,
-                    anki_card_creation_time)
+                    anki_card_creation_time,
+                    mined_line=mined_line)
         else:
             final_audio_output = ""
-            vad_result = VADResult(False, 0, 0)
+            vad_result = VADResult(False, 0, 0, '')
             vad_trimmed_audio = ""
             if not get_config().audio.enabled:
                 logger.info("Audio is disabled in config, skipping audio processing!")
@@ -183,9 +174,46 @@ class VideoToAudioHandler(FileSystemEventHandler):
         if vad_trimmed_audio and get_config().paths.remove_audio and os.path.exists(vad_trimmed_audio):
             os.remove(vad_trimmed_audio) # Optionally remove the screenshot after conversion
 
+    def handle_texthooker_button(self, video_path):
+        try:
+            if gsm_state.line_for_audio:
+                line: GameLine = gsm_state.line_for_audio
+                gsm_state.line_for_audio = None
+                if line == gsm_state.previous_line_for_audio:
+                    logger.info("Line is the same as the last one, skipping processing.")
+                    if get_config().advanced.audio_player_path:
+                        play_audio_in_external(gsm_state.previous_audio)
+                    elif get_config().advanced.video_player_path:
+                        play_video_in_external(line, gsm_state.previous_audio)
+                    return
+                gsm_state.previous_line_for_audio = line
+                if get_config().advanced.audio_player_path:
+                    audio = VideoToAudioHandler.get_audio(line, line.next.time if line.next else None, video_path,
+                                                          temporary=True)
+                    play_audio_in_external(audio)
+                    gsm_state.previous_audio = audio
+                elif get_config().advanced.video_player_path:
+                    new_video_path = play_video_in_external(line, video_path)
+                    gsm_state.previous_audio = new_video_path
+                    gsm_state.previous_replay = new_video_path
+                return
+            if gsm_state.line_for_screenshot:
+                line: GameLine = gsm_state.line_for_screenshot
+                gsm_state.line_for_screenshot = None
+                gsm_state.previous_line_for_screenshot = line
+                screenshot = ffmpeg.get_screenshot_for_line(video_path, line, True)
+                os.startfile(screenshot)
+                return
+        except Exception as e:
+            logger.error(f"Error Playing Audio/Video: {e}")
+            logger.debug(f"Error Playing Audio/Video: {e}", exc_info=True)
+            return
+        finally:
+            if video_path and get_config().paths.remove_video and os.path.exists(video_path):
+                os.remove(video_path)
 
     @staticmethod
-    def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False):
+    def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False, mined_line=None):
         logger.info("Getting audio from video...")
         trimmed_audio = get_audio_and_trim(video_path, game_line, next_line_time, anki_card_creation_time)
         if temporary:
@@ -194,13 +222,12 @@ class VideoToAudioHandler(FileSystemEventHandler):
             f"{os.path.abspath(configuration.get_temporary_directory())}/{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}")
         final_audio_output = make_unique_file_name(os.path.join(get_config().paths.audio_destination,
                                                                 f"{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}"))
-        result = VADResult(False, 0, 0)
+        result = VADResult(False, 0, 0, "")
         if get_config().vad.do_vad_postprocessing:
-            logger.info("Trimming audio with Voice Detection...")
-            result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio, game_line=game_line)
+            result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio, game_line=mined_line)
             if not result.success:
                 result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
-                                           vad_trimmed_audio, game_line=game_line)
+                                           vad_trimmed_audio, game_line=mined_line)
             if not result.success:
                 if get_config().vad.add_audio_on_no_results:
                     logger.info("No voice activity detected, using full audio.")
@@ -208,6 +235,8 @@ class VideoToAudioHandler(FileSystemEventHandler):
                 else:
                     logger.info("No voice activity detected.")
                     return None, result, None
+            else:
+                logger.info(result.trim_successful_string())
         if timing_only:
             return result
         if get_config().audio.ffmpeg_reencode_options and os.path.exists(vad_trimmed_audio):
@@ -222,6 +251,9 @@ def do_vad_processing(model, trimmed_audio, vad_trimmed_audio, game_line=None, s
     match model:
         case configuration.OFF:
             pass
+        case configuration.GROQ:
+            from GameSentenceMiner.vad import groq_trim
+            return groq_trim.process_audio_with_groq(trimmed_audio, vad_trimmed_audio, game_line)
         case configuration.SILERO:
             from GameSentenceMiner.vad import silero_trim
             return silero_trim.process_audio_with_silero(trimmed_audio, vad_trimmed_audio, game_line)
@@ -238,7 +270,7 @@ def play_audio_in_external(filepath):
 
     filepath = os.path.normpath(filepath)
 
-    command = [exe, filepath]
+    command = [exe, "--no-video", filepath]
 
     try:
         subprocess.Popen(command)
@@ -247,10 +279,13 @@ def play_audio_in_external(filepath):
         print(f"An error occurred: {e}")
 
 def play_video_in_external(line, filepath):
-    def remove_video_when_closed(p, fp):
+    def move_video_when_closed(p, fp):
         p.wait()
         os.remove(fp)
 
+    shutil.move(filepath, get_temporary_directory())
+    new_filepath = os.path.join(get_temporary_directory(), os.path.basename(filepath))
+
     command = [get_config().advanced.video_player_path]
 
     start, _, _ = get_video_timings(filepath, line)
@@ -264,14 +299,17 @@ def play_video_in_external(line, filepath):
 
     logger.info(" ".join(command))
 
+
+
     try:
         proc = subprocess.Popen(command)
         print(f"Opened {filepath} in {get_config().advanced.video_player_path}.")
-        threading.Thread(target=remove_video_when_closed, args=(proc, filepath)).start()
+        threading.Thread(target=move_video_when_closed, args=(proc, filepath)).start()
     except FileNotFoundError:
         print("VLC not found. Make sure it's installed and in your PATH.")
     except Exception as e:
         print(f"An error occurred: {e}")
+    return new_filepath
 
 def convert_to_vlc_seconds(time_str):
     """Converts HH:MM:SS.milliseconds to VLC-compatible seconds."""
@@ -325,21 +363,25 @@ def get_screenshot():
         logger.error(f"Failed to get Screenshot: {e}")
 
 
-def create_image():
-    """Create a simple pickaxe icon."""
-    width, height = 64, 64
-    image = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # Transparent background
-    draw = ImageDraw.Draw(image)
-
-    # Handle (rectangle)
-    handle_color = (139, 69, 19)  # Brown color
-    draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
+# def create_image():
+#     """Create a simple pickaxe icon."""
+#     width, height = 64, 64
+#     image = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # Transparent background
+#     draw = ImageDraw.Draw(image)
+#
+#     # Handle (rectangle)
+#     handle_color = (139, 69, 19)  # Brown color
+#     draw.rectangle([(30, 15), (34, 50)], fill=handle_color)
+#
+#     # Blade (triangle-like shape)
+#     blade_color = (192, 192, 192)  # Silver color
+#     draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
+#
+#     return image
 
-    # Blade (triangle-like shape)
-    blade_color = (192, 192, 192)  # Silver color
-    draw.polygon([(15, 15), (49, 15), (32, 5)], fill=blade_color)
-
-    return image
+def create_image():
+    image_path = os.path.join(os.path.dirname(__file__), "assets", "pickaxe.png")
+    return Image.open(image_path)
 
 
 def open_settings():
@@ -350,7 +392,7 @@ def open_settings():
 def play_most_recent_audio():
     if get_config().advanced.audio_player_path or get_config().advanced.video_player_path and len(
             get_all_lines()) > 0:
-        texthooking_page.event_manager.line_for_audio = get_all_lines()[-1]
+        gsm_state.line_for_audio = get_all_lines()[-1]
         obs.save_replay_buffer()
     else:
         logger.error("Feature Disabled. No audio or video player path set in config!")
@@ -404,7 +446,7 @@ def update_icon(profile=None):
     )
 
     menu = Menu(
-        MenuItem("Open Settings", open_settings),
+        MenuItem("Open Settings", open_settings, default=True),
         MenuItem("Open Multi-Mine GUI", open_multimine),
         MenuItem("Open Log", open_log),
         MenuItem("Toggle Replay Buffer", play_pause),
@@ -441,7 +483,7 @@ def run_tray():
     )
 
     menu = Menu(
-        MenuItem("Open Settings", open_settings),
+        MenuItem("Open Settings", open_settings, default=True),
         MenuItem("Open Texthooker", texthooking_page.open_texthooker),
         MenuItem("Open Log", open_log),
         MenuItem("Toggle Replay Buffer", play_pause),
@@ -450,7 +492,7 @@ def run_tray():
         MenuItem("Exit", exit_program)
     )
 
-    icon = Icon("TrayApp", create_image(), "Game Sentence Miner", menu)
+    icon = Icon("TrayApp", create_image(), "GameSentenceMiner", menu)
     icon.run()
 
 
@@ -574,6 +616,18 @@ def handle_websocket_message(message: Message):
             close_obs()
         case FunctionName.START_OBS:
             obs.start_obs()
+        case FunctionName.OPEN_SETTINGS:
+            open_settings()
+        case FunctionName.OPEN_TEXTHOOKER:
+            texthooking_page.open_texthooker()
+        case FunctionName.OPEN_LOG:
+            open_log()
+        case FunctionName.TOGGLE_REPLAY_BUFFER:
+            play_pause(None, None)
+        case FunctionName.RESTART_OBS:
+            restart_obs()
+        case FunctionName.EXIT:
+            exit_program(None, None)
         case _:
             logger.debug(f"unknown message from electron websocket: {message.to_json()}")
 
@@ -626,7 +680,6 @@ async def register_scene_switcher_callback():
         settings_window.reload_settings()
         update_icon()
 
-    logger.info("Registering scene switcher callback")
     await obs.register_scene_change_callback(scene_switcher_callback)
 
 async def main(reloading=False):
@@ -654,8 +707,8 @@ async def main(reloading=False):
 
 
     try:
-        # if get_config().general.open_config_on_startup:
-        #     root.after(0, settings_window.show)
+        if get_config().general.open_config_on_startup:
+            root.after(50, settings_window.show)
         settings_window.add_save_hook(update_icon)
         settings_window.on_exit = exit_program
         root.mainloop()
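
Taken together, the gsm.py changes move the texthooker "play audio / screenshot" flags off texthooking_page.event_manager and onto the shared gsm_state object, and split replay handling out of the old convert_to_audio. A rough sketch of the resulting flow, inferred from the hunks above (the step ordering is an interpretation, not code shipped in the package):

    # 1. A texthooker/tray action flags the line and asks OBS for a replay:
    gsm_state.line_for_audio = get_all_lines()[-1]
    obs.save_replay_buffer()

    # 2. OBS writes the replay file; the watchdog handler fires:
    #    VideoToAudioHandler.on_created() -> process_replay(video_path)

    # 3. process_replay() sees the flag and routes to handle_texthooker_button(),
    #    which extracts and plays the clip, then caches it in
    #    gsm_state.previous_audio so pressing the button again on the same
    #    line replays the cached file instead of re-trimming.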
GameSentenceMiner/obs.py CHANGED
@@ -99,12 +99,12 @@ async def check_obs_folder_is_correct():
     obs_record_directory = get_record_directory()
     if obs_record_directory and os.path.normpath(obs_record_directory) != os.path.normpath(
             get_config().paths.folder_to_watch):
-        logger.info("OBS Path Setting wrong, OBS Recording folder in GSM Config")
+        logger.info("OBS Path wrong, Setting OBS Recording folder in GSM Config...")
         get_config().paths.folder_to_watch = os.path.normpath(obs_record_directory)
         get_master_config().sync_shared_fields()
         save_full_config(get_master_config())
     else:
-        logger.info("OBS Recording path looks correct")
+        logger.debug("OBS Recording path looks correct")
 
 
 def get_obs_websocket_config_values():
@@ -194,8 +194,8 @@ class WebsocketServerThread(threading.Thread):
         self._stop_event = stop_event = asyncio.Event()
         self._event.set()
         self.server = start_server = websockets.serve(self.server_handler,
-                                                      get_config().general.websocket_uri.split(":")[0],
-                                                      get_config().general.websocket_uri.split(":")[1],
+                                                      "0.0.0.0",
+                                                      get_config().advanced.ocr_websocket_port,
                                                       max_size=1000000000)
         async with start_server:
             await stop_event.wait()
@@ -313,20 +313,15 @@ def text_callback(text, orig_text, time, img=None, came_from_ss=False, filtering
     done = False
 
 
-def run_oneocr(ocr_config: OCRConfig, area=False):
+def run_oneocr(ocr_config: OCRConfig, rectangles):
     global done
     print("Running OneOCR")
     screen_area = None
     screen_areas = []
+    exclusions = []
     if not ssonly:
-        for rect_config in ocr_config.rectangles:
-            if not rect_config.is_excluded:
-                coords = rect_config.coordinates
-                monitor_config = rect_config.monitor
-                screen_area = ",".join(str(c) for c in coords) if area else None
-                if screen_area:
-                    screen_areas.append(screen_area)
-        exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, ocr_config.rectangles)))
+        screen_areas = [",".join(str(c) for c in rect_config.coordinates) for rect_config in rectangles if not rect_config.is_excluded]
+        exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, rectangles)))
 
     run.init_config(False)
     run.run(read_from="screencapture" if not ssonly else "clipboard",
@@ -334,13 +329,13 @@ def run_oneocr(ocr_config: OCRConfig, area=False):
             write_to="callback",
             screen_capture_area=screen_area,
             # screen_capture_monitor=monitor_config['index'],
-            screen_capture_window=ocr_config.window,
+            screen_capture_window=ocr_config.window if ocr_config and ocr_config.window else None,
             screen_capture_only_active_windows=get_requires_open_window(),
             screen_capture_delay_secs=get_ocr_scan_rate(), engine=ocr1,
             text_callback=text_callback,
             screen_capture_exclusions=exclusions,
             language=language,
-            monitor_index=ocr_config.window,
+            monitor_index=None,
             ocr1=ocr1,
             ocr2=ocr2,
             gsm_ocr_config=ocr_config,
@@ -380,7 +375,7 @@ if __name__ == "__main__":
     import sys
 
     args = sys.argv[1:]
-    if len(args) == 4:
+    if len(args) >= 4:
         language = args[0]
         ocr1 = args[1]
         ocr2 = args[2]
@@ -418,22 +413,19 @@ if __name__ == "__main__":
         else:
             logger.error(f"Window '{ocr_config.window}' not found within 30 seconds.")
             sys.exit(1)
-    logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
-    if ocr_config:
-        rectangles = list(filter(lambda rect: not rect.is_excluded, ocr_config.rectangles))
+    logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
+    if ocr_config or ssonly:
+        rectangles = ocr_config.rectangles if ocr_config and ocr_config.rectangles else []
         oneocr_threads = []
-        single_ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config,ocr_config.rectangles ), daemon=True)
-        oneocr_threads.append(single_ocr_thread)
-        single_ocr_thread.start()
-        websocket_server_thread = WebsocketServerThread(read=True)
-        websocket_server_thread.start()
+        ocr_thread = threading.Thread(target=run_oneocr, args=(ocr_config,rectangles ), daemon=True)
+        ocr_thread.start()
+        if not ssonly:
+            websocket_server_thread = WebsocketServerThread(read=True)
+            websocket_server_thread.start()
         try:
             while not done:
                 time.sleep(1)
         except KeyboardInterrupt as e:
             pass
-        for thread in oneocr_threads:
-            thread.join()
-        # asyncio.run(websocket_client())
     else:
         print("Failed to load OCR configuration. Please check the logs.")
@@ -1043,7 +1043,7 @@ class GeminiOCR:
                     }
                 },
                 {
-                    'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
+                    'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
                 }
             ]
         }
@@ -1096,13 +1096,14 @@ class GroqOCR:
             return (False, 'Error processing image for Groq.')
 
         prompt = (
-            "Analyze this image and extract text from it"
+            "Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
+            # "Analyze this i#mage and extract text from it"
             # "(speech bubbles or panels containing character dialogue). From the extracted dialogue text, "
             # "filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, "
            # "including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. "
             # "If no text is found within dialogue boxes after applying filters, return an empty string. "
             # "OR, if there are no text bubbles or dialogue boxes found, return everything."
-            "Do not include any other output, formatting markers, or commentary, only the text from the image."
+            # "Do not include any other output, formatting markers, or commentary, only the text from the image."
         )
 
         response = self.client.chat.completions.create(
@@ -125,7 +125,7 @@ def get_text_event(last_note) -> GameLine:
         if lines_match(line.text, remove_html_and_cloze_tags(sentence)):
             return line
 
-    logger.debug("Couldn't find a match in history, using last event")
+    logger.info("Could not find matching sentence from GSM's history. Using the latest line.")
     return lines[-1]
 
 
GameSentenceMiner/vad/groq_trim.py ADDED
@@ -0,0 +1,82 @@
+import os
+import tempfile
+import time
+
+from groq import Groq
+
+# Assuming these are available from GameSentenceMiner
+from GameSentenceMiner import configuration, ffmpeg
+from GameSentenceMiner.configuration import get_config, logger, GROQ  # Import specific functions/objects
+from GameSentenceMiner.vad.result import VADResult
+from GameSentenceMiner.vad.vad_utils import get_audio_length
+
+# Initialize Groq Client
+client = Groq(api_key=get_config().ai.groq_api_key)
+
+def detect_voice_with_groq(input_audio_path):
+    """
+    Detects voice activity and extracts speech timestamps using the Groq Whisper API.
+    """
+    try:
+        with open(input_audio_path, "rb") as file:
+            transcription = client.audio.transcriptions.create(
+                file=(os.path.basename(input_audio_path), file.read()),
+                model="whisper-large-v3-turbo",
+                response_format="verbose_json",
+                language=get_config().vad.language,
+                temperature=0.0,
+                timestamp_granularities=["segment"],
+                prompt=f"Start detecting speech from the first spoken word. If there is music or background noise, ignore it completely. Be very careful to not hallucinate on silence. If the transcription is anything but language:{get_config().vad.language}, ignore it completely. If the end of the audio seems like the start of a new sentence, ignore it completely.",
+            )
+
+        logger.debug(transcription)
+
+        # print(transcription)
+
+        speech_segments = transcription.segments if hasattr(transcription, 'segments') else []
+        # print(f"Groq speech segments: {speech_segments}")
+
+        audio_length = get_audio_length(input_audio_path)
+        # print(f"FFPROBE Length of input audio: {audio_length}")
+
+        return speech_segments, audio_length
+    except Exception as e:
+        logger.error(f"Error detecting voice with Groq: {e}")
+        return [], 0.0
+
+def process_audio_with_groq(input_audio, output_audio, game_line):
+    """
+    Processes an audio file by detecting voice activity using Groq Whisper API,
+    trimming the audio based on detected speech timestamps, and saving the trimmed audio.
+    """
+    start = time.time()
+    voice_activity, audio_length = detect_voice_with_groq(input_audio)
+    logger.info(f"Processing time for Groq: {time.time() - start:.2f} seconds")
+
+    if not voice_activity:
+        logger.info(f"No voice activity detected in {input_audio}")
+        return VADResult(False, 0, 0, GROQ)
+
+    start_time = voice_activity[0]['start']
+    end_time = voice_activity[-1]['end']
+
+    # Logic to potentially use the second-to-last timestamp if a next game line is expected
+    # and there's a significant pause before the very last segment.
+    if (game_line and hasattr(game_line, 'next') and game_line.next and
+            len(voice_activity) > 1 and
+            (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
+        end_time = voice_activity[-2]['end']
+        logger.info("Using the second last timestamp for trimming due to game_line.next and significant pause.")
+
+    # Apply offsets from configuration, ensuring times are within valid bounds
+    final_start_time = max(0, start_time + get_config().vad.beginning_offset)
+    final_end_time = min(audio_length, end_time + get_config().audio.end_offset)
+
+    logger.debug(f"Trimming {input_audio} from {final_start_time:.2f}s to {final_end_time:.2f}s into {output_audio}")
+
+    ffmpeg.trim_audio(input_audio, final_start_time, final_end_time, output_audio)
+
+    return VADResult(True, final_start_time, final_end_time, GROQ)
+
+# Example usage (uncomment and modify with your actual file paths for testing)
+# process_audio_with_groq("tmp6x81cy27.opus", "tmp6x81cy27_trimmed_groq.opus", None)
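
For context on the new Groq-based VAD path: process_audio_with_groq() treats each entry of transcription.segments as a mapping with 'start' and 'end' times in seconds. A minimal sketch of the shape the trimming logic assumes (values below are illustrative, not from the package):

    # Hypothetical verbose_json segments as consumed by the trimming logic:
    voice_activity = [
        {"start": 0.42, "end": 2.10},
        {"start": 2.65, "end": 4.98},
    ]
    start_time = voice_activity[0]["start"]   # trim start: first detected speech
    end_time = voice_activity[-1]["end"]      # trim end: last detected speech,
    # unless a gap of more than 3 seconds precedes the final segment and another
    # game line follows, in which case the second-to-last 'end' is used instead.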
GameSentenceMiner/vad/result.py CHANGED
@@ -1,8 +1,21 @@
+from GameSentenceMiner.configuration import get_config
+
+
 class VADResult:
-    def __init__(self, success: bool, start: float, end: float):
+    def __init__(self, success: bool, start: float, end: float, model: str):
         self.success = success
         self.start = start
         self.end = end
+        self.model = model
 
     def __repr__(self):
-        return f"VADResult(success={self.success}, start={self.start}, end={self.end})"
+        return f"VADResult(success={self.success}, start={self.start}, end={self.end}, model={self.model})"
+
+    def trim_successful_string(self):
+        if self.success:
+            if get_config().vad.trim_beginning:
+                return f"Trimmed audio from {self.start:.2f} to {self.end:.2f} seconds using {self.model}."
+            else:
+                return f"Trimmed end of audio to {self.end:.2f} seconds using {self.model}."
+        else:
+            return f"Failed to trim audio using {self.model}."
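
A minimal sketch of how callers consume the expanded VADResult (the model string and times are illustrative; trim_successful_string() reads the vad.trim_beginning setting, so it needs a loaded config):

    result = VADResult(True, 1.25, 4.80, "Silero")
    logger.info(result.trim_successful_string())
    # success + trim_beginning -> "Trimmed audio from 1.25 to 4.80 seconds using Silero."
    # success, end-only trim   -> "Trimmed end of audio to 4.80 seconds using Silero."
    # failure                  -> "Failed to trim audio using Silero."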
GameSentenceMiner/vad/silero_trim.py CHANGED
@@ -5,6 +5,7 @@ from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
 from GameSentenceMiner import configuration, ffmpeg
 from GameSentenceMiner.configuration import *
 from GameSentenceMiner.vad.result import VADResult
+from GameSentenceMiner.vad.vad_utils import get_audio_length
 
 # Silero VAD setup
 vad_model = load_silero_vad()
@@ -17,32 +18,35 @@ def detect_voice_with_silero(input_audio):
     ffmpeg.convert_audio_to_wav(input_audio, temp_wav)
 
     # Load the audio and detect speech timestamps
-    wav = read_audio(temp_wav, sampling_rate=16000)
+    wav = read_audio(temp_wav)
     speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True)
 
     logger.debug(speech_timestamps)
 
     # Return the speech timestamps (start and end in seconds)
-    return speech_timestamps
+    return speech_timestamps, len(wav) / 16000
 
 
 # Example usage of Silero with trimming
 def process_audio_with_silero(input_audio, output_audio, game_line):
-    voice_activity = detect_voice_with_silero(input_audio)
+    voice_activity, audio_length = detect_voice_with_silero(input_audio)
 
     if not voice_activity:
-        return VADResult(False, 0, 0)
+        return VADResult(False, 0, 0, SILERO)
 
     # Trim based on the first and last speech detected
     start_time = voice_activity[0]['start'] if voice_activity else 0
-    if (game_line.next and len(voice_activity) > 1
-            and voice_activity[-1]['end'] - get_config().audio.beginning_offset > len(input_audio) / 16000
-            and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
-        end_time = voice_activity[-2]['end']
-        logger.info("Using the second last timestamp for trimming")
+    if game_line and game_line.next and len(voice_activity) > 1 and 0 > get_config().audio.beginning_offset > audio_length - voice_activity[-1]['start']:
+        # and (voice_activity[-1]['start'] - voice_activity[-2]['end']) > 3.0):
+        end_time = voice_activity[-2]['end']
+        logger.info("Using the second last timestamp for trimming")
     else:
         end_time = voice_activity[-1]['end'] if voice_activity else 0
 
     # Trim the audio using FFmpeg
     ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
-    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
+    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, SILERO)
+
+
+# process_audio_with_silero("tmp6x81cy27.opus", "tmp6x81cy27_trimmed.opus", None)
+# print(detect_voice_with_silero("tmp6x81cy27.opus"))
GameSentenceMiner/vad/vad_utils.py ADDED
@@ -0,0 +1,13 @@
+import subprocess
+
+from GameSentenceMiner.ffmpeg import get_ffprobe_path
+
+
+def get_audio_length(path):
+    result = subprocess.run(
+        [get_ffprobe_path(), "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True
+    )
+    return float(result.stdout.strip())
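
get_audio_length() shells out to ffprobe and parses the printed duration; the Groq and Silero paths above use the returned value when deciding trim points. A small usage sketch (the file path is hypothetical):

    from GameSentenceMiner.vad.vad_utils import get_audio_length

    # Duration in seconds as reported by ffprobe, e.g. 12.48:
    length = get_audio_length("replay.opus")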
GameSentenceMiner/vad/vosk_helper.py CHANGED
@@ -128,7 +128,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):
 
     if not voice_activity:
         logger.info("No voice activity detected in the audio.")
-        return VADResult(False, 0, 0)
+        return VADResult(False, 0, 0, VOSK)
 
     # Trim based on the first and last speech detected
     start_time = voice_activity[0]['start'] if voice_activity else 0
@@ -148,7 +148,7 @@ def process_audio_with_vosk(input_audio, output_audio, game_line):
 
     # Trim the audio using FFmpeg
     ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio)
-    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset)
+    return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, VOSK)
 
 
 def get_vosk_model():