GameSentenceMiner 2.18.6__py3-none-any.whl → 2.18.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,7 @@ from rapidfuzz import fuzz
 
 # Local application imports
 from GameSentenceMiner.ocr.gsm_ocr_config import set_dpi_awareness
-from GameSentenceMiner.util.configuration import OverlayEngine, get_config, get_temporary_directory, is_windows, is_beangate, logger
+from GameSentenceMiner.util.configuration import OverlayEngine, get_config, get_overlay_config, get_temporary_directory, is_windows, is_beangate, logger
 from GameSentenceMiner.util.electron_config import get_ocr_language
 from GameSentenceMiner.obs import get_screenshot_PIL
 from GameSentenceMiner.web.texthooking_page import send_word_coordinates_to_overlay
@@ -135,6 +135,7 @@ class OverlayProcessor:
         self.ready = False
         self.last_oneocr_result = None
         self.last_lens_result = None
+        self.current_task = None # Track current running task
 
         try:
             if self.config.overlay.websocket_port and all([GoogleLens, get_regex]):
@@ -163,8 +164,22 @@ class OverlayProcessor:
     async def find_box_and_send_to_overlay(self, sentence_to_check: str = None):
         """
         Sends the detected text boxes to the overlay via WebSocket.
+        Cancels any running OCR task before starting a new one.
         """
-        await self.find_box_for_sentence(sentence_to_check)
+        # Cancel any existing task
+        if self.current_task and not self.current_task.done():
+            self.current_task.cancel()
+            try:
+                await self.current_task
+            except asyncio.CancelledError:
+                logger.info("Previous OCR task was cancelled")
+
+        # Start new task
+        self.current_task = asyncio.create_task(self.find_box_for_sentence(sentence_to_check))
+        try:
+            await self.current_task
+        except asyncio.CancelledError:
+            logger.info("OCR task was cancelled")
         # logger.info(f"Sending {len(boxes)} boxes to overlay.")
         # await send_word_coordinates_to_overlay(boxes)
 
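The rewritten `find_box_and_send_to_overlay` is the standard asyncio "latest request wins" pattern: cancel the in-flight task, await it so its `CancelledError` is fully absorbed, then create and await the replacement. That lets a new hooked line interrupt a slow OCR pass instead of queueing behind it. A minimal standalone sketch of the same pattern (the `LatestOnlyRunner` name and structure are illustrative, not taken from the package):

```python
import asyncio

class LatestOnlyRunner:
    """Keeps at most one worker task alive; a new request supersedes the old one."""

    def __init__(self):
        self.current_task: asyncio.Task | None = None

    async def run(self, coro_factory):
        # Cancel and drain any in-flight task before starting a new one.
        if self.current_task and not self.current_task.done():
            self.current_task.cancel()
            try:
                await self.current_task
            except asyncio.CancelledError:
                pass  # expected: the previous task was superseded

        self.current_task = asyncio.create_task(coro_factory())
        try:
            return await self.current_task
        except asyncio.CancelledError:
            return None  # this run was itself superseded by a newer call
```

Awaiting the cancelled task before starting the next one matters: it guarantees the old pass has actually unwound before the new one touches shared state like `self.current_task`.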
@@ -191,38 +206,46 @@ class OverlayProcessor:
         # set_dpi_awareness()
         with mss.mss() as sct:
             monitors = sct.monitors[1:]
-            # return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
-            if is_windows() and monitor_index == 0:
-                from ctypes import wintypes
-                import ctypes
-                # Get work area for primary monitor (ignores taskbar)
-                SPI_GETWORKAREA = 0x0030
-                rect = wintypes.RECT()
-                res = ctypes.windll.user32.SystemParametersInfoW(
-                    SPI_GETWORKAREA, 0, ctypes.byref(rect), 0
-                )
-                if not res:
-                    raise ctypes.WinError()
+            monitor = monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # Return monitor but the Y is 1 less to avoid taskbar on Windows
+            return {
+                "left": monitor["left"],
+                "top": monitor["top"],
+                "width": monitor["width"],
+                "height": monitor["height"] - 1
+            }
+            # # return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # if is_windows() and monitor_index == 0:
+            #     from ctypes import wintypes
+            #     import ctypes
+            #     # Get work area for primary monitor (ignores taskbar)
+            #     SPI_GETWORKAREA = 0x0030
+            #     rect = wintypes.RECT()
+            #     res = ctypes.windll.user32.SystemParametersInfoW(
+            #         SPI_GETWORKAREA, 0, ctypes.byref(rect), 0
+            #     )
+            #     if not res:
+            #         raise ctypes.WinError()
 
-            return {
-                "left": rect.left,
-                "top": rect.top,
-                "width": rect.right - rect.left,
-                "height": rect.bottom - rect.top,
-            }
-            elif is_windows() and monitor_index > 0:
-                # Secondary monitors: just return with a guess of how tall the taskbar is
-                taskbar_height_guess = 48 # A common taskbar height, may vary
-                mon = monitors[monitor_index]
-                return {
-                    "left": mon["left"],
-                    "top": mon["top"],
-                    "width": mon["width"],
-                    "height": mon["height"] - taskbar_height_guess
-                }
-            else:
-                # For non-Windows systems or unspecified monitors, return the monitor area as-is
-                return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # return {
+            #     "left": rect.left,
+            #     "top": rect.top,
+            #     "width": rect.right - rect.left,
+            #     "height": rect.bottom - rect.top,
+            # }
+            # elif is_windows() and monitor_index > 0:
+            #     # Secondary monitors: just return with a guess of how tall the taskbar is
+            #     taskbar_height_guess = 48 # A common taskbar height, may vary
+            #     mon = monitors[monitor_index]
+            #     return {
+            #         "left": mon["left"],
+            #         "top": mon["top"],
+            #         "width": mon["width"],
+            #         "height": mon["height"] - taskbar_height_guess
+            #     }
+            # else:
+            #     # For non-Windows systems or unspecified monitors, return the monitor area as-is
+            #     return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
 
 
     def _get_full_screenshot(self) -> Tuple[Image.Image | None, int, int]:
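This hunk retires the per-platform work-area logic (the Win32 `SPI_GETWORKAREA` query for the primary monitor, a 48 px taskbar guess for secondary ones) in favor of one rule everywhere: take the mss monitor rect and subtract one pixel from the height. Note the new inline comment says "the Y is 1 less", but the code actually shortens `height`; `top` is unchanged. To see what mss will report for each monitor on a given machine, a quick inspection sketch (assuming mss is installed):

```python
import mss

# Entry 0 of sct.monitors is the virtual bounding box of all displays;
# the real monitors start at index 1, matching sct.monitors[1:] above.
with mss.mss() as sct:
    for i, mon in enumerate(sct.monitors):
        label = "all displays" if i == 0 else f"monitor {i}"
        print(f"{label}: left={mon['left']} top={mon['top']} "
              f"width={mon['width']} height={mon['height']}")
```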
@@ -230,7 +253,8 @@ class OverlayProcessor:
         if not mss:
             raise RuntimeError("MSS screenshot library is not installed.")
         with mss.mss() as sct:
-            monitor = self.get_monitor_workarea(0) # Get primary monitor work area
+            logger.info(get_overlay_config())
+            monitor = self.get_monitor_workarea(get_overlay_config().monitor_to_capture) # Get primary monitor work area
             sct_img = sct.grab(monitor)
             img = Image.frombytes('RGB', sct_img.size, sct_img.bgra, 'raw', 'BGRX')
 
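The captured monitor now comes from `get_overlay_config().monitor_to_capture` instead of a hardcoded primary display (the trailing `# Get primary monitor work area` comment is now slightly stale, and the `logger.info(get_overlay_config())` line reads like debug output that shipped). The capture itself is the standard mss-to-Pillow idiom: grab the monitor rect, then reinterpret mss's raw BGRA buffer as an RGB image. Isolated for reference (a sketch assuming mss and Pillow; the `grab_monitor` helper is illustrative):

```python
import mss
from PIL import Image

def grab_monitor(index: int = 1) -> Image.Image:
    """Capture one monitor and convert mss's BGRA buffer to a PIL RGB image."""
    with mss.mss() as sct:
        sct_img = sct.grab(sct.monitors[index])
        # mss hands back raw BGRA bytes; the 'BGRX' raw mode tells Pillow
        # to read blue/green/red and skip the fourth (padding) byte.
        return Image.frombytes('RGB', sct_img.size, sct_img.bgra, 'raw', 'BGRX')
```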
@@ -281,19 +305,32 @@ class OverlayProcessor:
         return composite_img
 
     async def _do_work(self, sentence_to_check: str = None) -> Tuple[List[Dict[str, Any]], int]:
-        """The main OCR workflow."""
+        """The main OCR workflow with cancellation support."""
         if not self.lens:
             logger.error("OCR engines are not initialized. Cannot perform OCR for Overlay.")
             return []
 
         if get_config().overlay.scan_delay > 0:
-            await asyncio.sleep(get_config().overlay.scan_delay)
+            try:
+                await asyncio.sleep(get_config().overlay.scan_delay)
+            except asyncio.CancelledError:
+                logger.info("OCR task cancelled during scan delay")
+                raise
+
+        # Check for cancellation before taking screenshot
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
 
         # 1. Get screenshot
         full_screenshot, monitor_width, monitor_height = self._get_full_screenshot()
         if not full_screenshot:
             logger.warning("Failed to get a screenshot.")
             return []
+
+        # Check for cancellation after screenshot
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         if self.oneocr:
             # 2. Use OneOCR to find general text areas (fast)
             res, text, oneocr_results, crop_coords_list = self.oneocr(
@@ -304,6 +341,10 @@ class OverlayProcessor:
                 furigana_filter_sensitivity=None, # Disable furigana filtering
             )
 
+            # Check for cancellation after OneOCR
+            if asyncio.current_task().cancelled():
+                raise asyncio.CancelledError()
+
             text_str = "".join([text for text in text if self.regex.match(text)])
 
             # RapidFuzz fuzzy match 90% to not send the same results repeatedly
 
@@ -325,6 +366,10 @@ class OverlayProcessor:
                 logger.info("Sent %d text boxes to overlay.", len(oneocr_results))
                 return
 
+            # Check for cancellation before creating composite image
+            if asyncio.current_task().cancelled():
+                raise asyncio.CancelledError()
+
             # 3. Create a composite image with only the detected text regions
             composite_image = self._create_composite_image(
                 full_screenshot,
@@ -335,6 +380,10 @@ class OverlayProcessor:
         else:
             composite_image = full_screenshot
 
+        # Check for cancellation before Google Lens processing
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         # 4. Use Google Lens on the cleaner composite image for higher accuracy
         res = self.lens(
             composite_image,
@@ -342,6 +391,10 @@ class OverlayProcessor:
             furigana_filter_sensitivity=None # Disable furigana filtering
         )
 
+        # Check for cancellation after Google Lens
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
        if len(res) != 3:
            return
 
@@ -360,6 +413,10 @@ class OverlayProcessor:
         if not success or not coords:
             return
 
+        # Check for cancellation before final processing
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         # 5. Process the high-accuracy results into the desired format
         extracted_data = self._extract_text_with_pixel_boxes(
             api_response=coords,
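Between the OCR stages, `_do_work` polls `asyncio.current_task().cancelled()` and re-raises `CancelledError`. A detail worth knowing when reading this pattern: `Task.cancelled()` only returns `True` once a task has already *finished* because of cancellation, so inside a still-running coroutine these polls normally stay `False`; asyncio actually delivers cancellation by raising `CancelledError` at the next `await`. The conventional explicit checkpoint therefore yields to the event loop, as in this sketch (not code from the package):

```python
import asyncio

async def checkpoint():
    """Let the event loop deliver a pending cancellation.

    If cancel() has been requested on this task, the zero-length sleep
    raises asyncio.CancelledError here rather than at some later await.
    """
    await asyncio.sleep(0)

async def do_work(stages):
    for stage in stages:
        stage()             # synchronous, uncancellable work (e.g. one OCR pass)
        await checkpoint()  # cancellation takes effect between stages
```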
GameSentenceMiner/vad.py CHANGED
@@ -103,6 +103,10 @@ class VADProcessor(ABC):
 
     def process_audio(self, input_audio, output_audio, game_line, text_mined):
         voice_activity = self._detect_voice_activity(input_audio, text_mined)
+        text_similarity = 0
+
+        if voice_activity and isinstance(voice_activity, tuple):
+            voice_activity, text_similarity = voice_activity
 
         if not voice_activity:
             logger.info("No voice activity detected in the audio.")
@@ -117,16 +121,17 @@ class VADProcessor(ABC):
         if 0 > audio_length - voice_activity[-1]['start'] + get_config().audio.beginning_offset:
             end_time = voice_activity[-2]['end']
 
-        # if detected text is much shorter than game_line.text, if no text, guess based on length
-        if 'text' in voice_activity[0]:
-            dectected_text = ''.join([item['text'] for item in voice_activity])
-            if game_line and game_line.text and len(dectected_text) < len(game_line.text) / 2:
-                logger.info(f"Detected text '{dectected_text}' is much shorter than expected '{game_line.text}', skipping.")
-                return VADResult(False, 0, 0, self.vad_system_name)
-        else:
-            if game_line and game_line.text and (end_time - start_time) < max(0.5, len(game_line.text) * 0.05):
-                logger.info(f"Detected audio length {end_time - start_time} is much shorter than expected for text '{game_line.text}', skipping.")
-                return VADResult(False, 0, 0, self.vad_system_name)
+        # if detected text is much shorter than game_line.text, if no text, guess based on length, only check if text_similarity is low
+        if text_similarity < 50:
+            if 'text' in voice_activity[0]:
+                detected_text = ''.join([item['text'] for item in voice_activity])
+                if game_line and game_line.text and len(detected_text) < len(game_line.text) / 4:
+                    logger.info(f"Detected text '{detected_text}' is much shorter than expected '{game_line.text}', skipping.")
+                    return VADResult(False, 0, 0, self.vad_system_name)
+            else:
+                if game_line and game_line.text and (end_time - start_time) < max(0.5, len(game_line.text) * 0.05):
+                    logger.info(f"Detected audio length {end_time - start_time} is much shorter than expected for text '{game_line.text}', skipping.")
+                    return VADResult(False, 0, 0, self.vad_system_name)
 
         if get_config().vad.cut_and_splice_segments:
             self.extract_audio_and_combine_segments(input_audio, voice_activity, output_audio, padding=get_config().vad.splice_padding)
@@ -185,13 +190,14 @@ class WhisperVADProcessor(VADProcessor):
         logger.debug(json.dumps(result.to_dict()))
 
         text = result.text.strip()
+        text_similarity = 0
 
         # If both mined text and Whisper transcription are available, compare their similarity
         if text_mined and text:
             from rapidfuzz import fuzz
             similarity = fuzz.partial_ratio(text_mined, text)
             logger.info(f"Whisper transcription: '{text}' | Mined text: '{text_mined}' | Partial similarity: {similarity:.1f}")
-            # If similarity is very low, treat as no voice activity detected
+            text_similarity = similarity
             if similarity < 20:
                 logger.info(f"Partial similarity {similarity:.1f} is below threshold, skipping voice activity.")
                 return []
@@ -247,7 +253,7 @@ class WhisperVADProcessor(VADProcessor):
 
             previous_segment = segment
         # Return the detected voice activity and the total duration
-        return voice_activity
+        return voice_activity, text_similarity
 
 # Add a new class for Vosk-based VAD
 # class VoskVADProcessor(VADProcessor):
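`WhisperVADProcessor._detect_voice_activity` now returns a `(voice_activity, text_similarity)` tuple, and `process_audio` unpacks it defensively with an `isinstance(..., tuple)` check so VAD backends that still return a bare list keep working. The score is RapidFuzz's `partial_ratio` on a 0-100 scale, which rates the best-matching substring alignment rather than the whole strings, so a transcription that merely *contains* the mined line still scores near 100; that is what makes it a reasonable gate (the too-short-text heuristic now only runs when the score is below 50). A small demonstration (assuming rapidfuzz is installed):

```python
from rapidfuzz import fuzz

mined = "こんにちは"
transcribed = "えーと、こんにちは。元気ですか"

# partial_ratio aligns the shorter string against the best window of the
# longer one, so containment scores 100 even though the full-string ratio
# is dragged down by the filler around the match.
print(fuzz.partial_ratio(mined, transcribed))  # 100.0
print(fuzz.ratio(mined, transcribed))          # well below 100
```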
@@ -9,6 +9,7 @@ from GameSentenceMiner.util import ffmpeg, notification
 from GameSentenceMiner.util.configuration import gsm_state, logger, get_config, get_temporary_directory
 from GameSentenceMiner.util.ffmpeg import get_video_timings
 from GameSentenceMiner.util.text_log import GameLine
+from GameSentenceMiner.util.audio_player import AudioPlayer
 
 
 def set_get_audio_from_video_callback(func):
@@ -16,41 +17,91 @@ def set_get_audio_from_video_callback(func):
     get_audio_from_video = func
 
 
+# Global audio player instance
+_audio_player = None
+
+
+def get_audio_player():
+    """Get or create the global audio player instance."""
+    global _audio_player
+    if _audio_player is None:
+        _audio_player = AudioPlayer(finished_callback=_on_audio_finished)
+    return _audio_player
+
+
+def _on_audio_finished():
+    """Callback when audio playback finishes."""
+    # Clear the current audio stream reference from gsm_state
+    gsm_state.current_audio_stream = None
+
+
+def stop_current_audio():
+    """Stop the currently playing audio."""
+    player = get_audio_player()
+    player.stop_audio()
+    gsm_state.current_audio_stream = None
+
+
+def play_audio_data_safe(data, samplerate):
+    """
+    Play audio data using the safe audio player.
+
+    Args:
+        data: Audio data as numpy array
+        samplerate: Sample rate of the audio
+
+    Returns:
+        True if playback started successfully, False otherwise
+    """
+    player = get_audio_player()
+    success = player.play_audio_data(data, samplerate)
+    if success:
+        # Store reference in gsm_state for compatibility
+        gsm_state.current_audio_stream = player.current_audio_stream
+    return success
+
+
 def handle_texthooker_button(video_path=''):
     try:
         if gsm_state.line_for_audio:
+            # Check if audio is currently playing and stop it
+            if gsm_state.current_audio_stream:
+                stop_current_audio()
+                gsm_state.line_for_audio = None
+                return
+
             line: GameLine = gsm_state.line_for_audio
             gsm_state.line_for_audio = None
+
             if line == gsm_state.previous_line_for_audio:
                 logger.info("Line is the same as the last one, skipping processing.")
-                if get_config().advanced.audio_player_path:
-                    play_audio_in_external(gsm_state.previous_audio)
-                elif get_config().advanced.video_player_path:
+                if get_config().advanced.video_player_path:
                     play_video_in_external(line, video_path)
                 else:
-                    import sounddevice as sd
-                    data, samplerate = gsm_state.previous_audio
-                    sd.play(data, samplerate)
-                    sd.wait()
+                    # Use cached audio data with safe playback
+                    if gsm_state.previous_audio:
+                        data, samplerate = gsm_state.previous_audio
+                        play_audio_data_safe(data, samplerate)
                 return
+
             gsm_state.previous_line_for_audio = line
-            if get_config().advanced.audio_player_path:
-                audio = get_audio_from_video(line, line.next.time if line.next else None, video_path,
-                                             temporary=True)
-                play_audio_in_external(audio)
-                gsm_state.previous_audio = audio
-            elif get_config().advanced.video_player_path:
+
+            if get_config().advanced.video_player_path:
                 play_video_in_external(line, video_path)
             else:
-                import sounddevice as sd
+                # Extract audio and play with safe method
                 import soundfile as sf
                 audio = get_audio_from_video(line, line.next.time if line.next else None, video_path,
                                              temporary=True)
                 data, samplerate = sf.read(audio)
-                sd.play(data, samplerate)
-                sd.wait()
-                gsm_state.previous_audio = (data, samplerate)
+                data = data.astype('float32')
+
+                # Use safe audio playback
+                success = play_audio_data_safe(data, samplerate)
+                if success:
+                    gsm_state.previous_audio = (data, samplerate)
             return
+
         if gsm_state.line_for_screenshot:
             line: GameLine = gsm_state.line_for_screenshot
             gsm_state.line_for_screenshot = None
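The new module itself (`GameSentenceMiner/util/audio_player.py`, per the import) is not shown in this diff, but the call sites fix its surface: a constructor taking `finished_callback`, `play_audio_data(data, samplerate)` returning a bool, `stop_audio()`, and a `current_audio_stream` attribute. The point of the change is visible in what was deleted: the old path called `sd.play(...)` followed by `sd.wait()`, which blocked until playback finished, whereas the new player must return immediately so a second button press can stop playback mid-line. A plausible shape for such a wrapper over sounddevice (purely a sketch of the inferred interface, not the package's actual implementation):

```python
import sounddevice as sd

class AudioPlayer:
    """Non-blocking playback that can be stopped from another call site."""

    def __init__(self, finished_callback=None):
        self.finished_callback = finished_callback
        self.current_audio_stream = None

    def play_audio_data(self, data, samplerate) -> bool:
        """Start playback without blocking; return True if it started."""
        try:
            self.stop_audio()            # never overlap two playbacks
            sd.play(data, samplerate)    # returns immediately (no sd.wait())
            self.current_audio_stream = sd.get_stream()
            return True
        except Exception:
            return False

    def stop_audio(self):
        """Stop any in-flight playback and fire the finished callback."""
        if self.current_audio_stream is not None:
            sd.stop()
            self.current_audio_stream = None
            if self.finished_callback:
                self.finished_callback()
```

This sketch only fires `finished_callback` on an explicit stop; detecting natural end-of-playback (so `_on_audio_finished` also runs when the clip simply ends) would need a stream-level completion hook such as `sounddevice.OutputStream`'s `finished_callback`, which the real class presumably wires up.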