GameSentenceMiner 2.18.6__py3-none-any.whl → 2.18.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/anki.py +21 -18
- GameSentenceMiner/locales/en_us.json +3 -3
- GameSentenceMiner/locales/ja_jp.json +4 -4
- GameSentenceMiner/locales/zh_cn.json +3 -3
- GameSentenceMiner/ui/anki_confirmation.py +72 -17
- GameSentenceMiner/ui/config_gui.py +16 -13
- GameSentenceMiner/util/audio_player.py +220 -0
- GameSentenceMiner/util/configuration.py +22 -3
- GameSentenceMiner/util/get_overlay_coords.py +93 -36
- GameSentenceMiner/vad.py +18 -12
- GameSentenceMiner/web/service.py +68 -17
- GameSentenceMiner/web/static/js/overview.js +253 -33
- GameSentenceMiner/web/templates/overview.html +38 -0
- GameSentenceMiner/web/texthooking_page.py +17 -2
- {gamesentenceminer-2.18.6.dist-info → gamesentenceminer-2.18.7.dist-info}/METADATA +1 -1
- {gamesentenceminer-2.18.6.dist-info → gamesentenceminer-2.18.7.dist-info}/RECORD +20 -19
- {gamesentenceminer-2.18.6.dist-info → gamesentenceminer-2.18.7.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.18.6.dist-info → gamesentenceminer-2.18.7.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.18.6.dist-info → gamesentenceminer-2.18.7.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.18.6.dist-info → gamesentenceminer-2.18.7.dist-info}/top_level.txt +0 -0
GameSentenceMiner/util/get_overlay_coords.py
CHANGED

@@ -13,7 +13,7 @@ from rapidfuzz import fuzz

 # Local application imports
 from GameSentenceMiner.ocr.gsm_ocr_config import set_dpi_awareness
-from GameSentenceMiner.util.configuration import OverlayEngine, get_config, get_temporary_directory, is_windows, is_beangate, logger
+from GameSentenceMiner.util.configuration import OverlayEngine, get_config, get_overlay_config, get_temporary_directory, is_windows, is_beangate, logger
 from GameSentenceMiner.util.electron_config import get_ocr_language
 from GameSentenceMiner.obs import get_screenshot_PIL
 from GameSentenceMiner.web.texthooking_page import send_word_coordinates_to_overlay
@@ -135,6 +135,7 @@ class OverlayProcessor:
         self.ready = False
         self.last_oneocr_result = None
         self.last_lens_result = None
+        self.current_task = None  # Track current running task

         try:
             if self.config.overlay.websocket_port and all([GoogleLens, get_regex]):
@@ -163,8 +164,22 @@ class OverlayProcessor:
     async def find_box_and_send_to_overlay(self, sentence_to_check: str = None):
         """
         Sends the detected text boxes to the overlay via WebSocket.
+        Cancels any running OCR task before starting a new one.
         """
-
+        # Cancel any existing task
+        if self.current_task and not self.current_task.done():
+            self.current_task.cancel()
+            try:
+                await self.current_task
+            except asyncio.CancelledError:
+                logger.info("Previous OCR task was cancelled")
+
+        # Start new task
+        self.current_task = asyncio.create_task(self.find_box_for_sentence(sentence_to_check))
+        try:
+            await self.current_task
+        except asyncio.CancelledError:
+            logger.info("OCR task was cancelled")
         # logger.info(f"Sending {len(boxes)} boxes to overlay.")
         # await send_word_coordinates_to_overlay(boxes)

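
Note: the hunk above adds a cancel-and-restart guard so only the most recent OCR request runs at a time. This is a standard asyncio idiom; a self-contained sketch of the same technique (hypothetical names, not GSM code):

import asyncio

class SingleFlight:
    """Run only the most recent job; cancel any in-flight predecessor."""

    def __init__(self):
        self.current_task = None

    async def submit(self, coro):
        # Cancel the previous job if it is still running
        if self.current_task and not self.current_task.done():
            self.current_task.cancel()
            try:
                await self.current_task
            except asyncio.CancelledError:
                pass  # expected: the old job was superseded
        # Start and await the new job
        self.current_task = asyncio.create_task(coro)
        try:
            return await self.current_task
        except asyncio.CancelledError:
            return None  # this job was itself superseded

async def job(name):
    await asyncio.sleep(0.2)
    return name

async def main():
    sf = SingleFlight()
    first = asyncio.create_task(sf.submit(job("first")))
    await asyncio.sleep(0.05)
    print(await sf.submit(job("second")))  # cancels "first", prints "second"
    print(await first)                     # None: superseded

asyncio.run(main())

Awaiting the cancelled task inside try/except is what guarantees the old task has fully unwound before the replacement starts.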
@@ -191,38 +206,46 @@ class OverlayProcessor:
         # set_dpi_awareness()
         with mss.mss() as sct:
             monitors = sct.monitors[1:]
-
-
-
-
-
-
-
-
-
-
-
-
+            monitor = monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # Return monitor but the Y is 1 less to avoid taskbar on Windows
+            return {
+                "left": monitor["left"],
+                "top": monitor["top"],
+                "width": monitor["width"],
+                "height": monitor["height"] - 1
+            }
+            # # return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
+            # if is_windows() and monitor_index == 0:
+            #     from ctypes import wintypes
+            #     import ctypes
+            #     # Get work area for primary monitor (ignores taskbar)
+            #     SPI_GETWORKAREA = 0x0030
+            #     rect = wintypes.RECT()
+            #     res = ctypes.windll.user32.SystemParametersInfoW(
+            #         SPI_GETWORKAREA, 0, ctypes.byref(rect), 0
+            #     )
+            #     if not res:
+            #         raise ctypes.WinError()

-
-
-
-
-
-
-            elif is_windows() and monitor_index > 0:
-
-
-
-
-
-
-
-
-
-            else:
-
-
+            # return {
+            #     "left": rect.left,
+            #     "top": rect.top,
+            #     "width": rect.right - rect.left,
+            #     "height": rect.bottom - rect.top,
+            # }
+            # elif is_windows() and monitor_index > 0:
+            #     # Secondary monitors: just return with a guess of how tall the taskbar is
+            #     taskbar_height_guess = 48  # A common taskbar height, may vary
+            #     mon = monitors[monitor_index]
+            #     return {
+            #         "left": mon["left"],
+            #         "top": mon["top"],
+            #         "width": mon["width"],
+            #         "height": mon["height"] - taskbar_height_guess
+            #     }
+            # else:
+            #     # For non-Windows systems or unspecified monitors, return the monitor area as-is
+            #     return monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]


     def _get_full_screenshot(self) -> Tuple[Image.Image | None, int, int]:
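
The new workarea logic above shaves one pixel off the selected monitor's height to dodge the Windows taskbar. A runnable distillation of that selection logic, assuming only that the mss package is installed (the function name here is illustrative):

import mss

def monitor_workarea(monitor_index: int = 0) -> dict:
    """Pick a physical monitor; mss index 0 is the combined virtual screen."""
    with mss.mss() as sct:
        monitors = sct.monitors[1:]  # skip the virtual "all monitors" entry
        mon = monitors[monitor_index] if 0 <= monitor_index < len(monitors) else monitors[0]
        # Trim one pixel of height, mirroring the taskbar workaround above
        return {"left": mon["left"], "top": mon["top"],
                "width": mon["width"], "height": mon["height"] - 1}

print(monitor_workarea())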
@@ -230,7 +253,8 @@ class OverlayProcessor:
         if not mss:
             raise RuntimeError("MSS screenshot library is not installed.")
         with mss.mss() as sct:
-
+            logger.info(get_overlay_config())
+            monitor = self.get_monitor_workarea(get_overlay_config().monitor_to_capture)  # Get primary monitor work area
             sct_img = sct.grab(monitor)
             img = Image.frombytes('RGB', sct_img.size, sct_img.bgra, 'raw', 'BGRX')

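
For reference, the grab-and-convert idiom in the surrounding context lines is the standard mss + Pillow pairing; a minimal standalone sketch:

import mss
from PIL import Image

with mss.mss() as sct:
    region = sct.monitors[1]  # primary monitor
    shot = sct.grab(region)   # BGRA pixel buffer
    # mss returns BGRA; decode via Pillow's raw BGRX mode to get RGB
    img = Image.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')
    print(img.size)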
@@ -281,19 +305,32 @@ class OverlayProcessor:
         return composite_img

     async def _do_work(self, sentence_to_check: str = None) -> Tuple[List[Dict[str, Any]], int]:
-        """The main OCR workflow."""
+        """The main OCR workflow with cancellation support."""
         if not self.lens:
             logger.error("OCR engines are not initialized. Cannot perform OCR for Overlay.")
             return []

         if get_config().overlay.scan_delay > 0:
-
+            try:
+                await asyncio.sleep(get_config().overlay.scan_delay)
+            except asyncio.CancelledError:
+                logger.info("OCR task cancelled during scan delay")
+                raise
+
+        # Check for cancellation before taking screenshot
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()

         # 1. Get screenshot
         full_screenshot, monitor_width, monitor_height = self._get_full_screenshot()
         if not full_screenshot:
             logger.warning("Failed to get a screenshot.")
             return []
+
+        # Check for cancellation after screenshot
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         if self.oneocr:
             # 2. Use OneOCR to find general text areas (fast)
             res, text, oneocr_results, crop_coords_list = self.oneocr(
@@ -304,6 +341,10 @@ class OverlayProcessor:
                 furigana_filter_sensitivity=None,  # Disable furigana filtering
             )

+            # Check for cancellation after OneOCR
+            if asyncio.current_task().cancelled():
+                raise asyncio.CancelledError()
+
             text_str = "".join([text for text in text if self.regex.match(text)])

             # RapidFuzz fuzzy match 90% to not send the same results repeatedly
@@ -325,6 +366,10 @@ class OverlayProcessor:
                 logger.info("Sent %d text boxes to overlay.", len(oneocr_results))
                 return

+            # Check for cancellation before creating composite image
+            if asyncio.current_task().cancelled():
+                raise asyncio.CancelledError()
+
             # 3. Create a composite image with only the detected text regions
             composite_image = self._create_composite_image(
                 full_screenshot,
@@ -335,6 +380,10 @@ class OverlayProcessor:
         else:
             composite_image = full_screenshot

+        # Check for cancellation before Google Lens processing
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         # 4. Use Google Lens on the cleaner composite image for higher accuracy
         res = self.lens(
             composite_image,
@@ -342,6 +391,10 @@ class OverlayProcessor:
             furigana_filter_sensitivity=None  # Disable furigana filtering
         )

+        # Check for cancellation after Google Lens
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         if len(res) != 3:
             return

@@ -360,6 +413,10 @@ class OverlayProcessor:
         if not success or not coords:
             return

+        # Check for cancellation before final processing
+        if asyncio.current_task().cancelled():
+            raise asyncio.CancelledError()
+
         # 5. Process the high-accuracy results into the desired format
         extracted_data = self._extract_text_with_pixel_boxes(
             api_response=coords,
GameSentenceMiner/vad.py
CHANGED
@@ -103,6 +103,10 @@ class VADProcessor(ABC):

     def process_audio(self, input_audio, output_audio, game_line, text_mined):
         voice_activity = self._detect_voice_activity(input_audio, text_mined)
+        text_similarity = 0
+
+        if voice_activity and isinstance(voice_activity, tuple):
+            voice_activity, text_similarity = voice_activity

         if not voice_activity:
             logger.info("No voice activity detected in the audio.")
@@ -117,16 +121,17 @@
         if 0 > audio_length - voice_activity[-1]['start'] + get_config().audio.beginning_offset:
             end_time = voice_activity[-2]['end']

-        # if detected text is much shorter than game_line.text, if no text, guess based on length
-        if
-
-
-
-
-
-
-
+        # if detected text is much shorter than game_line.text, if no text, guess based on length, only check if text_similarity is low
+        if text_similarity < 50:
+            if 'text' in voice_activity[0]:
+                detected_text = ''.join([item['text'] for item in voice_activity])
+                if game_line and game_line.text and len(detected_text) < len(game_line.text) / 4:
+                    logger.info(f"Detected text '{detected_text}' is much shorter than expected '{game_line.text}', skipping.")
+                    return VADResult(False, 0, 0, self.vad_system_name)
+            else:
+                if game_line and game_line.text and (end_time - start_time) < max(0.5, len(game_line.text) * 0.05):
+                    logger.info(f"Detected audio length {end_time - start_time} is much shorter than expected for text '{game_line.text}', skipping.")
+                    return VADResult(False, 0, 0, self.vad_system_name)

         if get_config().vad.cut_and_splice_segments:
             self.extract_audio_and_combine_segments(input_audio, voice_activity, output_audio, padding=get_config().vad.splice_padding)
@@ -185,13 +190,14 @@ class WhisperVADProcessor(VADProcessor):
         logger.debug(json.dumps(result.to_dict()))

         text = result.text.strip()
+        text_similarity = 0

         # If both mined text and Whisper transcription are available, compare their similarity
         if text_mined and text:
             from rapidfuzz import fuzz
             similarity = fuzz.partial_ratio(text_mined, text)
             logger.info(f"Whisper transcription: '{text}' | Mined text: '{text_mined}' | Partial similarity: {similarity:.1f}")
-
+            text_similarity = similarity
             if similarity < 20:
                 logger.info(f"Partial similarity {similarity:.1f} is below threshold, skipping voice activity.")
                 return []
@@ -247,7 +253,7 @@

             previous_segment = segment
         # Return the detected voice activity and the total duration
-        return voice_activity
+        return voice_activity, text_similarity

 # Add a new class for Vosk-based VAD
 # class VoskVADProcessor(VADProcessor):
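
The vad.py change threads a similarity score through an existing return value: the Whisper detector now returns a tuple, and the caller unpacks it defensively so detectors that still return a bare segment list keep working. A standalone sketch of that compatibility pattern (hypothetical names):

from rapidfuzz import fuzz

def detect(expected: str, transcribed: str):
    # Newer-style detector: returns (segments, similarity 0-100)
    segments = [{"text": transcribed, "start": 0.0, "end": 1.0}]
    return segments, fuzz.partial_ratio(expected, transcribed)

def process(expected: str, transcribed: str):
    result = detect(expected, transcribed)
    similarity = 0
    # Older-style detectors may return just the segment list
    if result and isinstance(result, tuple):
        result, similarity = result
    return result, similarity

print(process("こんにちは", "こんにちは"))  # similarity 100.0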
GameSentenceMiner/web/service.py
CHANGED
@@ -9,6 +9,7 @@ from GameSentenceMiner.util import ffmpeg, notification
 from GameSentenceMiner.util.configuration import gsm_state, logger, get_config, get_temporary_directory
 from GameSentenceMiner.util.ffmpeg import get_video_timings
 from GameSentenceMiner.util.text_log import GameLine
+from GameSentenceMiner.util.audio_player import AudioPlayer


 def set_get_audio_from_video_callback(func):
@@ -16,41 +17,91 @@
     get_audio_from_video = func


+# Global audio player instance
+_audio_player = None
+
+
+def get_audio_player():
+    """Get or create the global audio player instance."""
+    global _audio_player
+    if _audio_player is None:
+        _audio_player = AudioPlayer(finished_callback=_on_audio_finished)
+    return _audio_player
+
+
+def _on_audio_finished():
+    """Callback when audio playback finishes."""
+    # Clear the current audio stream reference from gsm_state
+    gsm_state.current_audio_stream = None
+
+
+def stop_current_audio():
+    """Stop the currently playing audio."""
+    player = get_audio_player()
+    player.stop_audio()
+    gsm_state.current_audio_stream = None
+
+
+def play_audio_data_safe(data, samplerate):
+    """
+    Play audio data using the safe audio player.
+
+    Args:
+        data: Audio data as numpy array
+        samplerate: Sample rate of the audio
+
+    Returns:
+        True if playback started successfully, False otherwise
+    """
+    player = get_audio_player()
+    success = player.play_audio_data(data, samplerate)
+    if success:
+        # Store reference in gsm_state for compatibility
+        gsm_state.current_audio_stream = player.current_audio_stream
+    return success
+
+
 def handle_texthooker_button(video_path=''):
     try:
         if gsm_state.line_for_audio:
+            # Check if audio is currently playing and stop it
+            if gsm_state.current_audio_stream:
+                stop_current_audio()
+                gsm_state.line_for_audio = None
+                return
+
             line: GameLine = gsm_state.line_for_audio
             gsm_state.line_for_audio = None
+
             if line == gsm_state.previous_line_for_audio:
                 logger.info("Line is the same as the last one, skipping processing.")
-                if get_config().advanced.
-                    play_audio_in_external(gsm_state.previous_audio)
-                elif get_config().advanced.video_player_path:
+                if get_config().advanced.video_player_path:
                     play_video_in_external(line, video_path)
                 else:
-
-
-
-
+                    # Use cached audio data with safe playback
+                    if gsm_state.previous_audio:
+                        data, samplerate = gsm_state.previous_audio
+                        play_audio_data_safe(data, samplerate)
                 return
+
             gsm_state.previous_line_for_audio = line
-
-
-                temporary=True)
-                play_audio_in_external(audio)
-                gsm_state.previous_audio = audio
-            elif get_config().advanced.video_player_path:
+
+            if get_config().advanced.video_player_path:
                 play_video_in_external(line, video_path)
             else:
-
+                # Extract audio and play with safe method
                 import soundfile as sf
                 audio = get_audio_from_video(line, line.next.time if line.next else None, video_path,
                                              temporary=True)
                 data, samplerate = sf.read(audio)
-
-
-
+                data = data.astype('float32')
+
+                # Use safe audio playback
+                success = play_audio_data_safe(data, samplerate)
+                if success:
+                    gsm_state.previous_audio = (data, samplerate)
             return
+
         if gsm_state.line_for_screenshot:
             line: GameLine = gsm_state.line_for_screenshot
             gsm_state.line_for_screenshot = None