GameSentenceMiner 2.9.22-py3-none-any.whl → 2.9.24-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/config_gui.py +15 -0
- GameSentenceMiner/gsm.py +2 -1
- GameSentenceMiner/ocr/owocr_area_selector.py +3 -1
- GameSentenceMiner/ocr/owocr_helper.py +49 -26
- GameSentenceMiner/owocr/owocr/__init__.py +1 -1
- GameSentenceMiner/owocr/owocr/ocr.py +186 -25
- GameSentenceMiner/owocr/owocr/run.py +5 -5
- GameSentenceMiner/util/configuration.py +2 -0
- GameSentenceMiner/util/ffmpeg.py +48 -12
- GameSentenceMiner/util/model.py +22 -0
- GameSentenceMiner/vad.py +55 -33
- {gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/METADATA +1 -1
- {gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/RECORD +17 -17
- {gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/top_level.txt +0 -0
GameSentenceMiner/config_gui.py
CHANGED
@@ -226,6 +226,9 @@ class ConfigApp:
                 trim_beginning=self.vad_trim_beginning.get(),
                 beginning_offset=float(self.vad_beginning_offset.get()),
                 add_audio_on_no_results=self.add_audio_on_no_results.get(),
+                language=self.language.get(),
+                cut_and_splice_segments=self.cut_and_splice_segments.get(),
+                splice_padding=float(self.splice_padding.get()) if self.splice_padding.get() else 0.0,
             ),
             advanced=Advanced(
                 audio_player_path=self.audio_player_path.get(),
@@ -497,6 +500,18 @@ class ConfigApp:
         ttk.Checkbutton(vad_frame, variable=self.add_audio_on_no_results).grid(row=self.current_row, column=1, sticky='W')
         self.add_label_and_increment_row(vad_frame, "Add audio even if no results are found by VAD.", row=self.current_row, column=2)
 
+        ttk.Label(vad_frame, text="Cut and Splice Segments:").grid(row=self.current_row, column=0, sticky='W')
+        self.cut_and_splice_segments = tk.BooleanVar(value=self.settings.vad.cut_and_splice_segments)
+        ttk.Checkbutton(vad_frame, variable=self.cut_and_splice_segments).grid(row=self.current_row, column=1, sticky='W')
+        self.add_label_and_increment_row(vad_frame, "Enable to cut and splice audio segments together based on VAD results.", row=self.current_row, column=2)
+
+        ttk.Label(vad_frame, text="Splice Padding (seconds):").grid(row=self.current_row, column=0, sticky='W')
+        self.splice_padding = ttk.Entry(vad_frame)
+        self.splice_padding.insert(0, str(self.settings.vad.splice_padding))
+        self.splice_padding.grid(row=self.current_row, column=1)
+        self.add_label_and_increment_row(vad_frame, "Padding in seconds added to spliced audio segments. WARNING: This may result in duplicated voicelines if too high!", row=self.current_row, column=2)
+
+
 
     @new_tab
     def create_paths_tab(self):
GameSentenceMiner/gsm.py
CHANGED
@@ -10,7 +10,8 @@ os.environ.pop('TCL_LIBRARY', None)
 from GameSentenceMiner.util.gsm_utils import wait_for_stable_file, make_unique_file_name, run_new_thread
 from GameSentenceMiner.util.communication.send import send_restart_signal
 from GameSentenceMiner.util.downloader.download_tools import download_obs_if_needed, download_ffmpeg_if_needed
-from GameSentenceMiner.vad import vad_processor
+from GameSentenceMiner.vad import vad_processor
+from GameSentenceMiner.util.model import VADResult
 
 try:
     import os.path
GameSentenceMiner/ocr/owocr_area_selector.py
CHANGED
@@ -269,7 +269,7 @@ class ScreenSelector:
         """Saves rectangles to the config file, using percentages if window is targeted."""
         # Use the window geometry found during __init__ for consistency during save
         window_geom_to_save = self.target_window_geometry
-        save_coord_system =
+        save_coord_system = COORD_SYSTEM_PERCENTAGE  # Default if no window
 
         config_path = self.get_scene_ocr_config()
         print(f"Saving rectangles to: {config_path}")
@@ -334,6 +334,8 @@ class ScreenSelector:
             serializable_rects.append(rect_data)
 
         # Prepare final data structure for JSON
+        if not self.rectangles or len(self.rectangles) == 0:
+            save_coord_system = COORD_SYSTEM_PERCENTAGE
         save_data = {
             "scene": obs.get_current_scene() or "default_scene",
             "window": self.window_name,  # Store targeted window name
GameSentenceMiner/ocr/owocr_helper.py
CHANGED
@@ -82,7 +82,7 @@ def get_ocr_config(window=None) -> OCRConfig:
     scene = sanitize_filename(obs.get_current_scene())
     config_path = ocr_config_dir / f"{scene}.json"
     if not config_path.exists():
-        ocr_config = OCRConfig(scene=scene, window=window, rectangles=[], coordinate_system="")
+        ocr_config = OCRConfig(scene=scene, window=window, rectangles=[], coordinate_system="percentage")
         with open(config_path, 'w', encoding="utf-8") as f:
             json.dump(ocr_config.to_dict(), f, indent=4)
         return ocr_config
@@ -212,7 +212,7 @@ def do_second_ocr(ocr1_text, time, img, filtering):
     global twopassocr, ocr2, last_ocr2_result
     try:
         orig_text, text = run.process_and_write_results(img, None, last_ocr2_result, filtering, None,
-                                                        engine=ocr2)
+                                                        engine=ocr2, furigana_filter_sensitivity=furigana_filter_sensitivity)
         if fuzz.ratio(last_ocr2_result, text) >= 90:
             logger.info("Seems like the same text from previous ocr2 result, not sending")
             return
@@ -341,27 +341,31 @@ def run_oneocr(ocr_config: OCRConfig, rectangles):
     exclusions = list(rect.coordinates for rect in list(filter(lambda x: x.is_excluded, rectangles)))
 
     run.init_config(False)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        run.run(read_from="screencapture" if not ssonly else "clipboard" if ss_clipboard else "",
+                read_from_secondary="clipboard" if ss_clipboard and not ssonly else None,
+                write_to="callback",
+                screen_capture_area=screen_area,
+                # screen_capture_monitor=monitor_config['index'],
+                screen_capture_window=ocr_config.window if ocr_config and ocr_config.window else None,
+                screen_capture_only_active_windows=get_requires_open_window(),
+                screen_capture_delay_secs=get_ocr_scan_rate(), engine=ocr1,
+                text_callback=text_callback,
+                screen_capture_exclusions=exclusions,
+                language=language,
+                monitor_index=None,
+                ocr1=ocr1,
+                ocr2=ocr2,
+                gsm_ocr_config=ocr_config,
+                screen_capture_areas=screen_areas,
+                furigana_filter_sensitivity=furigana_filter_sensitivity)
+    except Exception as e:
+        logger.exception(f"Error running OneOCR: {e}")
     done = True
 
 
 
-def add_ss_hotkey():
+def add_ss_hotkey(ss_hotkey="ctrl+shift+g"):
     import keyboard
     cropper = ScreenCropper()
     filtering = TextFiltering()
@@ -369,9 +373,23 @@ def add_ss_hotkey():
         print("Taking screenshot...")
         img = cropper.run()
         do_second_ocr("", datetime.now(), img, filtering)
-
-
-
+    try:
+        raise Exception("Forcing keyboard hotkey setup, this is a test.")
+        keyboard.add_hotkey(ss_hotkey, capture)
+        print(f"Press {ss_hotkey} to take a screenshot.")
+    except Exception as e:
+        logger.error(f"Error setting up screenshot hotkey with keyboard, Attempting Backup: {e}")
+        logger.debug(e)
+        pynput_hotkey = ss_hotkey.replace("ctrl", "<ctrl>").replace("shift", "<shift>").replace("alt", "<alt>")
+        try:
+            from pynput import keyboard as pynput_keyboard
+            listener = pynput_keyboard.GlobalHotKeys({
+                pynput_hotkey: capture
+            })
+            listener.start()
+            print(f"Press {pynput_hotkey} to take a screenshot.")
+        except Exception as e:
+            logger.error(f"Error setting up screenshot hotkey with pynput, Screenshot Hotkey Will not work: {e}")
 
 
 def get_window(window_name):
@@ -402,7 +420,7 @@ def set_force_stable_hotkey():
     print("Press Ctrl+Shift+F to toggle force stable mode.")
 
 if __name__ == "__main__":
-    global ocr1, ocr2, twopassocr, language, ss_clipboard, ss, ocr_config
+    global ocr1, ocr2, twopassocr, language, ss_clipboard, ss, ocr_config, furigana_filter_sensitivity
    import sys
 
    import argparse
@@ -415,6 +433,8 @@ if __name__ == "__main__":
     parser.add_argument("--ssonly", action="store_true", help="Use screenshot-only mode")
     parser.add_argument("--clipboard", action="store_true", help="Use clipboard for input")
     parser.add_argument("--window", type=str, help="Specify the window name for OCR")
+    parser.add_argument("--furigana_filter_sensitivity", type=float, default=0, help="Furigana Filter Sensitivity for OCR (default: 0)")
+    parser.add_argument("--manual_ocr_hotkey", type=str, default="ctrl+shift+g", help="Hotkey for manual OCR (default: ctrl+shift+g)")
 
     args = parser.parse_args()
 
@@ -425,9 +445,11 @@
     ssonly = args.ssonly
     ss_clipboard = args.clipboard
     window_name = args.window
-
+    furigana_filter_sensitivity = args.furigana_filter_sensitivity
+    ss_hotkey = args.manual_ocr_hotkey.lower()
+
+    logger.info(f"Received arguments: {vars(args)}")
     # set_force_stable_hotkey()
-    set_dpi_awareness()
     ocr_config = None
     if not ssonly:
         ocr_config: OCRConfig = get_ocr_config(window=window_name)
@@ -443,6 +465,7 @@ if __name__ == "__main__":
             logger.error(f"Window '{ocr_config.window}' not found within 30 seconds.")
             sys.exit(1)
     logger.info(f"Starting OCR with configuration: Window: {ocr_config.window}, Rectangles: {ocr_config.rectangles}, Engine 1: {ocr1}, Engine 2: {ocr2}, Two-pass OCR: {twopassocr}")
+    set_dpi_awareness()
     if ssonly or ocr_config:
         rectangles = ocr_config.rectangles if ocr_config and ocr_config.rectangles else []
         oneocr_threads = []
@@ -453,7 +476,7 @@
         worker_thread.start()
         websocket_server_thread = WebsocketServerThread(read=True)
         websocket_server_thread.start()
-        add_ss_hotkey()
+        add_ss_hotkey(ss_hotkey)
         try:
             while not done:
                 time.sleep(1)
GameSentenceMiner/owocr/owocr/__init__.py
CHANGED
@@ -1 +1 @@
-from owocr.ocr import *
+from GameSentenceMiner.owocr.owocr.ocr import *
GameSentenceMiner/owocr/owocr/ocr.py
CHANGED
@@ -17,6 +17,8 @@ from google.generativeai import GenerationConfig
 from loguru import logger
 import requests
 
+from GameSentenceMiner.util.configuration import get_app_directory, get_temporary_directory
+
 try:
     from manga_ocr import MangaOcr as MOCR
 except ImportError:
@@ -73,9 +75,10 @@
 
 try:
     import betterproto
-    from .lens_betterproto import *
+    from GameSentenceMiner.owocr.owocr.lens_betterproto import *
     import random
 except ImportError:
+    print('Google Lens not available, please install betterproto package!')
     pass
 
 try:
@@ -174,7 +177,7 @@ class MangaOcr:
         self.available = True
         logger.info('Manga OCR ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -204,7 +207,7 @@ class GoogleVision:
         except:
             logger.warning('Error parsing Google credentials, Google Vision will not work!')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -234,13 +237,14 @@ class GoogleLens:
     available = False
 
     def __init__(self):
+        self.kana_kanji_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
         if 'betterproto' not in sys.modules:
             logger.warning('betterproto not available, Google Lens will not work!')
         else:
             self.available = True
             logger.info('Google Lens ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -269,7 +273,7 @@ class GoogleLens:
         image_data = self._preprocess(img)
         request.objects_request.image_data.payload.image_bytes = image_data[0]
         request.objects_request.image_data.image_metadata.width = image_data[1]
-        request.objects_request.image_data.image_metadata.height = image_data[2]
+        request.objects_request.image_data.image_metadata.height = image_data[2]
 
         payload = request.SerializeToString()
 
@@ -299,15 +303,59 @@ class GoogleLens:
         response_proto = LensOverlayServerResponse().FromString(res.content)
         response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)
 
+        # with open(os.path.join(get_temporary_directory(), 'glens_response.json'), 'w', encoding='utf-8') as f:
+        #     json.dump(response_dict, f, indent=4, ensure_ascii=False)
         res = ''
         text = response_dict['objects_response']['text']
-        if
-
-
-
-
-
+        if furigana_filter_sensitivity > 0:
+            if 'text_layout' in text:
+                for paragraph in text['text_layout']['paragraphs']:
+                    for line in paragraph['lines']:
+                        if furigana_filter_sensitivity < line['geometry']['bounding_box']['width'] * img.width and furigana_filter_sensitivity < line['geometry']['bounding_box']['height'] * img.height:
+                            for word in line['words']:
+                                res += word['plain_text'] + word['text_separator']
+                        else:
+                            continue
+                        res += '\n'
+
+            # widths = []
+            # heights = []
+            # if 'text_layout' in text:
+            #     paragraphs = text['text_layout']['paragraphs']
+            #     for paragraph in paragraphs:
+            #         for line in paragraph['lines']:
+            #             for word in line['words']:
+            #                 if self.kana_kanji_regex.search(word['plain_text']) is None:
+            #                     continue
+            #                 widths.append(word['geometry']['bounding_box']['width'])
+            #                 heights.append(word['geometry']['bounding_box']['height'])
+            #
+            # max_width = max(sorted(widths)[:-max(1, len(widths) // 10)]) if len(widths) > 1 else 0
+            # max_height = max(sorted(heights)[:-max(1, len(heights) // 10)]) if len(heights) > 1 else 0
+            #
+            # required_width = max_width * furigana_filter_sensitivity
+            # required_height = max_height * furigana_filter_sensitivity
+            #
+            # if 'text_layout' in text:
+            #     paragraphs = text['text_layout']['paragraphs']
+            #     for paragraph in paragraphs:
+            #         for line in paragraph['lines']:
+            #             if furigana_filter_sensitivity == 0 or line['geometry']['bounding_box']['width'] > required_width or line['geometry']['bounding_box']['height'] > required_height:
+            #                 for word in line['words']:
+            #                     res += word['plain_text'] + word['text_separator']
+            #             else:
+            #                 continue
+            #             res += '\n'
+        else:
+            if 'text_layout' in text:
+                paragraphs = text['text_layout']['paragraphs']
+                for paragraph in paragraphs:
+                    for line in paragraph['lines']:
+                        for word in line['words']:
+                            res += word['plain_text'] + word['text_separator']
+                        else:
+                            continue
+                        res += '\n'
 
         x = (True, res)
 
@@ -339,7 +387,7 @@ class GoogleLensWeb:
         self.available = True
         logger.info('Google Lens (web) ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -432,7 +480,7 @@ class Bing:
         self.available = True
         logger.info('Bing ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -568,7 +616,7 @@ class AppleVision:
         self.available = True
         logger.info('Apple Vision ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -647,7 +695,7 @@ class AppleLiveText:
         self.available = True
         logger.info('Apple Live Text ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -704,7 +752,7 @@ class WinRTOCR:
         except:
             logger.warning('Error reading URL from config, WinRT OCR will not work!')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -727,6 +775,7 @@ class WinRTOCR:
 
         x = (True, res)
 
+
         # img.close()
         return x
 
@@ -740,6 +789,7 @@ class OneOCR:
     available = False
 
     def __init__(self, config={}):
+        self.kana_kanji_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
         if sys.platform == 'win32':
             if int(platform.release()) < 10:
                 logger.warning('OneOCR is not supported on Windows older than 10!')
@@ -761,7 +811,7 @@ class OneOCR:
         except:
             logger.warning('Error reading URL from config, OneOCR will not work!')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if img.width < 51 or img.height < 51:
             new_width = max(img.width, 51)
@@ -782,7 +832,60 @@ class OneOCR:
             if x_coords and y_coords:
                 crop_coords = (min(x_coords) - 5, min(y_coords) - 5, max(x_coords) + 5, max(y_coords) + 5)
 
-
+                # with open(os.path.join(get_temporary_directory(), 'oneocr_response.json'), 'w',
+                #           encoding='utf-8') as f:
+                #     json.dump(ocr_resp, f, indent=4, ensure_ascii=False)
+                res = ''
+                if furigana_filter_sensitivity > 0:
+                    for line in ocr_resp['lines']:
+                        if self.kana_kanji_regex.search(line['text']) is None:
+                            continue
+                        x1, x2, x3, x4 = line['bounding_rect']['x1'], line['bounding_rect']['x2'], \
+                            line['bounding_rect']['x3'], line['bounding_rect']['x4']
+                        y1, y2, y3, y4 = line['bounding_rect']['y1'], line['bounding_rect']['y2'], \
+                            line['bounding_rect']['y3'], line['bounding_rect']['y4']
+                        width = max(x2 - x1, x3 - x4)
+                        height = max(y3 - y1, y4 - y2)
+                        if width > furigana_filter_sensitivity and height > furigana_filter_sensitivity:
+                            res += line['text']
+                        else:
+                            continue
+                        res += '\n'
+                    # widths, heights = [], []
+                    # for line in ocr_resp['lines']:
+                    #     for word in line['words']:
+                    #         if self.kana_kanji_regex.search(word['text']) is None:
+                    #             continue
+                    #         # x1, x2, x3, x4 = line['bounding_rect']['x1'], line['bounding_rect']['x2'], line['bounding_rect']['x3'], line['bounding_rect']['x4']
+                    #         # y1, y2, y3, y4 = line['bounding_rect']['y1'], line['bounding_rect']['y2'], line['bounding_rect']['y3'], line['bounding_rect']['y4']
+                    #         x1, x2, x3, x4 = word['bounding_rect']['x1'], word['bounding_rect']['x2'], \
+                    #             word['bounding_rect']['x3'], word['bounding_rect']['x4']
+                    #         y1, y2, y3, y4 = word['bounding_rect']['y1'], word['bounding_rect']['y2'], \
+                    #             word['bounding_rect']['y3'], word['bounding_rect']['y4']
+                    #         widths.append(max(x2 - x1, x3 - x4))
+                    #         heights.append(max(y2 - y1, y3 - y4))
+                    #
+                    #
+                    # max_width = max(sorted(widths)[:-max(1, len(widths) // 10)]) if len(widths) > 1 else 0
+                    # max_height = max(sorted(heights)[:-max(1, len(heights) // 10)]) if len(heights) > 1 else 0
+                    #
+                    # required_width = max_width * furigana_filter_sensitivity
+                    # required_height = max_height * furigana_filter_sensitivity
+                    # for line in ocr_resp['lines']:
+                    #     for word in line['words']:
+                    #         x1, x2, x3, x4 = word['bounding_rect']['x1'], word['bounding_rect']['x2'], \
+                    #             word['bounding_rect']['x3'], word['bounding_rect']['x4']
+                    #         y1, y2, y3, y4 = word['bounding_rect']['y1'], word['bounding_rect']['y2'], \
+                    #             word['bounding_rect']['y3'], word['bounding_rect']['y4']
+                    #         width = max(x2 - x1, x3 - x4)
+                    #         height = max(y2 - y1, y3 - y4)
+                    #         if furigana_filter_sensitivity == 0 or width > required_width or height > required_height:
+                    #             res += word['text']
+                    #         else:
+                    #             continue
+                    #         res += '\n'
+                else:
+                    res = ocr_resp['text']
         except RuntimeError as e:
             return (False, e)
         else:
@@ -825,7 +928,7 @@ class AzureImageAnalysis:
         except:
             logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -877,7 +980,7 @@ class EasyOCR:
         self.available = True
         logger.info('EasyOCR ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -923,7 +1026,7 @@ class RapidOCR:
         self.available = True
         logger.info('RapidOCR ready')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -957,7 +1060,7 @@ class OCRSpace:
         except:
             logger.warning('Error reading API key from config, OCRSpace will not work!')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         img = input_to_pil_image(img)
         if not img:
             return (False, 'Invalid image provided')
@@ -1027,7 +1130,7 @@ class GeminiOCR:
         except Exception as e:
             logger.error(f'Error configuring google-generativeai: {e}')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         if not self.available:
             return (False, 'GeminiOCR is not available due to missing API key or configuration error.')
 
@@ -1089,7 +1192,7 @@ class GroqOCR:
         except Exception as e:
             logger.error(f'Error initializing Groq client: {e}')
 
-    def __call__(self, img):
+    def __call__(self, img, furigana_filter_sensitivity=0):
         if not self.available:
             return (False, 'GroqOCR is not available due to missing API key or configuration error.')
 
@@ -1139,3 +1242,61 @@ class GroqOCR:
 
     def _preprocess(self, img):
         return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')
+
+# class LocalOCR:
+#     name = 'local_ocr'
+#     readable_name = 'Local OCR'
+#     key = '-'
+#     available = False
+#
+#     def __init__(self):
+#         self.requests_session = requests.Session()
+#         self.available = True
+#         # logger.info('Local OCR ready') # Uncomment if you have a logger defined
+#
+#     def __call__(self, img, furigana_filter_sensitivity=0):
+#         if not isinstance(img, Image.Image):
+#             try:
+#                 img = Image.open(io.BytesIO(img))
+#             except Exception:
+#                 return (False, 'Invalid image provided')
+#
+#         img = input_to_pil_image(img)
+#
+#         img_base64 = self._preprocess(img)
+#         if not img_base64:
+#             return (False, 'Image preprocessing failed (e.g., too big after resize)!')
+#
+#         api_url = 'http://localhost:2333/api/ocr'
+#         # Send as JSON with base64 encoded image
+#         json_data = {
+#             'image': img_base64
+#         }
+#
+#         try:
+#             res = self.requests_session.post(api_url, json=json_data, timeout=5)
+#             print(res.content)
+#         except requests.exceptions.Timeout:
+#             return (False, 'Request timeout!')
+#         except requests.exceptions.ConnectionError:
+#             return (False, 'Connection error!')
+#
+#         if res.status_code != 200:
+#             return (False, f'Error: {res.status_code} - {res.text}')
+#
+#         try:
+#             data = res.json()
+#             # Assuming the local OCR service returns text in a 'text' key
+#             extracted_text = data.get('text', '')
+#             return (True, extracted_text)
+#         except requests.exceptions.JSONDecodeError:
+#             return (False, 'Invalid JSON response from OCR service!')
+#
+#     def _preprocess(self, img):
+#         return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')
+
+# lens = GoogleLens()
+#
+# res, text = lens(Image.open('test_furigana.png'), furigana_filter_sensitivity=.6) # Example usage
+#
+# print(text)
GameSentenceMiner/owocr/owocr/run.py
CHANGED
@@ -315,7 +315,6 @@ class TextFiltering:
         self.kana_kanji_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
         self.chinese_common_regex = re.compile(r'[\u4E00-\u9FFF]')
         self.english_regex = re.compile(r'[a-zA-Z0-9.,!?;:"\'()\[\]{}]')
-        self.kana_kanji_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
         self.chinese_common_regex = re.compile(r'[\u4E00-\u9FFF]')
         self.english_regex = re.compile(r'[a-zA-Z0-9.,!?;:"\'()\[\]{}]')
         self.korean_regex = re.compile(r'[\uAC00-\uD7AF]')
@@ -880,7 +879,7 @@ def on_window_minimized(minimized):
     screencapture_window_visible = not minimized
 
 
-def process_and_write_results(img_or_path, write_to=None, last_result=None, filtering=None, notify=None, engine=None, ocr_start_time=None):
+def process_and_write_results(img_or_path, write_to=None, last_result=None, filtering=None, notify=None, engine=None, ocr_start_time=None, furigana_filter_sensitivity=0):
     global engine_index
     if auto_pause_handler:
         auto_pause_handler.stop()
@@ -896,7 +895,7 @@ def process_and_write_results(img_or_path, write_to=None, last_result=None, filt
     engine_color = config.get_general('engine_color')
 
     start_time = time.time()
-    result = engine_instance(img_or_path)
+    result = engine_instance(img_or_path, furigana_filter_sensitivity)
     res, text, crop_coords = (*result, None)[:3]
 
 
@@ -909,7 +908,7 @@ def process_and_write_results(img_or_path, write_to=None, last_result=None, filt
                 last_result = []
                 break
             start_time = time.time()
-            result = engine_instance(img_or_path)
+            result = engine_instance(img_or_path, furigana_filter_sensitivity)
            res, text, crop_coords = (*result, None)[:3]
 
             end_time = time.time()
@@ -989,6 +988,7 @@ def run(read_from=None,
        ocr1=None,
        ocr2=None,
        gsm_ocr_config=None,
+       furigana_filter_sensitivity=None,
        ):
    """
    Japanese OCR client
@@ -1243,7 +1243,7 @@
                 break
             elif img:
                 if filter_img:
-                    res, _ = process_and_write_results(img, write_to, last_result, filtering, notify, ocr_start_time=ocr_start_time)
+                    res, _ = process_and_write_results(img, write_to, last_result, filtering, notify, ocr_start_time=ocr_start_time, furigana_filter_sensitivity=furigana_filter_sensitivity)
                 if res:
                     last_result = (res, engine_index)
             else:
GameSentenceMiner/util/configuration.py
CHANGED
@@ -207,6 +207,8 @@ class VAD:
     trim_beginning: bool = False
     beginning_offset: float = -0.25
     add_audio_on_no_results: bool = False
+    cut_and_splice_segments: bool = False
+    splice_padding: float = 0.1
 
     def is_silero(self):
         return self.selected_vad_model == SILERO or self.backup_vad_model == SILERO
GameSentenceMiner/util/ffmpeg.py
CHANGED
@@ -7,6 +7,7 @@ from GameSentenceMiner import obs
 from GameSentenceMiner.util.gsm_utils import make_unique_file_name, get_file_modification_time
 from GameSentenceMiner.util import configuration
 from GameSentenceMiner.util.configuration import *
+from GameSentenceMiner.util.model import VADResult
 from GameSentenceMiner.util.text_log import initial_time
 
 
@@ -18,6 +19,13 @@ def get_ffprobe_path():
 
 ffmpeg_base_command_list = [get_ffmpeg_path(), "-hide_banner", "-loglevel", "error", '-nostdin']
 
+supported_formats = {
+    'opus': 'libopus',
+    'mp3': 'libmp3lame',
+    'ogg': 'libvorbis',
+    'aac': 'aac',
+    'm4a': 'aac',
+}
 
 def call_frame_extractor(video_path, timestamp):
     """
@@ -247,14 +255,6 @@ def get_audio_codec(video_path):
 
 
 def get_audio_and_trim(video_path, game_line, next_line_time, anki_card_creation_time):
-    supported_formats = {
-        'opus': 'libopus',
-        'mp3': 'libmp3lame',
-        'ogg': 'libvorbis',
-        'aac': 'aac',
-        'm4a': 'aac',
-    }
-
     codec = get_audio_codec(video_path)
 
     if codec == get_config().audio.extension:
@@ -528,23 +528,59 @@ def convert_audio_to_mp3(input_audio):
 
 
 # Trim the audio using FFmpeg based on detected speech timestamps
-def trim_audio(input_audio, start_time, end_time, output_audio
+def trim_audio(input_audio, start_time, end_time, output_audio, trim_beginning=False, fade_in_duration=0.05,
+               fade_out_duration=0.05):
     command = ffmpeg_base_command_list.copy()
 
     command.extend(['-i', input_audio])
 
-    if
+    if trim_beginning and start_time > 0:
         logger.debug(f"trimming beginning to {start_time}")
         command.extend(['-ss', f"{start_time:.2f}"])
 
+    fade_filter = []
+    if fade_in_duration > 0:
+        fade_filter.append(f'afade=t=in:d={fade_in_duration}')
+    if fade_out_duration > 0:
+        fade_filter.append(f'afade=t=out:st={end_time - fade_out_duration:.2f}:d={fade_out_duration}')
+        # fade_filter.append(f'afade=t=out:d={fade_out_duration}')
+
     command.extend([
         '-to', f"{end_time:.2f}",
-        '-c', 'copy',
-        output_audio
     ])
 
+    if fade_filter:
+        command.extend(['-af', f'afade=t=in:d={fade_in_duration},afade=t=out:st={end_time - fade_out_duration:.2f}:d={fade_out_duration}'])
+        command.extend(['-c:a', supported_formats[get_config().audio.extension]])
+    else:
+        command.extend(['-c', 'copy'])
+
+    command.append(output_audio)
+
     logger.debug(" ".join(command))
 
+    try:
+        subprocess.run(command, check=True)
+    except subprocess.CalledProcessError as e:
+        logger.error(f"FFmpeg command failed with error: {e}")
+        logger.error(f"Command: {' '.join(command)}")
+    except FileNotFoundError:
+        logger.error("FFmpeg not found. Please ensure FFmpeg is installed and in your PATH.")
+
+
+def combine_audio_files(audio_files, output_file):
+    if not audio_files:
+        logger.error("No audio files provided for combination.")
+        return
+
+    command = ffmpeg_base_command_list + [
+        "-i", "concat:" + "|".join(audio_files),
+        "-c", "copy",
+        output_file
+    ]
+
+    logger.debug("Combining audio files with command: " + " ".join(command))
+
     subprocess.run(command)
 
 
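
Note: trim_audio now builds one of two command shapes. With fades it must re-encode, which is why the module-level supported_formats map is consulted for the output codec; with fade_in_duration=0 and fade_out_duration=0 it stream-copies as before. For a hypothetical in.opus trimmed from 1.25 s to 9.80 s the argument lists look roughly like this (the fade-out start is end_time - fade_out_duration):

    # with fades: re-encode using the codec mapped from the configured extension
    cmd_fade = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-nostdin",
                "-i", "in.opus",
                "-ss", "1.25",  # only present when trim_beginning and start_time > 0
                "-to", "9.80",
                "-af", "afade=t=in:d=0.05,afade=t=out:st=9.75:d=0.05",
                "-c:a", "libopus",  # supported_formats['opus']
                "out.opus"]

    # without fades: plain stream copy
    cmd_copy = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-nostdin",
                "-i", "in.opus", "-ss", "1.25", "-to", "9.80", "-c", "copy", "out.opus"]

combine_audio_files relies on FFmpeg's concat: protocol, which byte-concatenates its inputs; that is only well defined for containers where raw concatenation is valid (e.g. MPEG-TS, MP3), so behavior for other extensions may vary.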
GameSentenceMiner/util/model.py
CHANGED
@@ -175,3 +175,25 @@ class AnkiCard:
             return True, key
 
         return False, None
+
+
+class VADResult:
+    def __init__(self, success: bool, start: float, end: float, model: str, segments: list = None, output_audio: str = None):
+        self.success = success
+        self.start = start
+        self.end = end
+        self.model = model
+        self.segments = segments if segments is not None else []
+        self.output_audio = None
+
+    def __repr__(self):
+        return f"VADResult(success={self.success}, start={self.start}, end={self.end}, model={self.model}, output_audio={self.output_audio})"
+
+    def trim_successful_string(self):
+        if self.success:
+            if get_config().vad.trim_beginning:
+                return f"Trimmed audio from {self.start:.2f} to {self.end:.2f} seconds using {self.model}."
+            else:
+                return f"Trimmed end of audio to {self.end:.2f} seconds using {self.model}."
+        else:
+            return f"Failed to trim audio using {self.model}."
GameSentenceMiner/vad.py
CHANGED
@@ -1,31 +1,15 @@
 import subprocess
 import tempfile
+import time
 import warnings
 from abc import abstractmethod, ABC
 
 from GameSentenceMiner.util import configuration, ffmpeg
 from GameSentenceMiner.util.configuration import *
 from GameSentenceMiner.util.ffmpeg import get_ffprobe_path
+from GameSentenceMiner.util.gsm_utils import make_unique_file_name, run_new_thread
+from GameSentenceMiner.util.model import VADResult
 
-class VADResult:
-    def __init__(self, success: bool, start: float, end: float, model: str, output_audio: str = None):
-        self.success = success
-        self.start = start
-        self.end = end
-        self.model = model
-        self.output_audio = None
-
-    def __repr__(self):
-        return f"VADResult(success={self.success}, start={self.start}, end={self.end}, model={self.model}, output_audio={self.output_audio})"
-
-    def trim_successful_string(self):
-        if self.success:
-            if get_config().vad.trim_beginning:
-                return f"Trimmed audio from {self.start:.2f} to {self.end:.2f} seconds using {self.model}."
-            else:
-                return f"Trimmed end of audio to {self.end:.2f} seconds using {self.model}."
-        else:
-            return f"Failed to trim audio using {self.model}."
 
 class VADSystem:
     def __init__(self):
@@ -108,6 +92,29 @@ class VADProcessor(ABC):
         )
         return float(result.stdout.strip())
 
+    @staticmethod
+    def extract_audio_and_combine_segments(input_audio, segments, output_audio, padding=0.2):
+        files = []
+        ffmpeg_threads = []
+        logger.info(f"Extracting {len(segments)} segments from {input_audio} with padding {padding} seconds.")
+        for segment in segments:
+            logger.info(segment)
+            temp_file = make_unique_file_name(os.path.join(get_temporary_directory(), "segment." + get_config().audio.extension))
+            files.append(temp_file)
+            ffmpeg_threads.append(run_new_thread(lambda: ffmpeg.trim_audio(input_audio, segment['start'] - padding, segment['end'] + padding, temp_file, trim_beginning=True)))
+            time.sleep(0.1)  # Small delay to ensure unique file names
+
+        for thread in ffmpeg_threads:
+            thread.join()
+
+        if len(files) > 1:
+            ffmpeg.combine_audio_files(files, output_audio)
+            # for file in files:
+            #     os.remove(file)
+        else:
+            shutil.move(files[0], output_audio)
+
+
     def process_audio(self, input_audio, output_audio, game_line):
         voice_activity = self._detect_voice_activity(input_audio)
 
@@ -124,8 +131,11 @@ class VADProcessor(ABC):
         if 0 > audio_length - voice_activity[-1]['start'] + get_config().audio.beginning_offset:
             end_time = voice_activity[-2]['end']
 
-
-
+        if get_config().vad.cut_and_splice_segments:
+            self.extract_audio_and_combine_segments(input_audio, voice_activity, output_audio, padding=get_config().vad.splice_padding)
+        else:
+            ffmpeg.trim_audio(input_audio, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, output_audio, trim_beginning=get_config().vad.trim_beginning, fade_in_duration=0, fade_out_duration=0)
+        return VADResult(True, start_time + get_config().vad.beginning_offset, end_time + get_config().audio.end_offset, self.vad_system_name, voice_activity, output_audio)
 
 class SileroVADProcessor(VADProcessor):
     def __init__(self):
@@ -176,17 +186,23 @@ class WhisperVADProcessor(VADProcessor):
     # Process the segments to extract tokens, timestamps, and confidence
     for segment in result.segments:
         logger.debug(segment.to_dict())
-
-
-
-
-
-
-
-
-
-
-
+        voice_activity.append({
+            'text': segment.text,
+            'start': segment.start,
+            'end': segment.end,
+            'confidence': segment.avg_logprob
+        })
+        # for word in segment.words:
+        #     logger.debug(word.to_dict())
+        #     confidence = word.probability
+        #     if confidence > .1:
+        #         logger.debug(word)
+        #         voice_activity.append({
+        #             'text': word.word,
+        #             'start': word.start,
+        #             'end': word.end,
+        #             'confidence': word.probability
+        #         })
 
     # Analyze the detected words to decide whether to use the audio
     should_use = False
@@ -342,4 +358,10 @@ class GroqVADProcessor(VADProcessor):
             logger.error(f"Error detecting voice with Groq: {e}")
             return [], 0.0
 
-vad_processor = VADSystem()
+vad_processor = VADSystem()
+
+# test_vad = SileroVADProcessor()
+#
+# if os.path.exists(r"C:\Users\Beangate\GSM\Electron App\test\after_splice.opus"):
+#     os.remove(r"C:\Users\Beangate\GSM\Electron App\test\after_splice.opus")
+# test_vad.process_audio(r"C:\Users\Beangate\GSM\Electron App\test\before_splice.opus", r"C:\Users\Beangate\GSM\Electron App\test\after_splice.opus", None)
{gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
 GameSentenceMiner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 GameSentenceMiner/anki.py,sha256=hNHBIoJRrsWIhLe0sehOYPXTWzPREeXl4gYCPHUCaiE,16331
-GameSentenceMiner/config_gui.py,sha256=
+GameSentenceMiner/config_gui.py,sha256=xQXFrDfsjbWxuX0PwGn2jjv0mfRZEZ_MdUU3iMASiB4,84788
 GameSentenceMiner/gametext.py,sha256=nAwGMQSrmc6sUAw-OAURK2n6MG5Ecm6psJ7YF9q5KTA,6623
-GameSentenceMiner/gsm.py,sha256=
+GameSentenceMiner/gsm.py,sha256=oM4l0UP6_h66AVEZqdIShs264A3LEgmSDTlp-gp048o,24167
 GameSentenceMiner/obs.py,sha256=YG8LwBf9BTsGbROm_Uq6LhFDSrbf3jgogp78rBbJq94,14728
-GameSentenceMiner/vad.py,sha256=
+GameSentenceMiner/vad.py,sha256=kWJ6c3v0iJInIiwU9ANYQvweqGi3PpeN_eegJLybfIM,16148
 GameSentenceMiner/ai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 GameSentenceMiner/ai/ai_prompting.py,sha256=0jBAnngNwmc3dqJiVWe_QRy4Syr-muV-ML2rq0FiUtU,10215
 GameSentenceMiner/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,22 +18,22 @@ GameSentenceMiner/assets/pickaxe.png,sha256=VfIGyXyIZdzEnVcc4PmG3wszPMO1W4KCT7Q_
 GameSentenceMiner/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 GameSentenceMiner/ocr/gsm_ocr_config.py,sha256=fEQ2o2NXksGRHpueO8c4TfAp75GEdAtAr1ngTFOsdpg,2257
 GameSentenceMiner/ocr/ocrconfig.py,sha256=_tY8mjnzHMJrLS8E5pHqYXZjMuLoGKYgJwdhYgN-ny4,6466
-GameSentenceMiner/ocr/owocr_area_selector.py,sha256=
-GameSentenceMiner/ocr/owocr_helper.py,sha256=
+GameSentenceMiner/ocr/owocr_area_selector.py,sha256=BXjHh1-NWwDeQ623MkwdR2hu6HBQAk5stpbiLE2WX_4,47289
+GameSentenceMiner/ocr/owocr_helper.py,sha256=idjVB8Ln9PwNUFZMOtRkq7wAhbteU7-haRM7Cf_wmY4,20979
 GameSentenceMiner/ocr/ss_picker.py,sha256=Ck2K9JliXhY_7a7nS0BVsoAmzwPFFtsW_TxHCcfxT00,4274
-GameSentenceMiner/owocr/owocr/__init__.py,sha256=
+GameSentenceMiner/owocr/owocr/__init__.py,sha256=87hfN5u_PbL_onLfMACbc0F5j4KyIK9lKnRCj6oZgR0,49
 GameSentenceMiner/owocr/owocr/__main__.py,sha256=XQaqZY99EKoCpU-gWQjNbTs7Kg17HvBVE7JY8LqIE0o,157
 GameSentenceMiner/owocr/owocr/config.py,sha256=qM7kISHdUhuygGXOxmgU6Ef2nwBShrZtdqu4InDCViE,8103
 GameSentenceMiner/owocr/owocr/lens_betterproto.py,sha256=oNoISsPilVVRBBPVDtb4-roJtAhp8ZAuFTci3TGXtMc,39141
-GameSentenceMiner/owocr/owocr/ocr.py,sha256=
-GameSentenceMiner/owocr/owocr/run.py,sha256=
+GameSentenceMiner/owocr/owocr/ocr.py,sha256=uEv0Pjvq7n6XJOkV4i_EiR0wxXqzi2EVpWsK2-i3SXs,51932
+GameSentenceMiner/owocr/owocr/run.py,sha256=bNh9nF0tzFa7BuW-ixcipyf6vTrNyaNwtgG6W4a-uIo,55063
 GameSentenceMiner/owocr/owocr/screen_coordinate_picker.py,sha256=Na6XStbQBtpQUSdbN3QhEswtKuU1JjReFk_K8t5ezQE,3395
 GameSentenceMiner/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-GameSentenceMiner/util/configuration.py,sha256=
+GameSentenceMiner/util/configuration.py,sha256=TAdd-aLMCMvpu7OS1Y5brKQC4svcK29J94nvrEb4Ue4,27535
 GameSentenceMiner/util/electron_config.py,sha256=3VmIrcXhC-wIMMc4uqV85NrNenRl4ZUbnQfSjWEwuig,9852
-GameSentenceMiner/util/ffmpeg.py,sha256=
+GameSentenceMiner/util/ffmpeg.py,sha256=iU0AuVikKrgJ1fvtAHAgm0j31r1GQtzg2h_1nzUbGS8,24802
 GameSentenceMiner/util/gsm_utils.py,sha256=Z_Lu4jSIfUaM2VljIJXQkSJD0UsyJ5hMB46H2NS0gZo,8819
-GameSentenceMiner/util/model.py,sha256=
+GameSentenceMiner/util/model.py,sha256=ROH-uO55im7H4COonyyPZQ8l9-8EPtyOk7l_DNEonbk,6630
 GameSentenceMiner/util/notification.py,sha256=0OnEYjn3DUEZ6c6OtPjdVZe-DG-QSoMAl9fetjjCvNU,3874
 GameSentenceMiner/util/package.py,sha256=u1ym5z869lw5EHvIviC9h9uH97bzUXSXXA8KIn8rUvk,1157
 GameSentenceMiner/util/ss_selector.py,sha256=oCzmDbpEGvVselF-oDPIrBcQktGIZT0Zt16uDLDAHMQ,4493
@@ -61,9 +61,9 @@ GameSentenceMiner/web/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 GameSentenceMiner/web/templates/index.html,sha256=HZKiIjiGJV8PGQ9T2aLDUNSfJn71qOwbYCjbRuSIjpY,213583
 GameSentenceMiner/web/templates/text_replacements.html,sha256=tV5c8mCaWSt_vKuUpbdbLAzXZ3ATZeDvQ9PnnAfqY0M,8598
 GameSentenceMiner/web/templates/utility.html,sha256=3flZinKNqUJ7pvrZk6xu__v67z44rXnaK7UTZ303R-8,16946
-gamesentenceminer-2.9.
-gamesentenceminer-2.9.
-gamesentenceminer-2.9.
-gamesentenceminer-2.9.
-gamesentenceminer-2.9.
-gamesentenceminer-2.9.
+gamesentenceminer-2.9.24.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+gamesentenceminer-2.9.24.dist-info/METADATA,sha256=mmAUr_sxdKxY2_Vdt83NkFBqB4LfCdqY6m8ze8-vrqQ,7276
+gamesentenceminer-2.9.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+gamesentenceminer-2.9.24.dist-info/entry_points.txt,sha256=2APEP25DbfjSxGeHtwBstMH8mulVhLkqF_b9bqzU6vQ,65
+gamesentenceminer-2.9.24.dist-info/top_level.txt,sha256=V1hUY6xVSyUEohb0uDoN4UIE6rUZ_JYx8yMyPGX4PgQ,18
+gamesentenceminer-2.9.24.dist-info/RECORD,,
{gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/WHEEL
File without changes
{gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/entry_points.txt
File without changes
{gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/licenses/LICENSE
File without changes
{gamesentenceminer-2.9.22.dist-info → gamesentenceminer-2.9.24.dist-info}/top_level.txt
File without changes