GameSentenceMiner 2.10.16__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
GameSentenceMiner/gsm.py CHANGED
@@ -97,35 +97,37 @@ class VideoToAudioHandler(FileSystemEventHandler):
                 skip_delete = True
                 return
 
-        mined_line = get_text_event(last_note)
-        gsm_state.last_mined_line = mined_line
-        if os.path.exists(video_path) and os.access(video_path, os.R_OK):
-            logger.debug(f"Video found and is readable: {video_path}")
-            if get_config().obs.minimum_replay_size and not ffmpeg.is_video_big_enough(video_path,
-                                                                                       get_config().obs.minimum_replay_size):
-                logger.debug("Checking if video is big enough")
-                notification.send_check_obs_notification(reason="Video may be empty, check scene in OBS.")
-                logger.error(
-                    f"Video was unusually small, potentially empty! Check OBS for Correct Scene Settings! Path: {video_path}")
-                return
-
         # Just for safety
         if not last_note:
             if get_config().anki.update_anki:
                 last_note = anki.get_last_anki_card()
             if get_config().features.backfill_audio:
                 last_note = anki.get_cards_by_sentence(gametext.current_line_after_regex)
+
+        # Get Info of line mined
         line_cutoff = None
         start_line = None
-        if mined_line:
-            start_line = mined_line
-            if mined_line.next:
-                line_cutoff = mined_line.next.time
-
         if selected_lines:
             start_line = selected_lines[0]
             mined_line = get_mined_line(last_note, selected_lines)
             line_cutoff = selected_lines[-1].get_next_time()
+        else:
+            mined_line = get_text_event(last_note)
+            if mined_line:
+                start_line = mined_line
+                if mined_line.next:
+                    line_cutoff = mined_line.next.time
+        gsm_state.last_mined_line = mined_line
+
+        if os.path.exists(video_path) and os.access(video_path, os.R_OK):
+            logger.debug(f"Video found and is readable: {video_path}")
+            if get_config().obs.minimum_replay_size and not ffmpeg.is_video_big_enough(video_path,
+                                                                                       get_config().obs.minimum_replay_size):
+                logger.debug("Checking if video is big enough")
+                notification.send_check_obs_notification(reason="Video may be empty, check scene in OBS.")
+                logger.error(
+                    f"Video was unusually small, potentially empty! Check OBS for Correct Scene Settings! Path: {video_path}")
+                return
 
         if last_note:
             logger.debug(last_note.to_json())
@@ -195,10 +195,10 @@ all_cords = None
 rectangles = None
 last_ocr2_result = []
 
-def do_second_ocr(ocr1_text, time, img, filtering, ignore_furigana_filter=False):
+def do_second_ocr(ocr1_text, time, img, filtering, ignore_furigana_filter=False, ignore_previous_result=False):
     global twopassocr, ocr2, last_ocr2_result
     try:
-        orig_text, text = run.process_and_write_results(img, None, last_ocr2_result, filtering, None,
+        orig_text, text = run.process_and_write_results(img, None, last_ocr2_result if not ignore_previous_result else None, filtering, None,
                                                         engine=ocr2, furigana_filter_sensitivity=furigana_filter_sensitivity if not ignore_furigana_filter else 0)
 
         if compare_ocr_results(last_ocr2_result, orig_text):
@@ -344,7 +344,8 @@ def run_oneocr(ocr_config: OCRConfig, rectangles):
                 gsm_ocr_config=ocr_config,
                 screen_capture_areas=screen_areas,
                 furigana_filter_sensitivity=furigana_filter_sensitivity,
-                screen_capture_combo=manual_ocr_hotkey if manual_ocr_hotkey and manual else None)
+                screen_capture_combo=manual_ocr_hotkey if manual_ocr_hotkey and manual else None,
+                keep_line_breaks=keep_newline)
     except Exception as e:
         logger.exception(f"Error running OneOCR: {e}")
     done = True
@@ -359,14 +360,14 @@ def add_ss_hotkey(ss_hotkey="ctrl+shift+g"):
    def capture():
        print("Taking screenshot...")
        img = cropper.run()
-       do_second_ocr("", datetime.now(), img, filtering, ignore_furigana_filter=True)
+       do_second_ocr("", datetime.now(), img, filtering, ignore_furigana_filter=True, ignore_previous_result=True)
    def capture_main_monitor():
        print("Taking screenshot of main monitor...")
        with mss.mss() as sct:
            main_monitor = sct.monitors[1] if len(sct.monitors) > 1 else sct.monitors[0]
            img = sct.grab(main_monitor)
            img_bytes = mss.tools.to_png(img.rgb, img.size)
-           do_second_ocr("", datetime.now(), img_bytes, filtering, ignore_furigana_filter=True)
+           do_second_ocr("", datetime.now(), img_bytes, filtering, ignore_furigana_filter=True, ignore_previous_result=True)
    hotkey_reg = None
    try:
        hotkey_reg = keyboard.add_hotkey(ss_hotkey, capture)
@@ -404,7 +405,7 @@ def set_force_stable_hotkey():
 
 if __name__ == "__main__":
     try:
-        global ocr1, ocr2, twopassocr, language, ss_clipboard, ss, ocr_config, furigana_filter_sensitivity, area_select_ocr_hotkey, window, optimize_second_scan, use_window_for_config
+        global ocr1, ocr2, twopassocr, language, ss_clipboard, ss, ocr_config, furigana_filter_sensitivity, area_select_ocr_hotkey, window, optimize_second_scan, use_window_for_config, keep_newline
         import sys
 
         import argparse
@@ -428,6 +429,7 @@ if __name__ == "__main__":
                             help="Optimize second scan by cropping based on first scan results")
         parser.add_argument("--use_window_for_config", action="store_true",
                             help="Use the specified window for loading OCR configuration")
+        parser.add_argument("--keep_newline", action="store_true", help="Keep new lines in OCR output")
 
         args = parser.parse_args()
 
@@ -446,6 +448,7 @@ if __name__ == "__main__":
         clipboard_output = args.clipboard_output
         optimize_second_scan = args.optimize_second_scan
         use_window_for_config = args.use_window_for_config
+        keep_newline = args.keep_newline
 
         window = None
         logger.info(f"Received arguments: {vars(args)}")
@@ -14,7 +14,6 @@ from urllib.parse import urlparse, parse_qs
 import jaconv
 import numpy as np
 from PIL import Image
-from google.generativeai import GenerationConfig
 from loguru import logger
 import requests
 
@@ -92,8 +91,11 @@ def empty_post_process(text):
     return text
 
 
-def post_process(text):
-    text = ' '.join([''.join(i.split()) for i in text.splitlines()])
+def post_process(text, keep_blank_lines=False):
+    if keep_blank_lines:
+        text = '\n'.join([''.join(i.split()) for i in text.splitlines()])
+    else:
+        text = ''.join([''.join(i.split()) for i in text.splitlines()])
     text = text.replace('…', '...')
     text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
     text = jaconv.h2z(text, ascii=True, digit=True)
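
The reworked post_process now fuses lines with no separator by default (the old version joined them with a space) and, when keep_blank_lines=True, preserves the line structure instead. A minimal sketch of the two code paths, using a hypothetical OCR string (the ellipsis, dot, and jaconv.h2z steps are no-ops for this input):

    # Hypothetical OCR output: stray spaces inside lines, one line break.
    ocr_text = "こん にちは\nせかい"

    post_process(ocr_text)                         # -> "こんにちはせかい"  (lines fused)
    post_process(ocr_text, keep_blank_lines=True)  # -> "こんにちは\nせかい" (breaks kept)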
@@ -305,22 +307,42 @@ class GoogleLens:
         response_proto = LensOverlayServerResponse().FromString(res.content)
         response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)
 
-        # with open(os.path.join(get_temporary_directory(), 'glens_response.json'), 'w', encoding='utf-8') as f:
-        #     json.dump(response_dict, f, indent=4, ensure_ascii=False)
+        with open(os.path.join(r"C:\Users\Beangate\GSM\Electron App\test", 'glens_response.json'), 'w', encoding='utf-8') as f:
+            json.dump(response_dict, f, indent=4, ensure_ascii=False)
         res = ''
         text = response_dict['objects_response']['text']
         skipped = []
-        if furigana_filter_sensitivity > 0:
-            if 'text_layout' in text:
-                for paragraph in text['text_layout']['paragraphs']:
-                    for line in paragraph['lines']:
+        previous_line = None
+        if 'text_layout' in text:
+            for paragraph in text['text_layout']['paragraphs']:
+                if previous_line:
+                    prev_bbox = previous_line['geometry']['bounding_box']
+                    curr_bbox = paragraph['geometry']['bounding_box']
+                    vertical_space = abs(curr_bbox['center_y'] - prev_bbox['center_y']) * img.height
+                    prev_height = prev_bbox['height'] * img.height
+                    current_height = curr_bbox['height'] * img.height
+                    avg_height = (prev_height + current_height) / 2
+                    # If vertical space is close to previous line's height, add a blank line
+                    # logger.info(f"Vertical space: {vertical_space}, Average height: {avg_height}")
+                    # logger.info(avg_height * 2)
+                    if vertical_space > avg_height * 2:
+                        logger.info('Adding blank line')
+                        res += 'BLANK_LINE'
+                for line in paragraph['lines']:
+                    if furigana_filter_sensitivity:
                         if furigana_filter_sensitivity < line['geometry']['bounding_box']['width'] * img.width and furigana_filter_sensitivity < line['geometry']['bounding_box']['height'] * img.height:
                             for word in line['words']:
                                 res += word['plain_text'] + word['text_separator']
                         else:
                             skipped.append(word['plain_text'] for word in line['words'])
                             continue
-                    res += '\n'
+                    else:
+                        for word in line['words']:
+                            res += word['plain_text'] + word['text_separator']
+                else:
+                    continue
+                previous_line = paragraph
+                res += '\n'
         # logger.info(
         #     f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
         # widths = []
@@ -351,16 +373,16 @@ class GoogleLens:
         #     else:
         #         continue
         #     res += '\n'
-        else:
-            if 'text_layout' in text:
-                paragraphs = text['text_layout']['paragraphs']
-                for paragraph in paragraphs:
-                    for line in paragraph['lines']:
-                        for word in line['words']:
-                            res += word['plain_text'] + word['text_separator']
-                    else:
-                        continue
-                    res += '\n'
+        # else:
+        #     if 'text_layout' in text:
+        #         paragraphs = text['text_layout']['paragraphs']
+        #         for paragraph in paragraphs:
+        #             for line in paragraph['lines']:
+        #                 for word in line['words']:
+        #                     res += word['plain_text'] + word['text_separator']
+        #             else:
+        #                 continue
+        #             res += '\n'
 
         x = (True, res)
 
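The new paragraph loop above infers blank lines from Lens geometry: when the vertical gap between consecutive paragraphs' centers exceeds twice their average height, a BLANK_LINE marker is appended (TextFiltering later rewrites it to a real newline; see the hunks below). A worked sketch of the threshold, with hypothetical normalized bounding boxes on a 1000-pixel-tall image:

    img_height = 1000  # hypothetical image height in pixels
    prev_bbox = {'center_y': 0.30, 'height': 0.04}  # hypothetical Lens geometry (normalized)
    curr_bbox = {'center_y': 0.42, 'height': 0.04}

    vertical_space = abs(curr_bbox['center_y'] - prev_bbox['center_y']) * img_height           # 120 px
    avg_height = (prev_bbox['height'] * img_height + curr_bbox['height'] * img_height) / 2     # 40 px

    # 120 px > 2 * 40 px, so a 'BLANK_LINE' marker would be emitted between these paragraphs.
    assert vertical_space > avg_height * 2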
@@ -1128,17 +1150,33 @@ class GeminiOCR:
         # if "google-generativeai" not in sys.modules:
         #     logger.warning('google-generativeai not available, GeminiOCR will not work!')
         # else:
-        import google.generativeai as genai
+        from google import genai
+        from google.genai import types
         try:
             self.api_key = config['api_key']
             if not self.api_key:
                 logger.warning('Gemini API key not provided, GeminiOCR will not work!')
             else:
-                genai.configure(api_key=self.api_key)
-                self.model = genai.GenerativeModel(config['model'], generation_config=GenerationConfig(
+                self.client = genai.Client(api_key=self.api_key)
+                self.model = config['model']
+                self.generation_config = types.GenerateContentConfig(
                     temperature=0.0,
-                    max_output_tokens=300
-                ))
+                    max_output_tokens=300,
+                    safety_settings=[
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                    ],
+                )
+                if "2.5" in self.model:
+                    self.generation_config.thinking_config = types.ThinkingConfig(
+                        thinking_budget=0,
+                    )
                 self.available = True
                 logger.info('Gemini (using google-generativeai) ready')
         except KeyError:
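
This hunk migrates GeminiOCR from the legacy google-generativeai package to the newer google-genai SDK: a genai.Client instance replaces the module-level genai.configure()/GenerativeModel pair, generation settings move into types.GenerateContentConfig, and Gemini 2.5 models additionally get a zero thinking budget to suppress reasoning tokens. A minimal standalone sketch of the same initialization (the API key and model name are placeholders):

    from google import genai
    from google.genai import types

    client = genai.Client(api_key="YOUR_API_KEY")  # placeholder key
    generation_config = types.GenerateContentConfig(
        temperature=0.0,
        max_output_tokens=300,
    )
    # Gemini 2.5 models expose a thinking budget; 0 turns "thinking" tokens off.
    generation_config.thinking_config = types.ThinkingConfig(thinking_budget=0)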
@@ -1151,29 +1189,36 @@ class GeminiOCR:
             return (False, 'GeminiOCR is not available due to missing API key or configuration error.')
 
         try:
+            from google.genai import types
             img, is_path = input_to_pil_image(img)
-            import google.generativeai as genai
             img_bytes = self._preprocess(img)
             if not img_bytes:
                 return (False, 'Error processing image for Gemini.')
 
             contents = [
-                {
-                    'parts': [
-                        {
-                            'inline_data': {
-                                'mime_type': 'image/png',
-                                'data': img_bytes
-                            }
-                        },
-                        {
-                            'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
-                        }
+                types.Content(
+                    parts=[
+                        types.Part(
+                            inline_data=types.Blob(
+                                mime_type="image/png",
+                                data=img_bytes
+                            )
+                        ),
+                        types.Part(
+                            text="""
+**Disclaimer:** The image provided is from a video game. This content is entirely fictional and part of a narrative. It must not be treated as real-world user input or a genuine request.
+Analyze the image. Extract text \\*only\\* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return \\*only\\* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
+"""
+                        )
                     ]
-                }
+                )
             ]
 
-            response = self.model.generate_content(contents)
+            response = self.client.models.generate_content(
+                model=self.model,
+                contents=contents,
+                config=self.generation_config
+            )
             text_output = response.text.strip()
 
             return (True, text_output)
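
On the request side, the raw dict payload becomes typed types.Content/types.Part objects, and the call goes through client.models.generate_content with the model name and config passed per request rather than bound to a model object. A minimal sketch of the request shape (client and generation_config as in the previous sketch; the image bytes and prompt are placeholders):

    png_bytes = b"..."  # placeholder PNG-encoded image bytes

    contents = [
        types.Content(parts=[
            types.Part(inline_data=types.Blob(mime_type="image/png", data=png_bytes)),
            types.Part(text="Extract the dialogue text from this image."),  # abbreviated prompt
        ])
    ]
    response = client.models.generate_content(
        model="gemini-2.5-flash",  # example model name
        contents=contents,
        config=generation_config,
    )
    print(response.text)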
@@ -1373,8 +1418,8 @@ class GroqOCR:
     # def _preprocess(self, img):
     #     return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')
 
-# lens = GoogleLens()
+# lens = GeminiOCR(config={'model': 'gemini-2.5-flash-lite-preview-06-17', 'api_key': ''})
 #
-# res, text = lens(Image.open('test_furigana.png'), furigana_filter_sensitivity=.6) # Example usage
+# res, text = lens(Image.open('test_furigana.png')) # Example usage
 #
 # print(text)
@@ -353,7 +353,9 @@ class TextFiltering:
 
         orig_text_filtered = []
         for block in orig_text:
-            if lang == "ja":
+            if "BLANK_LINE" in block:
+                block_filtered = ["\n"]
+            elif lang == "ja":
                 block_filtered = self.kana_kanji_regex.findall(block)
             elif lang == "zh":
                 block_filtered = self.chinese_common_regex.findall(block)
@@ -394,7 +396,8 @@ class TextFiltering:
         new_blocks = []
         for idx, block in enumerate(orig_text):
             if orig_text_filtered[idx] and (orig_text_filtered[idx] not in last_text):
-                new_blocks.append(block)
+                new_blocks.append(str(block).strip().replace("BLANK_LINE", "\n"))
+
 
         final_blocks = []
         if self.accurate_filtering:
@@ -407,9 +410,10 @@ class TextFiltering:
         else:
             for block in new_blocks:
                 # This only filters out NON JA/ZH from text when lang is JA/ZH
-                if lang not in ["ja", "zh"] or self.classify(block)[0] in ['ja', 'zh']:
+                if lang not in ["ja", "zh"] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n":
                     final_blocks.append(block)
 
+
         text = '\n'.join(final_blocks)
         return text, orig_text_filtered
 
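Across these three TextFiltering hunks the BLANK_LINE marker gets a complete lifecycle: a block containing it passes the per-language filter as a literal newline, the marker is rewritten to "\n" when surviving blocks are collected, and bare "\n" blocks bypass language classification so they reach the final join. A condensed sketch of the marker's path (block values are hypothetical):

    blocks = ["こんにちは", "BLANK_LINEせかい"]  # hypothetical engine output

    # Collection step: strip each block and rewrite the marker into a real newline.
    new_blocks = [str(b).strip().replace("BLANK_LINE", "\n") for b in blocks]
    # -> ["こんにちは", "\nせかい"]

    # Join step: the embedded newline yields a visible blank line in the output.
    print("\n".join(new_blocks))  # "こんにちは\n\nせかい"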
@@ -937,7 +941,7 @@ def process_and_write_results(img_or_path, write_to=None, last_result=None, filt
     if filtering:
         text, orig_text = filtering(text, last_result)
     if lang == "ja" or lang == "zh":
-        text = post_process(text)
+        text = post_process(text, keep_blank_lines=keep_new_lines)
     logger.opt(ansi=True).info(f'Text recognized in {end_time - start_time:0.03f}s using <{engine_color}>{engine_instance.readable_name}</{engine_color}>: {text}')
     if notify and config.get_general('notifications'):
         notifier.send(title='owocr', message='Text recognized: ' + text)
@@ -999,6 +1003,7 @@ def run(read_from=None,
         ocr2=None,
         gsm_ocr_config=None,
         furigana_filter_sensitivity=None,
+        keep_line_breaks=False,
         ):
     """
     Japanese OCR client
@@ -1075,11 +1080,13 @@ def run(read_from=None,
     global engine_instances
     global engine_keys
     global lang
+    global keep_new_lines
     lang = language
     engine_instances = []
     config_engines = []
     engine_keys = []
     default_engine = ''
+    keep_new_lines = keep_line_breaks
 
     if len(config.get_general('engines')) > 0:
         for config_engine in config.get_general('engines').split(','):
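
With this hunk the new flag is plumbed end to end: --keep_newline (argparse) sets keep_newline in the OCR entry point, run_oneocr forwards it as keep_line_breaks into run(), run() publishes it as the module-level keep_new_lines, and process_and_write_results finally hands it to post_process(keep_blank_lines=...). A condensed, self-contained sketch of that chain, assuming the names from the hunks above:

    def post_process(text, keep_blank_lines=False):
        # stand-in for the real post_process shown earlier
        return text if keep_blank_lines else text.replace("\n", "")

    keep_new_lines = False  # module-level flag, published by run()

    def run(keep_line_breaks=False):
        global keep_new_lines
        keep_new_lines = keep_line_breaks

    def process_and_write_results(text):
        return post_process(text, keep_blank_lines=keep_new_lines)

    run(keep_line_breaks=True)  # e.g. driven by the --keep_newline CLI flag
    print(process_and_write_results("line1\nline2"))  # line break survives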
@@ -16,13 +16,13 @@ import toml
 from dataclasses_json import dataclass_json
 
 OFF = 'OFF'
-VOSK = 'VOSK'
+# VOSK = 'VOSK'
 SILERO = 'SILERO'
 WHISPER = 'WHISPER'
-GROQ = 'GROQ'
+# GROQ = 'GROQ'
 
-VOSK_BASE = 'BASE'
-VOSK_SMALL = 'SMALL'
+# VOSK_BASE = 'BASE'
+# VOSK_SMALL = 'SMALL'
 
 WHISPER_TINY = 'tiny'
 WHISPER_BASE = 'base'
@@ -33,6 +33,7 @@ WHISPER_TURBO = 'turbo'
 
 AI_GEMINI = 'Gemini'
 AI_GROQ = 'Groq'
+AI_LOCAL = 'Local'
 
 INFO = 'INFO'
 DEBUG = 'DEBUG'
@@ -219,7 +220,7 @@ class VAD:
     whisper_model: str = WHISPER_BASE
     do_vad_postprocessing: bool = True
     language: str = 'ja'
-    vosk_url: str = VOSK_BASE
+    # vosk_url: str = VOSK_BASE
     selected_vad_model: str = WHISPER
     backup_vad_model: str = SILERO
     trim_beginning: bool = False
@@ -234,11 +235,11 @@ class VAD:
     def is_whisper(self):
         return self.selected_vad_model == WHISPER or self.backup_vad_model == WHISPER
 
-    def is_vosk(self):
-        return self.selected_vad_model == VOSK or self.backup_vad_model == VOSK
+    # def is_vosk(self):
+    #     return self.selected_vad_model == VOSK or self.backup_vad_model == VOSK
 
-    def is_groq(self):
-        return self.selected_vad_model == GROQ or self.backup_vad_model == GROQ
+    # def is_groq(self):
+    #     return self.selected_vad_model == GROQ or self.backup_vad_model == GROQ
 
 
 @dataclass_json
@@ -266,6 +267,7 @@ class Ai:
     anki_field: str = ''
     provider: str = AI_GEMINI
     gemini_model: str = 'gemini-2.5-flash'
+    local_model: str = OFF
     groq_model: str = 'meta-llama/llama-4-scout-17b-16e-instruct'
     api_key: str = ''  # Deprecated
     gemini_api_key: str = ''
@@ -20,6 +20,7 @@ class GameLine:
     next: 'GameLine | None'
     index: int = 0
     scene: str = ""
+    TL: str = ""
 
     def get_previous_time(self):
         if self.prev:
@@ -31,6 +32,9 @@ class GameLine:
             return self.next.time
         return 0
 
+    def set_TL(self, tl: str):
+        self.TL = tl
+
     def __str__(self):
         return str({"text": self.text, "time": self.time})
 
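
Finally, the new TL field and set_TL helper give GameLine a slot for carrying a translation alongside the original text. A minimal usage sketch (the constructor arguments shown are illustrative; the real dataclass has more fields than appear in this diff):

    from datetime import datetime

    line = GameLine(text="こんにちは", time=datetime.now(), prev=None, next=None)  # illustrative args
    line.set_TL("Hello")  # attach a machine or human translation
    print(line.TL)        # -> "Hello"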