GameSentenceMiner 2.10.16__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
GameSentenceMiner/gsm.py CHANGED
@@ -97,35 +97,37 @@ class VideoToAudioHandler(FileSystemEventHandler):
                 skip_delete = True
                 return
 
-        mined_line = get_text_event(last_note)
-        gsm_state.last_mined_line = mined_line
-        if os.path.exists(video_path) and os.access(video_path, os.R_OK):
-            logger.debug(f"Video found and is readable: {video_path}")
-            if get_config().obs.minimum_replay_size and not ffmpeg.is_video_big_enough(video_path,
-                                                                                       get_config().obs.minimum_replay_size):
-                logger.debug("Checking if video is big enough")
-                notification.send_check_obs_notification(reason="Video may be empty, check scene in OBS.")
-                logger.error(
-                    f"Video was unusually small, potentially empty! Check OBS for Correct Scene Settings! Path: {video_path}")
-                return
-
         # Just for safety
         if not last_note:
             if get_config().anki.update_anki:
                 last_note = anki.get_last_anki_card()
             if get_config().features.backfill_audio:
                 last_note = anki.get_cards_by_sentence(gametext.current_line_after_regex)
+
+        # Get Info of line mined
         line_cutoff = None
         start_line = None
-        if mined_line:
-            start_line = mined_line
-            if mined_line.next:
-                line_cutoff = mined_line.next.time
-
         if selected_lines:
             start_line = selected_lines[0]
             mined_line = get_mined_line(last_note, selected_lines)
             line_cutoff = selected_lines[-1].get_next_time()
+        else:
+            mined_line = get_text_event(last_note)
+            if mined_line:
+                start_line = mined_line
+                if mined_line.next:
+                    line_cutoff = mined_line.next.time
+        gsm_state.last_mined_line = mined_line
+
+        if os.path.exists(video_path) and os.access(video_path, os.R_OK):
+            logger.debug(f"Video found and is readable: {video_path}")
+            if get_config().obs.minimum_replay_size and not ffmpeg.is_video_big_enough(video_path,
+                                                                                       get_config().obs.minimum_replay_size):
+                logger.debug("Checking if video is big enough")
+                notification.send_check_obs_notification(reason="Video may be empty, check scene in OBS.")
+                logger.error(
+                    f"Video was unusually small, potentially empty! Check OBS for Correct Scene Settings! Path: {video_path}")
+                return
 
         if last_note:
             logger.debug(last_note.to_json())
@@ -195,10 +195,10 @@ all_cords = None
 rectangles = None
 last_ocr2_result = []
 
-def do_second_ocr(ocr1_text, time, img, filtering, ignore_furigana_filter=False):
+def do_second_ocr(ocr1_text, time, img, filtering, ignore_furigana_filter=False, ignore_previous_result=False):
     global twopassocr, ocr2, last_ocr2_result
     try:
-        orig_text, text = run.process_and_write_results(img, None, last_ocr2_result, filtering, None,
+        orig_text, text = run.process_and_write_results(img, None, last_ocr2_result if not ignore_previous_result else None, filtering, None,
                                                         engine=ocr2, furigana_filter_sensitivity=furigana_filter_sensitivity if not ignore_furigana_filter else 0)
 
         if compare_ocr_results(last_ocr2_result, orig_text):
@@ -344,7 +344,8 @@ def run_oneocr(ocr_config: OCRConfig, rectangles):
                 gsm_ocr_config=ocr_config,
                 screen_capture_areas=screen_areas,
                 furigana_filter_sensitivity=furigana_filter_sensitivity,
-                screen_capture_combo=manual_ocr_hotkey if manual_ocr_hotkey and manual else None)
+                screen_capture_combo=manual_ocr_hotkey if manual_ocr_hotkey and manual else None,
+                keep_line_breaks=keep_newline)
     except Exception as e:
         logger.exception(f"Error running OneOCR: {e}")
     done = True
@@ -359,14 +360,14 @@ def add_ss_hotkey(ss_hotkey="ctrl+shift+g"):
    def capture():
        print("Taking screenshot...")
        img = cropper.run()
-       do_second_ocr("", datetime.now(), img, filtering, ignore_furigana_filter=True)
+       do_second_ocr("", datetime.now(), img, filtering, ignore_furigana_filter=True, ignore_previous_result=True)
    def capture_main_monitor():
        print("Taking screenshot of main monitor...")
        with mss.mss() as sct:
            main_monitor = sct.monitors[1] if len(sct.monitors) > 1 else sct.monitors[0]
            img = sct.grab(main_monitor)
            img_bytes = mss.tools.to_png(img.rgb, img.size)
-           do_second_ocr("", datetime.now(), img_bytes, filtering, ignore_furigana_filter=True)
+           do_second_ocr("", datetime.now(), img_bytes, filtering, ignore_furigana_filter=True, ignore_previous_result=True)
    hotkey_reg = None
    try:
        hotkey_reg = keyboard.add_hotkey(ss_hotkey, capture)
@@ -404,7 +405,7 @@ def set_force_stable_hotkey():
 
 if __name__ == "__main__":
     try:
-        global ocr1, ocr2, twopassocr, language, ss_clipboard, ss, ocr_config, furigana_filter_sensitivity, area_select_ocr_hotkey, window, optimize_second_scan, use_window_for_config
+        global ocr1, ocr2, twopassocr, language, ss_clipboard, ss, ocr_config, furigana_filter_sensitivity, area_select_ocr_hotkey, window, optimize_second_scan, use_window_for_config, keep_newline
         import sys
 
         import argparse
@@ -428,6 +429,7 @@ if __name__ == "__main__":
                             help="Optimize second scan by cropping based on first scan results")
         parser.add_argument("--use_window_for_config", action="store_true",
                             help="Use the specified window for loading OCR configuration")
+        parser.add_argument("--keep_newline", action="store_true", help="Keep new lines in OCR output")
 
         args = parser.parse_args()
 
@@ -446,6 +448,7 @@ if __name__ == "__main__":
         clipboard_output = args.clipboard_output
         optimize_second_scan = args.optimize_second_scan
         use_window_for_config = args.use_window_for_config
+        keep_newline = args.keep_newline
 
         window = None
         logger.info(f"Received arguments: {vars(args)}")
@@ -14,7 +14,6 @@ from urllib.parse import urlparse, parse_qs
 import jaconv
 import numpy as np
 from PIL import Image
-from google.generativeai import GenerationConfig
 from loguru import logger
 import requests
 
@@ -92,8 +91,11 @@ def empty_post_process(text):
     return text
 
 
-def post_process(text):
-    text = ' '.join([''.join(i.split()) for i in text.splitlines()])
+def post_process(text, keep_blank_lines=False):
+    if keep_blank_lines:
+        text = '\n'.join([''.join(i.split()) for i in text.splitlines()])
+    else:
+        text = ''.join([''.join(i.split()) for i in text.splitlines()])
     text = text.replace('…', '...')
     text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
     text = jaconv.h2z(text, ascii=True, digit=True)
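
The reworked post_process now fuses lines with no separator by default (the old version joined them with a space) and, when keep_blank_lines=True, preserves the line structure instead. A minimal sketch of the two code paths, using a hypothetical OCR string (the ellipsis, dot, and jaconv.h2z steps are no-ops for this input):

    # Hypothetical OCR output: stray spaces inside lines, one line break.
    ocr_text = "こん にちは\nせかい"

    post_process(ocr_text)                         # -> "こんにちはせかい"  (lines fused)
    post_process(ocr_text, keep_blank_lines=True)  # -> "こんにちは\nせかい" (breaks kept)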
@@ -305,22 +307,42 @@ class GoogleLens:
         response_proto = LensOverlayServerResponse().FromString(res.content)
         response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)
 
-        # with open(os.path.join(get_temporary_directory(), 'glens_response.json'), 'w', encoding='utf-8') as f:
-        #     json.dump(response_dict, f, indent=4, ensure_ascii=False)
+        with open(os.path.join(r"C:\Users\Beangate\GSM\Electron App\test", 'glens_response.json'), 'w', encoding='utf-8') as f:
+            json.dump(response_dict, f, indent=4, ensure_ascii=False)
         res = ''
         text = response_dict['objects_response']['text']
         skipped = []
-        if furigana_filter_sensitivity > 0:
-            if 'text_layout' in text:
-                for paragraph in text['text_layout']['paragraphs']:
-                    for line in paragraph['lines']:
+        previous_line = None
+        if 'text_layout' in text:
+            for paragraph in text['text_layout']['paragraphs']:
+                if previous_line:
+                    prev_bbox = previous_line['geometry']['bounding_box']
+                    curr_bbox = paragraph['geometry']['bounding_box']
+                    vertical_space = abs(curr_bbox['center_y'] - prev_bbox['center_y']) * img.height
+                    prev_height = prev_bbox['height'] * img.height
+                    current_height = curr_bbox['height'] * img.height
+                    avg_height = (prev_height + current_height) / 2
+                    # If vertical space is close to previous line's height, add a blank line
+                    # logger.info(f"Vertical space: {vertical_space}, Average height: {avg_height}")
+                    # logger.info(avg_height * 2)
+                    if vertical_space > avg_height * 2:
+                        logger.info('Adding blank line')
+                        res += 'BLANK_LINE'
+                for line in paragraph['lines']:
+                    if furigana_filter_sensitivity:
                         if furigana_filter_sensitivity < line['geometry']['bounding_box']['width'] * img.width and furigana_filter_sensitivity < line['geometry']['bounding_box']['height'] * img.height:
                             for word in line['words']:
                                 res += word['plain_text'] + word['text_separator']
                         else:
                             skipped.append(word['plain_text'] for word in line['words'])
                             continue
-                    res += '\n'
+                    else:
+                        for word in line['words']:
+                            res += word['plain_text'] + word['text_separator']
+                else:
+                    continue
+                previous_line = paragraph
+                res += '\n'
         # logger.info(
         #     f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
         # widths = []
@@ -351,16 +373,16 @@ class GoogleLens:
         #     else:
         #         continue
         #     res += '\n'
-        else:
-            if 'text_layout' in text:
-                paragraphs = text['text_layout']['paragraphs']
-                for paragraph in paragraphs:
-                    for line in paragraph['lines']:
-                        for word in line['words']:
-                            res += word['plain_text'] + word['text_separator']
-                    else:
-                        continue
-                    res += '\n'
+        # else:
+        #     if 'text_layout' in text:
+        #         paragraphs = text['text_layout']['paragraphs']
+        #         for paragraph in paragraphs:
+        #             for line in paragraph['lines']:
+        #                 for word in line['words']:
+        #                     res += word['plain_text'] + word['text_separator']
+        #             else:
+        #                 continue
+        #             res += '\n'
 
         x = (True, res)
 
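The new paragraph loop above infers blank lines from Lens geometry: when the vertical gap between consecutive paragraphs' centers exceeds twice their average height, a BLANK_LINE marker is appended (TextFiltering later rewrites it to a real newline; see the hunks below). A worked sketch of the threshold, with hypothetical normalized bounding boxes on a 1000-pixel-tall image:

    img_height = 1000  # hypothetical image height in pixels
    prev_bbox = {'center_y': 0.30, 'height': 0.04}  # hypothetical Lens geometry (normalized)
    curr_bbox = {'center_y': 0.42, 'height': 0.04}

    vertical_space = abs(curr_bbox['center_y'] - prev_bbox['center_y']) * img_height           # 120 px
    avg_height = (prev_bbox['height'] * img_height + curr_bbox['height'] * img_height) / 2     # 40 px

    # 120 px > 2 * 40 px, so a 'BLANK_LINE' marker would be emitted between these paragraphs.
    assert vertical_space > avg_height * 2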
@@ -1128,17 +1150,33 @@ class GeminiOCR:
         # if "google-generativeai" not in sys.modules:
         #     logger.warning('google-generativeai not available, GeminiOCR will not work!')
         # else:
-        import google.generativeai as genai
+        from google import genai
+        from google.genai import types
         try:
             self.api_key = config['api_key']
             if not self.api_key:
                 logger.warning('Gemini API key not provided, GeminiOCR will not work!')
             else:
-                genai.configure(api_key=self.api_key)
-                self.model = genai.GenerativeModel(config['model'], generation_config=GenerationConfig(
+                self.client = genai.Client(api_key=self.api_key)
+                self.model = config['model']
+                self.generation_config = types.GenerateContentConfig(
                     temperature=0.0,
-                    max_output_tokens=300
-                ))
+                    max_output_tokens=300,
+                    safety_settings=[
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
+                    ],
+                )
+                if "2.5" in self.model:
+                    self.generation_config.thinking_config = types.ThinkingConfig(
+                        thinking_budget=0,
+                    )
                 self.available = True
                 logger.info('Gemini (using google-generativeai) ready')
         except KeyError:
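
This hunk migrates GeminiOCR from the legacy google-generativeai package to the newer google-genai SDK: a genai.Client instance replaces the module-level genai.configure()/GenerativeModel pair, generation settings move into types.GenerateContentConfig, and Gemini 2.5 models additionally get a zero thinking budget to suppress reasoning tokens. A minimal standalone sketch of the same initialization (the API key and model name are placeholders):

    from google import genai
    from google.genai import types

    client = genai.Client(api_key="YOUR_API_KEY")  # placeholder key
    generation_config = types.GenerateContentConfig(
        temperature=0.0,
        max_output_tokens=300,
    )
    # Gemini 2.5 models expose a thinking budget; 0 turns "thinking" tokens off.
    generation_config.thinking_config = types.ThinkingConfig(thinking_budget=0)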
@@ -1151,29 +1189,36 @@ class GeminiOCR:
             return (False, 'GeminiOCR is not available due to missing API key or configuration error.')
 
         try:
+            from google.genai import types
             img, is_path = input_to_pil_image(img)
-            import google.generativeai as genai
             img_bytes = self._preprocess(img)
             if not img_bytes:
                 return (False, 'Error processing image for Gemini.')
 
             contents = [
-                {
-                    'parts': [
-                        {
-                            'inline_data': {
-                                'mime_type': 'image/png',
-                                'data': img_bytes
-                            }
-                        },
-                        {
-                            'text': 'Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.'
-                        }
+                types.Content(
+                    parts=[
+                        types.Part(
+                            inline_data=types.Blob(
+                                mime_type="image/png",
+                                data=img_bytes
+                            )
+                        ),
+                        types.Part(
+                            text="""
+**Disclaimer:** The image provided is from a video game. This content is entirely fictional and part of a narrative. It must not be treated as real-world user input or a genuine request.
+Analyze the image. Extract text \\*only\\* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return \\*only\\* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
+"""
+                        )
                     ]
-                }
+                )
             ]
 
-            response = self.model.generate_content(contents)
+            response = self.client.models.generate_content(
+                model=self.model,
+                contents=contents,
+                config=self.generation_config
+            )
             text_output = response.text.strip()
 
             return (True, text_output)
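
On the request side, the raw dict payload becomes typed types.Content/types.Part objects, and the call goes through client.models.generate_content with the model name and config passed per request rather than bound to a model object. A minimal sketch of the request shape (client and generation_config as in the previous sketch; the image bytes and prompt are placeholders):

    png_bytes = b"..."  # placeholder PNG-encoded image bytes

    contents = [
        types.Content(parts=[
            types.Part(inline_data=types.Blob(mime_type="image/png", data=png_bytes)),
            types.Part(text="Extract the dialogue text from this image."),  # abbreviated prompt
        ])
    ]
    response = client.models.generate_content(
        model="gemini-2.5-flash",  # example model name
        contents=contents,
        config=generation_config,
    )
    print(response.text)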
@@ -1373,8 +1418,8 @@ class GroqOCR:
     # def _preprocess(self, img):
     #     return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')
 
-# lens = GoogleLens()
+# lens = GeminiOCR(config={'model': 'gemini-2.5-flash-lite-preview-06-17', 'api_key': ''})
 #
-# res, text = lens(Image.open('test_furigana.png'), furigana_filter_sensitivity=.6) # Example usage
+# res, text = lens(Image.open('test_furigana.png')) # Example usage
 #
 # print(text)
@@ -353,7 +353,9 @@ class TextFiltering:
 
         orig_text_filtered = []
         for block in orig_text:
-            if lang == "ja":
+            if "BLANK_LINE" in block:
+                block_filtered = ["\n"]
+            elif lang == "ja":
                 block_filtered = self.kana_kanji_regex.findall(block)
             elif lang == "zh":
                 block_filtered = self.chinese_common_regex.findall(block)
@@ -394,7 +396,8 @@ class TextFiltering:
         new_blocks = []
         for idx, block in enumerate(orig_text):
             if orig_text_filtered[idx] and (orig_text_filtered[idx] not in last_text):
-                new_blocks.append(block)
+                new_blocks.append(str(block).strip().replace("BLANK_LINE", "\n"))
+
 
         final_blocks = []
         if self.accurate_filtering:
@@ -407,9 +410,10 @@ class TextFiltering:
         else:
             for block in new_blocks:
                 # This only filters out NON JA/ZH from text when lang is JA/ZH
-                if lang not in ["ja", "zh"] or self.classify(block)[0] in ['ja', 'zh']:
+                if lang not in ["ja", "zh"] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n":
                     final_blocks.append(block)
 
+
         text = '\n'.join(final_blocks)
         return text, orig_text_filtered
 
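Across these three TextFiltering hunks the BLANK_LINE marker gets a complete lifecycle: a block containing it passes the per-language filter as a literal newline, the marker is rewritten to "\n" when surviving blocks are collected, and bare "\n" blocks bypass language classification so they reach the final join. A condensed sketch of the marker's path (block values are hypothetical):

    blocks = ["こんにちは", "BLANK_LINEせかい"]  # hypothetical engine output

    # Collection step: strip each block and rewrite the marker into a real newline.
    new_blocks = [str(b).strip().replace("BLANK_LINE", "\n") for b in blocks]
    # -> ["こんにちは", "\nせかい"]

    # Join step: the embedded newline yields a visible blank line in the output.
    print("\n".join(new_blocks))  # "こんにちは\n\nせかい"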
@@ -937,7 +941,7 @@ def process_and_write_results(img_or_path, write_to=None, last_result=None, filt
     if filtering:
         text, orig_text = filtering(text, last_result)
     if lang == "ja" or lang == "zh":
-        text = post_process(text)
+        text = post_process(text, keep_blank_lines=keep_new_lines)
     logger.opt(ansi=True).info(f'Text recognized in {end_time - start_time:0.03f}s using <{engine_color}>{engine_instance.readable_name}</{engine_color}>: {text}')
     if notify and config.get_general('notifications'):
         notifier.send(title='owocr', message='Text recognized: ' + text)
@@ -999,6 +1003,7 @@ def run(read_from=None,
         ocr2=None,
         gsm_ocr_config=None,
         furigana_filter_sensitivity=None,
+        keep_line_breaks=False,
         ):
     """
     Japanese OCR client
@@ -1075,11 +1080,13 @@ def run(read_from=None,
     global engine_instances
     global engine_keys
     global lang
+    global keep_new_lines
     lang = language
     engine_instances = []
     config_engines = []
     engine_keys = []
     default_engine = ''
+    keep_new_lines = keep_line_breaks
 
     if len(config.get_general('engines')) > 0:
         for config_engine in config.get_general('engines').split(','):
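
With this hunk the new flag is plumbed end to end: --keep_newline (argparse) sets keep_newline in the OCR entry point, run_oneocr forwards it as keep_line_breaks into run(), run() publishes it as the module-level keep_new_lines, and process_and_write_results finally hands it to post_process(keep_blank_lines=...). A condensed, self-contained sketch of that chain, assuming the names from the hunks above:

    def post_process(text, keep_blank_lines=False):
        # stand-in for the real post_process shown earlier
        return text if keep_blank_lines else text.replace("\n", "")

    keep_new_lines = False  # module-level flag, published by run()

    def run(keep_line_breaks=False):
        global keep_new_lines
        keep_new_lines = keep_line_breaks

    def process_and_write_results(text):
        return post_process(text, keep_blank_lines=keep_new_lines)

    run(keep_line_breaks=True)  # e.g. driven by the --keep_newline CLI flag
    print(process_and_write_results("line1\nline2"))  # line break survives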
@@ -16,13 +16,13 @@ import toml
 from dataclasses_json import dataclass_json
 
 OFF = 'OFF'
-VOSK = 'VOSK'
+# VOSK = 'VOSK'
 SILERO = 'SILERO'
 WHISPER = 'WHISPER'
-GROQ = 'GROQ'
+# GROQ = 'GROQ'
 
-VOSK_BASE = 'BASE'
-VOSK_SMALL = 'SMALL'
+# VOSK_BASE = 'BASE'
+# VOSK_SMALL = 'SMALL'
 
 WHISPER_TINY = 'tiny'
 WHISPER_BASE = 'base'
@@ -33,6 +33,7 @@ WHISPER_TURBO = 'turbo'
 
 AI_GEMINI = 'Gemini'
 AI_GROQ = 'Groq'
+AI_LOCAL = 'Local'
 
 INFO = 'INFO'
 DEBUG = 'DEBUG'
@@ -219,7 +220,7 @@ class VAD:
     whisper_model: str = WHISPER_BASE
     do_vad_postprocessing: bool = True
     language: str = 'ja'
-    vosk_url: str = VOSK_BASE
+    # vosk_url: str = VOSK_BASE
     selected_vad_model: str = WHISPER
     backup_vad_model: str = SILERO
     trim_beginning: bool = False
@@ -234,11 +235,11 @@ class VAD:
     def is_whisper(self):
         return self.selected_vad_model == WHISPER or self.backup_vad_model == WHISPER
 
-    def is_vosk(self):
-        return self.selected_vad_model == VOSK or self.backup_vad_model == VOSK
+    # def is_vosk(self):
+    #     return self.selected_vad_model == VOSK or self.backup_vad_model == VOSK
 
-    def is_groq(self):
-        return self.selected_vad_model == GROQ or self.backup_vad_model == GROQ
+    # def is_groq(self):
+    #     return self.selected_vad_model == GROQ or self.backup_vad_model == GROQ
 
 
 @dataclass_json
@@ -266,6 +267,7 @@ class Ai:
     anki_field: str = ''
     provider: str = AI_GEMINI
     gemini_model: str = 'gemini-2.5-flash'
+    local_model: str = OFF
     groq_model: str = 'meta-llama/llama-4-scout-17b-16e-instruct'
     api_key: str = ''  # Deprecated
     gemini_api_key: str = ''
@@ -20,6 +20,7 @@ class GameLine:
     next: 'GameLine | None'
     index: int = 0
     scene: str = ""
+    TL: str = ""
 
     def get_previous_time(self):
         if self.prev:
@@ -31,6 +32,9 @@ class GameLine:
             return self.next.time
         return 0
 
+    def set_TL(self, tl: str):
+        self.TL = tl
+
     def __str__(self):
         return str({"text": self.text, "time": self.time})
 
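
Finally, the new TL field and set_TL helper give GameLine a slot for carrying a translation alongside the original text. A minimal usage sketch (the constructor arguments shown are illustrative; the real dataclass has more fields than appear in this diff):

    from datetime import datetime

    line = GameLine(text="こんにちは", time=datetime.now(), prev=None, next=None)  # illustrative args
    line.set_TL("Hello")  # attach a machine or human translation
    print(line.TL)        # -> "Hello"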