pygpt-net 2.7.4__py3-none-any.whl → 2.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pygpt_net/CHANGELOG.txt +15 -0
- pygpt_net/__init__.py +4 -4
- pygpt_net/app_core.py +4 -2
- pygpt_net/controller/__init__.py +5 -1
- pygpt_net/controller/assistant/assistant.py +1 -4
- pygpt_net/controller/assistant/batch.py +5 -504
- pygpt_net/controller/assistant/editor.py +5 -5
- pygpt_net/controller/assistant/files.py +16 -16
- pygpt_net/controller/chat/handler/google_stream.py +307 -1
- pygpt_net/controller/chat/handler/worker.py +10 -25
- pygpt_net/controller/chat/handler/xai_stream.py +621 -52
- pygpt_net/controller/chat/image.py +2 -2
- pygpt_net/controller/debug/fixtures.py +3 -2
- pygpt_net/controller/dialogs/confirm.py +73 -101
- pygpt_net/controller/files/files.py +65 -4
- pygpt_net/controller/lang/mapping.py +9 -9
- pygpt_net/controller/painter/capture.py +50 -1
- pygpt_net/controller/presets/presets.py +2 -1
- pygpt_net/controller/remote_store/__init__.py +12 -0
- pygpt_net/{provider/core/assistant_file/db_sqlite → controller/remote_store/google}/__init__.py +2 -2
- pygpt_net/controller/remote_store/google/batch.py +402 -0
- pygpt_net/controller/remote_store/google/store.py +615 -0
- pygpt_net/controller/remote_store/openai/__init__.py +12 -0
- pygpt_net/controller/remote_store/openai/batch.py +524 -0
- pygpt_net/controller/{assistant → remote_store/openai}/store.py +63 -60
- pygpt_net/controller/remote_store/remote_store.py +35 -0
- pygpt_net/controller/ui/ui.py +20 -1
- pygpt_net/core/assistants/assistants.py +3 -15
- pygpt_net/core/db/database.py +5 -3
- pygpt_net/core/filesystem/url.py +4 -1
- pygpt_net/core/locale/placeholder.py +35 -0
- pygpt_net/core/remote_store/__init__.py +12 -0
- pygpt_net/core/remote_store/google/__init__.py +11 -0
- pygpt_net/core/remote_store/google/files.py +224 -0
- pygpt_net/core/remote_store/google/store.py +248 -0
- pygpt_net/core/remote_store/openai/__init__.py +11 -0
- pygpt_net/core/{assistants → remote_store/openai}/files.py +26 -19
- pygpt_net/core/{assistants → remote_store/openai}/store.py +32 -15
- pygpt_net/core/remote_store/remote_store.py +24 -0
- pygpt_net/core/render/web/body.py +3 -2
- pygpt_net/core/types/chunk.py +27 -0
- pygpt_net/data/config/config.json +8 -4
- pygpt_net/data/config/models.json +77 -3
- pygpt_net/data/config/settings.json +45 -0
- pygpt_net/data/js/app/template.js +1 -1
- pygpt_net/data/js/app.min.js +2 -2
- pygpt_net/data/locale/locale.de.ini +44 -41
- pygpt_net/data/locale/locale.en.ini +56 -43
- pygpt_net/data/locale/locale.es.ini +44 -41
- pygpt_net/data/locale/locale.fr.ini +44 -41
- pygpt_net/data/locale/locale.it.ini +44 -41
- pygpt_net/data/locale/locale.pl.ini +45 -42
- pygpt_net/data/locale/locale.uk.ini +44 -41
- pygpt_net/data/locale/locale.zh.ini +44 -41
- pygpt_net/data/locale/plugin.cmd_history.de.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.en.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.es.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.fr.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.it.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.pl.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.uk.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_history.zh.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_mouse_control.en.ini +14 -0
- pygpt_net/data/locale/plugin.cmd_web.de.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.en.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.es.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.fr.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.it.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.pl.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.uk.ini +1 -1
- pygpt_net/data/locale/plugin.cmd_web.zh.ini +1 -1
- pygpt_net/data/locale/plugin.idx_llama_index.de.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.en.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.es.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.fr.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.it.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.pl.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.uk.ini +2 -2
- pygpt_net/data/locale/plugin.idx_llama_index.zh.ini +2 -2
- pygpt_net/item/assistant.py +1 -211
- pygpt_net/item/ctx.py +3 -3
- pygpt_net/item/store.py +238 -0
- pygpt_net/js_rc.py +2449 -2447
- pygpt_net/migrations/Version20260102190000.py +35 -0
- pygpt_net/migrations/__init__.py +3 -1
- pygpt_net/plugin/cmd_mouse_control/config.py +471 -1
- pygpt_net/plugin/cmd_mouse_control/plugin.py +487 -22
- pygpt_net/plugin/cmd_mouse_control/worker.py +464 -87
- pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +729 -0
- pygpt_net/plugin/idx_llama_index/config.py +2 -2
- pygpt_net/provider/api/anthropic/__init__.py +10 -8
- pygpt_net/provider/api/google/__init__.py +21 -58
- pygpt_net/provider/api/google/chat.py +545 -129
- pygpt_net/provider/api/google/computer.py +190 -0
- pygpt_net/provider/api/google/realtime/realtime.py +2 -2
- pygpt_net/provider/api/google/remote_tools.py +93 -0
- pygpt_net/provider/api/google/store.py +546 -0
- pygpt_net/provider/api/google/worker/__init__.py +0 -0
- pygpt_net/provider/api/google/worker/importer.py +392 -0
- pygpt_net/provider/api/openai/__init__.py +7 -3
- pygpt_net/provider/api/openai/computer.py +10 -1
- pygpt_net/provider/api/openai/responses.py +0 -0
- pygpt_net/provider/api/openai/store.py +6 -6
- pygpt_net/provider/api/openai/worker/importer.py +24 -24
- pygpt_net/provider/api/x_ai/__init__.py +10 -9
- pygpt_net/provider/api/x_ai/chat.py +272 -102
- pygpt_net/provider/core/config/patch.py +16 -1
- pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +3 -3
- pygpt_net/provider/core/model/patch.py +17 -3
- pygpt_net/provider/core/preset/json_file.py +13 -7
- pygpt_net/provider/core/{assistant_file → remote_file}/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_file → remote_file}/base.py +9 -9
- pygpt_net/provider/core/remote_file/db_sqlite/__init__.py +12 -0
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/patch.py +1 -1
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/provider.py +23 -20
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/storage.py +35 -27
- pygpt_net/provider/core/{assistant_file → remote_file}/db_sqlite/utils.py +5 -4
- pygpt_net/provider/core/{assistant_store → remote_store}/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/base.py +10 -10
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/__init__.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/patch.py +1 -1
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/provider.py +16 -15
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/storage.py +30 -23
- pygpt_net/provider/core/{assistant_store → remote_store}/db_sqlite/utils.py +5 -4
- pygpt_net/provider/core/{assistant_store → remote_store}/json_file.py +9 -9
- pygpt_net/provider/llms/google.py +2 -2
- pygpt_net/tools/image_viewer/ui/dialogs.py +298 -12
- pygpt_net/tools/text_editor/ui/widgets.py +5 -1
- pygpt_net/ui/base/config_dialog.py +3 -2
- pygpt_net/ui/base/context_menu.py +44 -1
- pygpt_net/ui/dialog/assistant.py +3 -3
- pygpt_net/ui/dialog/plugins.py +3 -1
- pygpt_net/ui/dialog/remote_store_google.py +539 -0
- pygpt_net/ui/dialog/{assistant_store.py → remote_store_openai.py} +95 -95
- pygpt_net/ui/dialogs.py +5 -3
- pygpt_net/ui/layout/chat/attachments_uploaded.py +3 -3
- pygpt_net/ui/layout/toolbox/computer_env.py +26 -8
- pygpt_net/ui/layout/toolbox/indexes.py +22 -19
- pygpt_net/ui/layout/toolbox/model.py +28 -5
- pygpt_net/ui/menu/tools.py +13 -5
- pygpt_net/ui/widget/dialog/remote_store_google.py +56 -0
- pygpt_net/ui/widget/dialog/{assistant_store.py → remote_store_openai.py} +9 -9
- pygpt_net/ui/widget/element/button.py +4 -4
- pygpt_net/ui/widget/image/display.py +25 -8
- pygpt_net/ui/widget/lists/remote_store_google.py +248 -0
- pygpt_net/ui/widget/lists/{assistant_store.py → remote_store_openai.py} +21 -21
- pygpt_net/ui/widget/option/checkbox_list.py +47 -9
- pygpt_net/ui/widget/option/combo.py +39 -3
- pygpt_net/ui/widget/tabs/output.py +9 -1
- pygpt_net/ui/widget/textarea/editor.py +14 -1
- pygpt_net/ui/widget/textarea/input.py +20 -7
- pygpt_net/ui/widget/textarea/notepad.py +24 -1
- pygpt_net/ui/widget/textarea/output.py +23 -1
- pygpt_net/ui/widget/textarea/web.py +16 -1
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/METADATA +41 -2
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/RECORD +158 -132
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/LICENSE +0 -0
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/WHEEL +0 -0
- {pygpt_net-2.7.4.dist-info → pygpt_net-2.7.6.dist-info}/entry_points.txt +0 -0
--- a/pygpt_net/provider/api/google/chat.py
+++ b/pygpt_net/provider/api/google/chat.py
@@ -6,15 +6,16 @@
 # GitHub: https://github.com/szczyglis-dev/py-gpt #
 # MIT License #
 # Created By : Marcin Szczygliński #
-# Updated Date:
+# Updated Date: 2026.01.03 17:00:00 #
 # ================================================== #
 
-
+import os
+from typing import Optional, Dict, Any, List, Tuple
 
 from google.genai import types as gtypes
 from google.genai.types import Content, Part
 
-from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO, MODE_COMPUTER, MODE_RESEARCH
 from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
 from pygpt_net.item.attachment import AttachmentItem
 from pygpt_net.item.ctx import CtxItem
@@ -35,7 +36,7 @@ class Chat:
             extra: Optional[Dict[str, Any]] = None
     ):
         """
-        Call Google GenAI for chat / multimodal / audio.
+        Call Google GenAI for chat / multimodal / audio / computer use.
 
         :param context: BridgeContext with prompt, model, history, mode, etc.
         :param extra: Extra parameters (not used currently)
@@ -62,7 +63,6 @@ class Chat:
 
         # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
         if mode == MODE_AUDIO and has_audio_input:
-            # Build minimal transcription request: [instruction text, audio part]
             transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
             transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
             audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
@@ -73,13 +73,10 @@ class Chat:
                 ])
             ]
             trans_cfg = gtypes.GenerateContentConfig(
-                # Keep minimal; no tools/system for transcription
                 temperature=self.window.core.config.get('temperature'),
                 top_p=self.window.core.config.get('top_p'),
                 max_output_tokens=context.max_tokens if context.max_tokens else None,
             )
-
-            # Always non-stream here (we immediately need the text for TTS)
             trans_resp = client.models.generate_content(
                 model=transcribe_model,
                 contents=trans_inputs,
@@ -87,20 +84,17 @@ class Chat:
             )
             transcribed_text = self.extract_text(trans_resp).strip()
             if transcribed_text:
-                # Feed transcription into TTS as the final prompt
                 prompt = transcribed_text
                 ctx.input = transcribed_text
                 try:
-                    # optional: store for debugging/UX
                     if isinstance(ctx.extra, dict):
                         ctx.extra["transcription"] = transcribed_text
                 except Exception:
                     pass
-                ctx.is_audio = False
-                multimodal_ctx.is_audio_input = False
+            ctx.is_audio = False
+            multimodal_ctx.is_audio_input = False
 
-        # ---------------------- REGULAR CHAT PATH
-        # Build contents for chat/multimodal (will be overridden for TTS below)
+        # ---------------------- REGULAR CHAT/COMPUTER PATH ----------------------
         inputs = self.build_input(
             prompt=prompt,
             system_prompt=system_prompt,
@@ -108,23 +102,35 @@ class Chat:
             history=context.history,
             attachments=attachments,
             multimodal_ctx=multimodal_ctx,
+            mode=mode,
         )
 
         # Best-effort input tokens estimate
         self.reset_tokens()
-        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+        count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history, mode)
         self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
 
         # Tools -> merge app-defined tools with remote tools
         base_tools = self.window.core.api.google.tools.prepare(model, functions)
-        remote_tools = self.window.core.api.google.build_remote_tools(model)
+        remote_tools = self.window.core.api.google.remote_tools.build_remote_tools(model)
 
-        #
+        # Note: Combining native (remote) tools with function declarations is documented as Live API-only.
         if base_tools:
-            remote_tools = []
+            remote_tools = []
         tools = (base_tools or []) + (remote_tools or [])
-
-
+
+        # Enable Computer Use tool in computer mode (use the official Tool/ComputerUse object)
+        if mode == MODE_COMPUTER or (model and isinstance(model.id, str) and "computer-use" in model.id.lower()):
+            comp_env = gtypes.Environment.ENVIRONMENT_BROWSER
+            tools = [gtypes.Tool(
+                computer_use=gtypes.ComputerUse(
+                    environment=comp_env,
+                )
+            )] # reset tools to only Computer Use (multiple tools not supported together)
+
+        # Some models cannot use tools; keep behavior for image-only models
+        if model and isinstance(model.id, str) and "-image" in model.id:
+            tools = None
 
         # Sampling
         temperature = self.window.core.config.get('temperature')
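The Computer Use branch above discards any merged tool list and sends a single `gtypes.Tool(computer_use=...)` entry, since Gemini does not accept the Computer Use tool alongside other tools in one request. A minimal standalone sketch of the same wiring, assuming a configured `google-genai` client; the model id and prompt are illustrative:

```python
from google import genai
from google.genai import types as gtypes

client = genai.Client()  # picks up the API key from the environment

# Only the Computer Use tool may be present in the request.
tools = [gtypes.Tool(
    computer_use=gtypes.ComputerUse(
        environment=gtypes.Environment.ENVIRONMENT_BROWSER,
    )
)]

resp = client.models.generate_content(
    model="gemini-2.5-computer-use-preview-10-2025",  # illustrative model id
    contents="Open the downloads page and click the latest release.",
    config=gtypes.GenerateContentConfig(tools=tools),
)
# The reply is expected to carry function_call parts describing UI actions.
print(resp.candidates[0].content.parts)
```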
@@ -145,10 +151,9 @@ class Chat:
             stream = False # TTS non-stream in this app
             supports_tts = self._supports_tts(model.id)
 
-            # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
             inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
 
-            # Remove params not used by TTS flow
+            # Remove params not used by TTS flow
             for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
                 if key in cfg_kwargs:
                     del cfg_kwargs[key]
@@ -171,11 +176,92 @@ class Chat:
                     prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
                 )
             )
-            # else: fallback to text-only below
-
         cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
         params = dict(model=model.id, contents=inputs, config=cfg)
 
+        if mode == MODE_RESEARCH:
+
+            # Deep Research does not support audio inputs; if an audio snippet is present, transcribe it to text first.
+            if has_audio_input:
+                try:
+                    transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+                    transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+                    audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+                    trans_inputs = [
+                        Content(role="user", parts=[
+                            Part.from_text(text=transcribe_prompt),
+                            audio_part,
+                        ])
+                    ]
+                    trans_cfg = gtypes.GenerateContentConfig(
+                        temperature=self.window.core.config.get('temperature'),
+                        top_p=self.window.core.config.get('top_p'),
+                        max_output_tokens=context.max_tokens if context.max_tokens else None,
+                    )
+                    trans_resp = client.models.generate_content(
+                        model=transcribe_model,
+                        contents=trans_inputs,
+                        config=trans_cfg,
+                    )
+                    transcribed_text = self.extract_text(trans_resp).strip()
+                    if transcribed_text:
+                        prompt = (str(prompt or "").strip() + "\n\n" + transcribed_text).strip() if prompt else transcribed_text
+                        ctx.input = transcribed_text
+                        try:
+                            if isinstance(ctx.extra, dict):
+                                ctx.extra["transcription"] = transcribed_text
+                        except Exception:
+                            pass
+                except Exception:
+                    pass
+                # Ensure we don't send raw audio to Interactions API
+                if multimodal_ctx:
+                    multimodal_ctx.is_audio_input = False
+
+            # Build single-turn multimodal input for Interactions API (no full chat history)
+            research_parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+            interactions_input = self._parts_to_interactions_input(research_parts)
+
+            # Try to continue context with the last completed interaction (server-side state)
+            prev_interaction_id, last_event_id, last_status = self._find_last_interaction_state(
+                history=context.history,
+                ctx=ctx,
+            )
+            try:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                if prev_interaction_id:
+                    ctx.extra["previous_interaction_id"] = prev_interaction_id
+                if last_event_id:
+                    ctx.extra["google_last_event_id"] = last_event_id
+                if last_status:
+                    ctx.extra["google_interaction_status"] = last_status
+            except Exception:
+                pass
+
+            # Deep Research agent must use background=True; stream=True enables live progress updates.
+            create_kwargs: Dict[str, Any] = {
+                "agent": model.id,
+                "input": interactions_input if interactions_input else (str(prompt or "") or " "),
+                "background": True,
+                "stream": stream,
+                "agent_config": {
+                    "type": "deep-research",
+                    "thinking_summaries": "auto"
+                }
+            }
+
+            # Continue conversation on server using previous_interaction_id if available
+            if prev_interaction_id:
+                create_kwargs["previous_interaction_id"] = prev_interaction_id
+
+            # Do not pass custom tools here; Deep Research manages its own built-in tools.
+            return client.interactions.create(**create_kwargs)
+
         if stream and mode != MODE_AUDIO:
             return client.models.generate_content_stream(**params)
         else:
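The `MODE_RESEARCH` branch above bypasses `generate_content` entirely and targets the Interactions API: the Deep Research agent runs as a background interaction, optionally streams progress events, and continues conversation state server-side via `previous_interaction_id` instead of resending chat history. A sketch mirroring the `create_kwargs` shape from the hunk, assuming a `client` as in the earlier sketch; the agent id and interaction id are illustrative placeholders:

```python
previous_interaction_id = None  # e.g. recovered from ctx.extra / history

create_kwargs = {
    "agent": "deep-research",  # illustrative agent/model id
    "input": [{"type": "text", "text": "Survey recent changes in Gemini tooling."}],
    "background": True,        # required for the Deep Research agent
    "stream": True,            # live progress updates
    "agent_config": {
        "type": "deep-research",
        "thinking_summaries": "auto",
    },
}
if previous_interaction_id:
    # Reuse server-side conversation state instead of resending history.
    create_kwargs["previous_interaction_id"] = previous_interaction_id

result = client.interactions.create(**create_kwargs)
```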
@@ -189,28 +275,21 @@ class Chat:
         """
         Unpack non-streaming response from Google GenAI and set context.
 
-        :param mode: MODE_CHAT or
+        :param mode: MODE_CHAT, MODE_AUDIO or MODE_COMPUTER
         :param response: Response object
         :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
         """
         if mode == MODE_AUDIO:
-            # Prefer audio if present
             audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
             if audio_bytes:
-
-                # https://ai.google.dev/gemini-api/docs/speech-generation
-                if mime == "audio/pcm" or mime.startswith("audio/"):
+                if mime == "audio/pcm" or (isinstance(mime, str) and mime.startswith("audio/")):
                     wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
                     ctx.audio_output = wav_b64
                     ctx.is_audio = True
-                # Text transcript is typically not present for TTS; still try:
                 txt = self.extract_text(response)
                 ctx.output = txt or "..."
             else:
-                # No audio present -> fallback to text
                 ctx.output = self.extract_text(response)
-
-            # Usage
             try:
                 usage = getattr(response, "usage_metadata", None)
                 if usage:
@@ -219,17 +298,58 @@ class Chat:
                     ctx.set_tokens(p, c)
             except Exception:
                 pass
+            return
 
-
+        # ---- chat / computer ----
+        ctx.output = self.extract_text(response) or ""
 
-        #
-        ctx.output = self.extract_text(response)
-
-        # Extract function calls
+        # 1) Extract tool calls and store in ctx.tool_calls (backward-compatible shape)
         calls = self.extract_tool_calls(response)
         if calls:
             ctx.tool_calls = calls
 
+        # 2) In MODE_COMPUTER: capture raw model parts (with thought_signature) for next FunctionResponse turn
+        # and translate Computer Use calls into plugin commands now.
+        if mode == MODE_COMPUTER:
+            candidate = None
+            try:
+                cands = getattr(response, "candidates", None) or []
+                if cands:
+                    candidate = cands[0]
+            except Exception:
+                pass
+
+            if candidate and getattr(candidate, "content", None):
+                parts = getattr(candidate.content, "parts", None) or []
+                dump = self._dump_model_parts(parts)
+                if dump:
+                    if ctx.extra is None:
+                        ctx.extra = {}
+                    ctx.extra["prev_model_parts"] = dump
+
+            tool_calls: List[dict] = []
+            try:
+                tool_calls, has_calls = self.window.core.api.google.computer.handle_stream_chunk(
+                    ctx=ctx,
+                    chunk=response,
+                    tool_calls=tool_calls,
+                )
+            except Exception as e:
+                has_calls = False
+                print(f"Gemini computer-use mapping error: {e}")
+
+            if has_calls and tool_calls:
+                ctx.force_call = True
+                self.window.core.debug.info("[chat] Google tool calls found, unpacking...")
+                self.window.core.command.unpack_tool_calls_chunks(ctx, tool_calls)
+
+            if calls:
+                if ctx.extra is None:
+                    ctx.extra = {}
+                ctx.extra["function_response_required"] = True
+                ctx.extra["function_response_source"] = "ctx.tool_calls"
+                ctx.extra["function_response_reason"] = "computer_use"
+
         # Usage if available
         try:
             usage = getattr(response, "usage_metadata", None)
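The `ctx.extra` flags written at the end of this hunk form a handshake with the relocated `build_input()` further down: on the next turn, `_build_function_responses_from_history()` sees `function_response_required` on the last history item and emits a strict user -> model(functionCall) -> tool(functionResponse) triple instead of plain text history. The persisted state looks roughly like this; the action name, arguments and signature bytes are illustrative:

```python
# State carried on the context item between Computer Use turns.
ctx_extra = {
    "prev_model_parts": [
        {
            "type": "function_call",
            "name": "click_at",                    # hypothetical Computer Use action
            "args": {"x": 412, "y": 230},
            "thought_signature": b"opaque-bytes",  # must be replayed verbatim next turn
        },
        {"type": "text", "text": "Clicking the search button."},
    ],
    "function_response_required": True,
    "function_response_source": "ctx.tool_calls",
    "function_response_reason": "computer_use",
}
```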
@@ -283,12 +403,11 @@ class Chat:
         :return: List of tool calls
         """
         def _to_plain_dict(obj):
-            # Convert pydantic/genai objects to plain dict recursively
             try:
                 if hasattr(obj, "to_json_dict"):
                     return obj.to_json_dict()
                 if hasattr(obj, "model_dump"):
-                    return obj.model_dump()
+                    return obj.model_dump()
                 if hasattr(obj, "to_dict"):
                     return obj.to_dict()
             except Exception:
@@ -307,7 +426,6 @@ class Chat:
                 name = getattr(fc, "name", "") or ""
                 args_obj = getattr(fc, "args", {}) or {}
                 args_dict = _to_plain_dict(args_obj) or {}
-                # if str, try to parse
                 if isinstance(args_dict, str):
                     try:
                         import json
@@ -319,7 +437,7 @@ class Chat:
                     "type": "function",
                     "function": {
                         "name": name,
-                        "arguments": args_dict,
+                        "arguments": args_dict,
                     }
                 })
 
@@ -345,11 +463,11 @@ class Chat:
                 except Exception:
                     args_dict = {}
                 out.append({
-                    "id": "",
+                    "id": getattr(fn, "id", "") or "",
                     "type": "function",
                     "function": {
                         "name": name,
-                        "arguments": args_dict,
+                        "arguments": args_dict,
                     }
                 })
             except Exception:
@@ -357,17 +475,274 @@ class Chat:
 
         return out
 
+    def build_input(
+            self,
+            prompt: str,
+            system_prompt: str,
+            model: ModelItem,
+            history: Optional[List[CtxItem]] = None,
+            attachments: Optional[Dict[str, AttachmentItem]] = None,
+            multimodal_ctx: Optional[MultimodalContext] = None,
+            mode: str = MODE_CHAT,
+    ) -> List[Content]:
+        """
+        Build Google GenAI contents list
+
+        :param prompt: User prompt
+        :param system_prompt: System prompt/instruction
+        :param model: ModelItem
+        :param history: List of CtxItem for history
+        :param attachments: Dict of AttachmentItem for images/screenshots
+        :param multimodal_ctx: MultimodalContext for audio
+        :param mode: MODE_CHAT / MODE_AUDIO / MODE_COMPUTER
+        :return: List of Content
+        """
+        # FunctionResponse turn for Computer Use (strictly immediate after functionCall)
+        if mode == MODE_COMPUTER and self.window.core.config.get('use_context'):
+            hist = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+            fr_contents = self._build_function_responses_from_history(hist, attachments)
+            if fr_contents:
+                return fr_contents
+
+        # Build conversation history first to detect "first input"
+        items: List[CtxItem] = []
+        if self.window.core.config.get('use_context'):
+            items = self.window.core.ctx.get_history(
+                history,
+                model.id,
+                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
+                self.window.core.tokens.from_user(prompt, system_prompt),
+                self._fit_ctx(model),
+            )
+
+        is_first_turn = (len(items) == 0)
+        is_sandbox = bool(self.window.core.config.get("remote_tools.computer_use.sandbox", False))
+
+        contents: List[Content] = []
+
+        # Append conversation history (text only)
+        for item in items:
+            if item.final_input:
+                contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+            if item.final_output:
+                contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+        # Current user message:
+        # - In MODE_COMPUTER attach initial screenshot only on the very first turn
+        if mode == MODE_COMPUTER:
+            initial_attachments = {}
+            if is_first_turn and not attachments and not is_sandbox:
+                self.window.controller.attachment.clear_silent()
+                self.window.controller.painter.capture.screenshot(attach_cursor=True, silent=True)
+                initial_attachments = self.window.core.attachments.get_all(mode)
+            send_attachments = initial_attachments if initial_attachments else attachments
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=send_attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        else:
+            parts = self._build_user_parts(
+                content=str(prompt),
+                attachments=attachments,
+                multimodal_ctx=multimodal_ctx,
+            )
+        contents.append(Content(role="user", parts=parts))
+
+        return contents
+
+    def _build_function_responses_from_history(
+            self,
+            history: Optional[List[CtxItem]],
+            attachments: Optional[Dict[str, AttachmentItem]],
+    ) -> Optional[List[Content]]:
+        """
+        Build FunctionResponse contents for the immediate next turn after executing
+        Computer Use function calls. It reconstructs the last user -> model(functionCall) turn
+        and returns [user_content, model_function_call_content, tool_function_response_content].
+        """
+        if not self.window.core.config.get('use_context') or not history:
+            return None
+
+        last_item = history[-1]
+        if not getattr(last_item, "extra", None):
+            return None
+        if not last_item.extra.get("function_response_required"):
+            return None
+
+        # 1) Find the user message that started the current turn (previous item's input)
+        prior_user_text = ""
+        if len(history) >= 2:
+            prev = history[-2]
+            if getattr(prev, "final_input", None):
+                prior_user_text = str(prev.final_input)
+
+        if not prior_user_text and getattr(last_item, "input", None):
+            prior_user_text = str(last_item.input)
+
+        if not prior_user_text:
+            prior_user_text = "..."
+
+        user_content = Content(role="user", parts=[Part.from_text(text=prior_user_text)])
+
+        # 2) Rebuild the model functionCall content with thought_signature preserved
+        raw_parts = last_item.extra.get("prev_model_parts", [])
+        model_parts = self._rehydrate_model_parts(raw_parts)
+        if not model_parts:
+            model_parts = self._rehydrate_from_tool_calls(getattr(last_item, "tool_calls", []))
+        # append also text part if not empty
+        if getattr(last_item, "final_output", None):
+            output_text = str(last_item.final_output).strip()
+            if output_text:
+                model_parts.append(Part.from_text(text=output_text))
+
+        model_fc_content = Content(role="model", parts=model_parts)
+
+        # 3) Build a single tool content with N FunctionResponse parts (one per functionCall)
+        screenshot_part = self._screenshot_function_response_part(attachments)
+        fr_parts: List[Part] = []
+        for p in model_parts:
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                fr = Part.from_function_response(
+                    name=fn.name,
+                    response=self._minimal_tool_response(last_item),
+                    parts=[screenshot_part] if screenshot_part else None
+                )
+                fr_parts.append(fr)
+
+        if not fr_parts:
+            return None
+
+        tool_content = Content(role="tool", parts=fr_parts)
+
+        return [user_content, model_fc_content, tool_content]
+
+    def _rehydrate_from_tool_calls(self, calls: List[dict]) -> List[Part]:
+        """
+        Fallback rehydration when prev_model_parts are unavailable (no thought signatures).
+        """
+        parts: List[Part] = []
+        for c in calls or []:
+            if not isinstance(c, dict):
+                continue
+            if c.get("type") != "function":
+                continue
+            fn = c.get("function") or {}
+            name = fn.get("name")
+            args = fn.get("arguments") or {}
+            if not name:
+                continue
+            parts.append(Part.from_function_call(name=name, args=args))
+        return parts
+
+    def _dump_model_parts(self, parts: List[Part]) -> List[dict]:
+        """
+        Dump model parts into a JSON-serializable structure, preserving thought_signature.
+        """
+        out: List[dict] = []
+        for p in parts or []:
+            ts = getattr(p, "thought_signature", None)
+            if getattr(p, "function_call", None):
+                fn = p.function_call
+                name = getattr(fn, "name", "") or ""
+                args = getattr(fn, "args", {}) or {}
+                out.append({
+                    "type": "function_call",
+                    "name": name,
+                    "args": args,
+                    "thought_signature": ts,
+                })
+            elif getattr(p, "text", None):
+                out.append({"type": "text", "text": str(p.text)})
+        return out
+
+    def _rehydrate_model_parts(self, raw_parts: List[dict]) -> List[Part]:
+        """
+        Recreate SDK Part objects from dumped parts, including thought_signature on the Part.
+        """
+        parts: List[Part] = []
+        for it in raw_parts or []:
+            t = (it.get("type") or "").lower()
+            if t == "function_call":
+                name = it.get("name")
+                args = it.get("args") or {}
+                ts = it.get("thought_signature")
+                if name:
+                    parts.append(Part(function_call=gtypes.FunctionCall(name=name, args=args),
+                                      thought_signature=ts))
+            elif t == "text":
+                parts.append(Part.from_text(text=str(it.get("text", ""))))
+        return parts
+
+    def _screenshot_function_response_part(
+            self,
+            attachments: Optional[Dict[str, AttachmentItem]]
+    ) -> Optional[gtypes.FunctionResponsePart]:
+        """
+        Build FunctionResponsePart with inlineData PNG/JPEG screenshot from attachments.
+        """
+        if not attachments:
+            return None
+
+        chosen_path = None
+        for _, att in attachments.items():
+            if not att or not att.path:
+                continue
+            p = att.path
+            if isinstance(p, str) and os.path.exists(p):
+                ext = os.path.splitext(p)[1].lower()
+                if ext in (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"):
+                    chosen_path = p
+                    if ext == ".png":
+                        break
+
+        if not chosen_path:
+            return None
+
+        try:
+            with open(chosen_path, "rb") as f:
+                data = f.read()
+            blob = gtypes.FunctionResponseBlob(
+                mime_type="image/png" if chosen_path.lower().endswith(".png") else "image/jpeg",
+                data=data,
+            )
+            return gtypes.FunctionResponsePart(inline_data=blob)
+        except Exception:
+            return None
+
+    @staticmethod
+    def _minimal_tool_response(item: CtxItem) -> Dict[str, Any]:
+        """
+        Construct a minimal structured payload for FunctionResponse.response.
+        """
+        resp: Dict[str, Any] = {"ok": True}
+        try:
+            if item and item.extra and isinstance(item.extra, dict):
+                outputs = item.extra.get("tool_output")
+                if isinstance(outputs, list) and len(outputs) > 0:
+                    last = outputs[-1]
+                    if isinstance(last, dict):
+                        if "result" in last and isinstance(last["result"], dict):
+                            resp = last["result"]
+                        if "error" in last:
+                            resp["error"] = last["error"]
+        except Exception:
+            pass
+        return resp
+
     def _extract_inline_images_and_links(
             self,
             response, ctx: CtxItem
     ) -> None:
         """
         Extract inline image parts (Gemini image output) and file links.
-        - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
-        - Appends HTTP(S) image URIs from file_data to ctx.urls.
-
-        :param response: Response object
-        :param ctx: CtxItem to set images and urls
         """
         images: list[str] = []
         urls: list[str] = []
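The dump/rehydrate pair added above exists because the follow-up FunctionResponse turn must replay the model's functionCall parts with their opaque `thought_signature` intact; the backward-compatible `ctx.tool_calls` shape drops that field, hence the raw-part serialization. A round-trip sketch using the same constructors the diff uses; the tool name, args and signature bytes are illustrative:

```python
from google.genai import types as gtypes
from google.genai.types import Part

# Serialized form as produced by _dump_model_parts():
dumped = {
    "type": "function_call",
    "name": "type_text",
    "args": {"text": "hello"},
    "thought_signature": b"opaque-signature-bytes",
}

# _rehydrate_model_parts() rebuilds the SDK Part with the signature attached:
part = Part(
    function_call=gtypes.FunctionCall(name=dumped["name"], args=dumped["args"]),
    thought_signature=dumped["thought_signature"],
)
assert part.function_call.name == "type_text"
```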
@@ -378,7 +753,6 @@ class Chat:
                 content = getattr(cand, "content", None)
                 parts = getattr(content, "parts", None) or []
                 for p in parts:
-                    # Inline image bytes (image preview / image generation in chat)
                     blob = getattr(p, "inline_data", None)
                     if blob:
                         mime = (getattr(blob, "mime_type", "") or "").lower()
@@ -392,16 +766,14 @@ class Chat:
                                 f.write(img_bytes)
                             images.append(img_path)
 
-                    # File data URI (may contain http/https or gs://)
                     fdata = getattr(p, "file_data", None)
                     if fdata:
                         uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
                         mime = (getattr(fdata, "mime_type", "") or "").lower()
                         if uri and mime.startswith("image/"):
-                            # Store only as URL; downloading is out of scope here.
                             if uri.startswith("http://") or uri.startswith("https://"):
                                 urls.append(uri)
-        except Exception
+        except Exception:
             pass
 
         if images:
@@ -418,9 +790,6 @@ class Chat:
     def _ensure_bytes(data) -> bytes | None:
         """
         Return raw bytes from SDK part.inline_data.data which can be bytes or base64 string.
-
-        :param data: bytes or str
-        :return: bytes or None
         """
         try:
             if isinstance(data, (bytes, bytearray)):
@@ -432,56 +801,6 @@ class Chat:
             return None
         return None
 
-    def build_input(
-            self,
-            prompt: str,
-            system_prompt: str,
-            model: ModelItem,
-            history: Optional[List[CtxItem]] = None,
-            attachments: Optional[Dict[str, AttachmentItem]] = None,
-            multimodal_ctx: Optional[MultimodalContext] = None,
-    ) -> List[Content]:
-        """
-        Build Google GenAI contents list
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Content
-        """
-        contents: List[Content] = []
-
-        # System instruction is passed separately (system_instruction),
-        # so we do not build an explicit system role part here.
-
-        # Append conversation history
-        if self.window.core.config.get('use_context'):
-            items = self.window.core.ctx.get_history(
-                history,
-                model.id,
-                MODE_CHAT if model and MODE_CHAT in model.mode else MODE_CHAT,
-                self.window.core.tokens.from_user(prompt, system_prompt),
-                self._fit_ctx(model),
-            )
-            for item in items:
-                if item.final_input:
-                    contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
-                if item.final_output:
-                    contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
-
-        # Current user message with multimodal parts
-        parts = self._build_user_parts(
-            content=str(prompt),
-            attachments=attachments,
-            multimodal_ctx=multimodal_ctx,
-        )
-        contents.append(Content(role="user", parts=parts))
-
-        return contents
-
     def _build_user_parts(
             self,
             content: str,
@@ -490,11 +809,6 @@ class Chat:
     ) -> List[Part]:
         """
         Build user message parts (text + images + audio)
-
-        :param content: User text content
-        :param attachments: Dict of AttachmentItem for images
-        :param multimodal_ctx: MultimodalContext for audio
-        :return: List of Part
         """
         self.window.core.api.google.vision.reset()
         parts: List[Part] = []
@@ -515,9 +829,6 @@ class Chat:
     def _fit_ctx(self, model: ModelItem) -> int:
         """
         Fit to max model tokens (best-effort, uses model.ctx if present)
-
-        :param model: ModelItem
-        :return: max context tokens
         """
         max_ctx_tokens = self.window.core.config.get('max_total_tokens')
         if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
@@ -530,15 +841,10 @@ class Chat:
             system_prompt: str,
             model: ModelItem,
             history: Optional[List[CtxItem]] = None,
+            mode: str = MODE_CHAT,
     ) -> List[dict]:
         """
         Build simple messages structure for local token estimation
-
-        :param prompt: User prompt
-        :param system_prompt: System prompt/instruction
-        :param model: ModelItem
-        :param history: List of CtxItem for history
-        :return: List of messages dicts with 'role' and 'content' keys
         """
         messages = []
         if system_prompt:
@@ -562,7 +868,6 @@ class Chat:
         messages.append({"role": "user", "content": str(prompt)})
         return messages
 
-
     def reset_tokens(self):
         """Reset input tokens counter"""
         self.input_tokens = 0
@@ -570,8 +875,6 @@ class Chat:
     def get_used_tokens(self) -> int:
         """
         Get input tokens counter (estimated before sending)
-
-        :return: input tokens count
         """
         return self.input_tokens
 
@@ -579,13 +882,126 @@ class Chat:
     def _supports_tts(model_id: Optional[str]) -> bool:
         """
         Heuristic check if the model supports native TTS.
-        - Official TTS models contain '-tts' in id (e.g. 'gemini-2.5-flash-preview-tts').
-        - Future/preview names may contain 'native-audio'.
-
-        :param model_id: Model ID
-        :return: True if supports TTS, False otherwise
         """
         if not model_id:
             return False
         mid = model_id.lower()
-        return ("-tts" in mid) or ("native-audio" in mid)
+        return ("-tts" in mid) or ("native-audio" in mid)
+
+    @staticmethod
+    def _find_last_interaction_state(
+            history: Optional[List[CtxItem]],
+            ctx: CtxItem,
+    ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Resolve last known Interactions state:
+        - previous_interaction_id: to continue conversation context
+        - last_event_id: to resume streaming (not used here, but returned for completeness)
+        - last_status: last known status string if available
+
+        Looks at current ctx.extra first, then scans history from newest to oldest.
+        """
+        prev_interaction_id: Optional[str] = None
+        last_event_id: Optional[str] = None
+        last_status: Optional[str] = None
+
+        try:
+            if getattr(ctx, "extra", None) and isinstance(ctx.extra, dict):
+                prev_interaction_id = (
+                    ctx.extra.get("previous_interaction_id")
+                    or ctx.extra.get("google_interaction_id")
+                    or ctx.extra.get("google_last_interaction_id")
+                )
+                last_event_id = ctx.extra.get("google_last_event_id")
+                last_status = ctx.extra.get("google_interaction_status")
+        except Exception:
+            pass
+
+        if not prev_interaction_id and history:
+            for item in reversed(history or []):
+                ex = getattr(item, "extra", None)
+                if not ex or not isinstance(ex, dict):
+                    continue
+                prev_interaction_id = (
+                    ex.get("previous_interaction_id")
+                    or ex.get("google_interaction_id")
+                    or ex.get("google_last_interaction_id")
+                    or prev_interaction_id
+                )
+                last_event_id = ex.get("google_last_event_id") or last_event_id
+                last_status = ex.get("google_interaction_status") or last_status
+                if prev_interaction_id and last_event_id:
+                    break
+
+        return prev_interaction_id, last_event_id, last_status
+
+    @staticmethod
+    def _mime_to_interactions_type(mime: str) -> Optional[str]:
+        """
+        Map MIME type to Interactions input type.
+        """
+        if not mime:
+            return None
+        m = mime.lower()
+        if m.startswith("image/"):
+            return "image"
+        if m.startswith("audio/"):
+            return "audio"
+        if m.startswith("video/"):
+            return "video"
+        return None
+
+    @staticmethod
+    def _ensure_base64(data) -> Optional[str]:
+        """
+        Return base64 string from raw bytes or a base64 string.
+        """
+        try:
+            if data is None:
+                return None
+            if isinstance(data, str):
+                return data
+            if isinstance(data, (bytes, bytearray)):
+                import base64
+                return base64.b64encode(bytes(data)).decode("utf-8")
+        except Exception:
+            return None
+        return None
+
+    def _parts_to_interactions_input(self, parts: List[Part]) -> List[Dict[str, Any]]:
+        """
+        Convert Responses API Part list into Interactions API input payload.
+        """
+        out: List[Dict[str, Any]] = []
+
+        for p in parts or []:
+            # Text
+            t = getattr(p, "text", None)
+            if t is not None:
+                s = str(t).strip()
+                if s:
+                    out.append({"type": "text", "text": s})
+                continue
+
+            # Inline data (images/audio/video)
+            inline = getattr(p, "inline_data", None)
+            if inline:
+                mime = (getattr(inline, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                data = getattr(inline, "data", None)
+                b64 = self._ensure_base64(data)
+                if typ and b64:
+                    out.append({"type": typ, "data": b64, "mime_type": mime})
+                continue
+
+            # File references (prefer URIs from Gemini Files API)
+            fdata = getattr(p, "file_data", None)
+            if fdata:
+                uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                mime = (getattr(fdata, "mime_type", "") or "").lower()
+                typ = self._mime_to_interactions_type(mime)
+                if typ and uri:
+                    out.append({"type": typ, "uri": uri})
+                continue
+
+        return out
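`_parts_to_interactions_input()` flattens SDK `Part` objects into the plain dict payload sent to the Interactions API: text parts become `{"type": "text", ...}`, inline blobs are base64-encoded alongside their MIME type, and file references pass through as URIs. For example, with illustrative values:

```python
from google.genai.types import Part

parts = [
    Part.from_text(text="Compare these two screenshots."),
    Part.from_bytes(data=b"\x89PNG\r\n fake image bytes", mime_type="image/png"),
]
# _parts_to_interactions_input(parts) would yield (base64 shortened):
# [
#     {"type": "text", "text": "Compare these two screenshots."},
#     {"type": "image", "data": "iVBOR...", "mime_type": "image/png"},
# ]
```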