pygpt-net 2.6.29__py3-none-any.whl → 2.6.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. pygpt_net/CHANGELOG.txt +15 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +4 -0
  4. pygpt_net/{container.py → app_core.py} +5 -6
  5. pygpt_net/controller/__init__.py +5 -2
  6. pygpt_net/controller/access/control.py +1 -9
  7. pygpt_net/controller/assistant/assistant.py +4 -4
  8. pygpt_net/controller/assistant/batch.py +7 -7
  9. pygpt_net/controller/assistant/files.py +4 -4
  10. pygpt_net/controller/assistant/threads.py +3 -3
  11. pygpt_net/controller/attachment/attachment.py +4 -7
  12. pygpt_net/controller/audio/audio.py +25 -1
  13. pygpt_net/controller/audio/ui.py +2 -2
  14. pygpt_net/controller/chat/audio.py +1 -8
  15. pygpt_net/controller/chat/common.py +30 -4
  16. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  17. pygpt_net/controller/chat/output.py +8 -3
  18. pygpt_net/controller/chat/stream.py +4 -405
  19. pygpt_net/controller/chat/text.py +3 -2
  20. pygpt_net/controller/chat/vision.py +11 -19
  21. pygpt_net/controller/config/placeholder.py +1 -1
  22. pygpt_net/controller/ctx/ctx.py +1 -1
  23. pygpt_net/controller/ctx/summarizer.py +1 -1
  24. pygpt_net/controller/kernel/kernel.py +11 -3
  25. pygpt_net/controller/kernel/reply.py +5 -1
  26. pygpt_net/controller/mode/mode.py +21 -12
  27. pygpt_net/controller/plugins/settings.py +3 -2
  28. pygpt_net/controller/presets/editor.py +112 -99
  29. pygpt_net/controller/realtime/__init__.py +12 -0
  30. pygpt_net/controller/realtime/manager.py +53 -0
  31. pygpt_net/controller/realtime/realtime.py +268 -0
  32. pygpt_net/controller/theme/theme.py +3 -2
  33. pygpt_net/controller/ui/mode.py +7 -0
  34. pygpt_net/controller/ui/ui.py +19 -1
  35. pygpt_net/controller/ui/vision.py +4 -4
  36. pygpt_net/core/agents/legacy.py +2 -2
  37. pygpt_net/core/agents/runners/openai_workflow.py +2 -2
  38. pygpt_net/core/assistants/files.py +5 -5
  39. pygpt_net/core/assistants/store.py +4 -4
  40. pygpt_net/core/audio/audio.py +6 -1
  41. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  42. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  43. pygpt_net/core/audio/backend/native/player.py +139 -0
  44. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  45. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  46. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  47. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  48. pygpt_net/core/audio/backend/pyaudio/realtime.py +275 -0
  49. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  50. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  51. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  52. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  53. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  54. pygpt_net/core/audio/backend/shared/player.py +137 -0
  55. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  56. pygpt_net/core/audio/capture.py +5 -0
  57. pygpt_net/core/audio/output.py +13 -2
  58. pygpt_net/core/audio/whisper.py +6 -2
  59. pygpt_net/core/bridge/bridge.py +4 -3
  60. pygpt_net/core/bridge/worker.py +31 -9
  61. pygpt_net/core/debug/console/console.py +2 -2
  62. pygpt_net/core/debug/presets.py +2 -2
  63. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  64. pygpt_net/core/events/__init__.py +2 -1
  65. pygpt_net/core/events/realtime.py +55 -0
  66. pygpt_net/core/experts/experts.py +2 -2
  67. pygpt_net/core/image/image.py +51 -1
  68. pygpt_net/core/modes/modes.py +2 -2
  69. pygpt_net/core/presets/presets.py +3 -3
  70. pygpt_net/core/realtime/options.py +87 -0
  71. pygpt_net/core/realtime/shared/__init__.py +0 -0
  72. pygpt_net/core/realtime/shared/audio.py +213 -0
  73. pygpt_net/core/realtime/shared/loop.py +64 -0
  74. pygpt_net/core/realtime/shared/session.py +59 -0
  75. pygpt_net/core/realtime/shared/text.py +37 -0
  76. pygpt_net/core/realtime/shared/tools.py +276 -0
  77. pygpt_net/core/realtime/shared/turn.py +38 -0
  78. pygpt_net/core/realtime/shared/types.py +16 -0
  79. pygpt_net/core/realtime/worker.py +164 -0
  80. pygpt_net/core/tokens/tokens.py +4 -4
  81. pygpt_net/core/types/__init__.py +1 -0
  82. pygpt_net/core/types/image.py +48 -0
  83. pygpt_net/core/types/mode.py +5 -2
  84. pygpt_net/core/vision/analyzer.py +1 -1
  85. pygpt_net/data/config/config.json +13 -4
  86. pygpt_net/data/config/models.json +219 -101
  87. pygpt_net/data/config/modes.json +3 -9
  88. pygpt_net/data/config/settings.json +135 -27
  89. pygpt_net/data/config/settings_section.json +2 -2
  90. pygpt_net/data/locale/locale.de.ini +7 -7
  91. pygpt_net/data/locale/locale.en.ini +25 -12
  92. pygpt_net/data/locale/locale.es.ini +7 -7
  93. pygpt_net/data/locale/locale.fr.ini +7 -7
  94. pygpt_net/data/locale/locale.it.ini +7 -7
  95. pygpt_net/data/locale/locale.pl.ini +8 -8
  96. pygpt_net/data/locale/locale.uk.ini +7 -7
  97. pygpt_net/data/locale/locale.zh.ini +3 -3
  98. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  99. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  100. pygpt_net/item/model.py +23 -3
  101. pygpt_net/plugin/audio_input/plugin.py +37 -4
  102. pygpt_net/plugin/audio_input/simple.py +57 -8
  103. pygpt_net/plugin/cmd_files/worker.py +3 -0
  104. pygpt_net/plugin/openai_dalle/plugin.py +4 -4
  105. pygpt_net/plugin/openai_vision/plugin.py +12 -13
  106. pygpt_net/provider/agents/openai/agent.py +5 -5
  107. pygpt_net/provider/agents/openai/agent_b2b.py +5 -5
  108. pygpt_net/provider/agents/openai/agent_planner.py +5 -6
  109. pygpt_net/provider/agents/openai/agent_with_experts.py +5 -5
  110. pygpt_net/provider/agents/openai/agent_with_experts_feedback.py +4 -4
  111. pygpt_net/provider/agents/openai/agent_with_feedback.py +4 -4
  112. pygpt_net/provider/agents/openai/bot_researcher.py +2 -2
  113. pygpt_net/provider/agents/openai/bots/research_bot/agents/planner_agent.py +1 -1
  114. pygpt_net/provider/agents/openai/bots/research_bot/agents/search_agent.py +1 -1
  115. pygpt_net/provider/agents/openai/bots/research_bot/agents/writer_agent.py +1 -1
  116. pygpt_net/provider/agents/openai/evolve.py +5 -5
  117. pygpt_net/provider/agents/openai/supervisor.py +4 -4
  118. pygpt_net/provider/api/__init__.py +27 -0
  119. pygpt_net/provider/api/anthropic/__init__.py +68 -0
  120. pygpt_net/provider/api/google/__init__.py +295 -0
  121. pygpt_net/provider/api/google/audio.py +121 -0
  122. pygpt_net/provider/api/google/chat.py +591 -0
  123. pygpt_net/provider/api/google/image.py +427 -0
  124. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  125. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  126. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  127. pygpt_net/provider/api/google/tools.py +222 -0
  128. pygpt_net/provider/api/google/vision.py +129 -0
  129. pygpt_net/provider/{gpt → api/openai}/__init__.py +24 -4
  130. pygpt_net/provider/api/openai/agents/__init__.py +0 -0
  131. pygpt_net/provider/{gpt → api/openai}/agents/computer.py +1 -1
  132. pygpt_net/provider/{gpt → api/openai}/agents/experts.py +1 -1
  133. pygpt_net/provider/{gpt → api/openai}/agents/response.py +1 -1
  134. pygpt_net/provider/{gpt → api/openai}/assistants.py +1 -1
  135. pygpt_net/provider/{gpt → api/openai}/chat.py +15 -8
  136. pygpt_net/provider/{gpt → api/openai}/completion.py +1 -1
  137. pygpt_net/provider/{gpt → api/openai}/image.py +1 -1
  138. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  139. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  140. pygpt_net/provider/api/openai/realtime/realtime.py +194 -0
  141. pygpt_net/provider/{gpt → api/openai}/remote_tools.py +1 -1
  142. pygpt_net/provider/{gpt → api/openai}/responses.py +34 -20
  143. pygpt_net/provider/{gpt → api/openai}/store.py +2 -2
  144. pygpt_net/provider/{gpt → api/openai}/vision.py +1 -1
  145. pygpt_net/provider/api/openai/worker/__init__.py +0 -0
  146. pygpt_net/provider/{gpt → api/openai}/worker/assistants.py +4 -4
  147. pygpt_net/provider/{gpt → api/openai}/worker/importer.py +10 -10
  148. pygpt_net/provider/audio_input/google_genai.py +103 -0
  149. pygpt_net/provider/audio_input/openai_whisper.py +1 -1
  150. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  151. pygpt_net/provider/audio_output/openai_tts.py +9 -6
  152. pygpt_net/provider/core/config/patch.py +26 -0
  153. pygpt_net/provider/core/model/patch.py +20 -0
  154. pygpt_net/provider/core/preset/json_file.py +2 -4
  155. pygpt_net/provider/llms/anthropic.py +2 -5
  156. pygpt_net/provider/llms/base.py +4 -3
  157. pygpt_net/provider/llms/google.py +8 -9
  158. pygpt_net/provider/llms/openai.py +1 -1
  159. pygpt_net/provider/loaders/hub/image_vision/base.py +1 -1
  160. pygpt_net/ui/dialog/preset.py +71 -55
  161. pygpt_net/ui/layout/toolbox/footer.py +16 -0
  162. pygpt_net/ui/layout/toolbox/image.py +5 -0
  163. pygpt_net/ui/main.py +6 -4
  164. pygpt_net/ui/widget/option/combo.py +15 -1
  165. pygpt_net/utils.py +9 -0
  166. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/METADATA +55 -55
  167. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/RECORD +181 -135
  168. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  169. /pygpt_net/{provider/gpt/agents → controller/chat/handler}/__init__.py +0 -0
  170. /pygpt_net/{provider/gpt/worker → core/realtime}/__init__.py +0 -0
  171. /pygpt_net/provider/{gpt → api/openai}/agents/client.py +0 -0
  172. /pygpt_net/provider/{gpt → api/openai}/agents/remote_tools.py +0 -0
  173. /pygpt_net/provider/{gpt → api/openai}/agents/utils.py +0 -0
  174. /pygpt_net/provider/{gpt → api/openai}/audio.py +0 -0
  175. /pygpt_net/provider/{gpt → api/openai}/computer.py +0 -0
  176. /pygpt_net/provider/{gpt → api/openai}/container.py +0 -0
  177. /pygpt_net/provider/{gpt → api/openai}/summarizer.py +0 -0
  178. /pygpt_net/provider/{gpt → api/openai}/tools.py +0 -0
  179. /pygpt_net/provider/{gpt → api/openai}/utils.py +0 -0
  180. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/LICENSE +0 -0
  181. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/WHEEL +0 -0
  182. {pygpt_net-2.6.29.dist-info → pygpt_net-2.6.31.dist-info}/entry_points.txt +0 -0
pygpt_net/provider/api/google/chat.py
@@ -0,0 +1,591 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # ================================================== #
+ # This file is a part of PYGPT package               #
+ # Website: https://pygpt.net                         #
+ # GitHub: https://github.com/szczyglis-dev/py-gpt    #
+ # MIT License                                        #
+ # Created By : Marcin Szczygliński                   #
+ # Updated Date: 2025.08.28 20:00:00                  #
+ # ================================================== #
+
+ from typing import Optional, Dict, Any, List
+
+ from google.genai import types as gtypes
+ from google.genai.types import Content, Part
+
+ from pygpt_net.core.types import MODE_CHAT, MODE_AUDIO
+ from pygpt_net.core.bridge.context import BridgeContext, MultimodalContext
+ from pygpt_net.item.attachment import AttachmentItem
+ from pygpt_net.item.ctx import CtxItem
+ from pygpt_net.item.model import ModelItem
+
+
+ class Chat:
+     def __init__(self, window=None):
+         """
+         Chat wrapper for Google GenAI.
+         """
+         self.window = window
+         self.input_tokens = 0
+
+     def send(
+             self,
+             context: BridgeContext,
+             extra: Optional[Dict[str, Any]] = None
+     ):
+         """
+         Call Google GenAI for chat / multimodal / audio.
+
+         :param context: BridgeContext with prompt, model, history, mode, etc.
+         :param extra: Extra parameters (not used currently)
+         :return: Response object or generator (if streaming)
+         """
+         prompt = context.prompt
+         stream = context.stream
+         system_prompt = context.system_prompt
+         model = context.model
+         functions = context.external_functions
+         attachments = context.attachments
+         multimodal_ctx = context.multimodal_ctx
+         mode = context.mode
+         ctx = context.ctx or CtxItem()
+
+         client = self.window.core.api.google.get_client(context.mode, model)
+
+         # Detect audio-input present
+         has_audio_input = bool(
+             multimodal_ctx
+             and getattr(multimodal_ctx, "is_audio_input", False)
+             and getattr(multimodal_ctx, "audio_data", None)
+         )
+
+         # ------------- TRANSCRIPTION PATH (audio input -> text -> feed to TTS) -------------
+         if mode == MODE_AUDIO and has_audio_input:
+             # Build minimal transcription request: [instruction text, audio part]
+             transcribe_model = self.window.core.config.get("google_audio.transcribe_model", "gemini-2.5-flash")
+             transcribe_prompt = self.window.core.config.get("google_audio.transcribe_prompt", "Transcribe this audio clip to text.")
+             audio_part = self.window.core.api.google.audio.build_part(multimodal_ctx)
+             trans_inputs = [
+                 Content(role="user", parts=[
+                     Part.from_text(text=transcribe_prompt),
+                     audio_part,
+                 ])
+             ]
+             trans_cfg = gtypes.GenerateContentConfig(
+                 # Keep minimal; no tools/system for transcription
+                 temperature=self.window.core.config.get('temperature'),
+                 top_p=self.window.core.config.get('top_p'),
+                 max_output_tokens=context.max_tokens if context.max_tokens else None,
+             )
+
+             # Always non-stream here (we immediately need the text for TTS)
+             trans_resp = client.models.generate_content(
+                 model=transcribe_model,
+                 contents=trans_inputs,
+                 config=trans_cfg,
+             )
+             transcribed_text = self.extract_text(trans_resp).strip()
+             if transcribed_text:
+                 # Feed transcription into TTS as the final prompt
+                 prompt = transcribed_text
+                 ctx.input = transcribed_text
+                 try:
+                     # optional: store for debugging/UX
+                     if isinstance(ctx.extra, dict):
+                         ctx.extra["transcription"] = transcribed_text
+                 except Exception:
+                     pass
+                 ctx.is_audio = False  # transcription is text
+                 multimodal_ctx.is_audio_input = False  # disable audio input for TTS below
+
+         # ---------------------- REGULAR CHAT PATH (or no-audio in MODE_AUDIO) ----------------------
+         # Build contents for chat/multimodal (will be overridden for TTS below)
+         inputs = self.build_input(
+             prompt=prompt,
+             system_prompt=system_prompt,
+             model=model,
+             history=context.history,
+             attachments=attachments,
+             multimodal_ctx=multimodal_ctx,
+         )
+
+         # Best-effort input tokens estimate
+         self.reset_tokens()
+         count_msgs = self._build_count_messages(prompt, system_prompt, model, context.history)
+         self.input_tokens += self.window.core.tokens.from_messages(count_msgs, model.id)
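+         # Local estimate only; the authoritative token counts arrive with the
+         # response in usage_metadata and are read in unpack_response() below.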
+
+         # Tools -> merge app-defined tools with remote tools
+         base_tools = self.window.core.api.google.tools.prepare(model, functions)
+         remote_tools = self.window.core.api.google.build_remote_tools(model)
+
+         # Check tools compatibility
+         if base_tools:
+             remote_tools = []  # remote tools are not allowed if function calling is used
+         tools = (base_tools or []) + (remote_tools or [])
+         if "-image" in model.id:
+             tools = None  # function calling is not supported for image models
+
+         # Sampling
+         temperature = self.window.core.config.get('temperature')
+         top_p = self.window.core.config.get('top_p')
+         max_tokens = context.max_tokens if context.max_tokens else None
+
+         # Base config
+         cfg_kwargs: Dict[str, Any] = dict(
+             temperature=temperature,
+             top_p=top_p,
+             max_output_tokens=max_tokens,
+             system_instruction=system_prompt if system_prompt else None,
+             tools=tools if tools else None,
+         )
+
+         # ---------- AUDIO MODE (output TTS) ----------
+         if mode == MODE_AUDIO:
+             stream = False  # TTS is non-stream in this app
+             supports_tts = self._supports_tts(model.id)
+
+             # Force minimal single-turn input for TTS (text only), using prompt possibly replaced by transcription
+             inputs = [Content(role="user", parts=[Part.from_text(text=str(prompt or ""))])]
+
+             # Remove params not used by the TTS flow (and that sometimes cause issues)
+             for key in ("temperature", "top_p", "max_output_tokens", "system_instruction", "tools"):
+                 if key in cfg_kwargs:
+                     del cfg_kwargs[key]
+
+             # Voice selection (case-sensitive name)
+             voice_name = "Kore"
+             try:
+                 tmp = self.window.core.plugins.get_option("audio_output", "google_genai_tts_voice")
+                 if tmp:
+                     name = str(tmp).strip()
+                     mapping = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse", "legend": "Legend"}
+                     voice_name = mapping.get(name.lower(), name)
+             except Exception:
+                 pass
+
+             if supports_tts:
+                 cfg_kwargs["response_modalities"] = ["AUDIO"]
+                 cfg_kwargs["speech_config"] = gtypes.SpeechConfig(
+                     voice_config=gtypes.VoiceConfig(
+                         prebuilt_voice_config=gtypes.PrebuiltVoiceConfig(voice_name=voice_name)
+                     )
+                 )
+             # else: fall back to text-only below
+
+         cfg = gtypes.GenerateContentConfig(**cfg_kwargs)
+         params = dict(model=model.id, contents=inputs, config=cfg)
+
+         if stream and mode != MODE_AUDIO:
+             return client.models.generate_content_stream(**params)
+         else:
+             return client.models.generate_content(**params)
+
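+     # With stream=True, send() returns the SDK's chunk generator as-is; the
+     # non-streaming response object is consumed by unpack_response() below.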
+     def unpack_response(
+             self,
+             mode: str,
+             response, ctx: CtxItem
+     ):
+         """
+         Unpack non-streaming response from Google GenAI and set context.
+
+         :param mode: MODE_CHAT or MODE_AUDIO
+         :param response: Response object
+         :param ctx: CtxItem to set output, audio_output, tokens, tool_calls
+         """
+         if mode == MODE_AUDIO:
+             # Prefer audio if present
+             audio_bytes, mime = self.window.core.api.google.audio.extract_first_audio_part(response)
+             if audio_bytes:
+                 # Google returns PCM16 24kHz mono for TTS; wrap to WAV (base64) for UI compatibility
+                 # https://ai.google.dev/gemini-api/docs/speech-generation
+                 if mime.startswith("audio/"):
+                     wav_b64 = self.window.core.api.google.audio.pcm16_to_wav_base64(audio_bytes, rate=24000)
+                     ctx.audio_output = wav_b64
+                     ctx.is_audio = True
+                 # Text transcript is typically not present for TTS; still try:
+                 txt = self.extract_text(response)
+                 ctx.output = txt or "..."
+             else:
+                 # No audio present -> fall back to text
+                 ctx.output = self.extract_text(response)
+
+             # Usage
+             try:
+                 usage = getattr(response, "usage_metadata", None)
+                 if usage:
+                     p = getattr(usage, "prompt_token_count", 0) or 0
+                     c = getattr(usage, "candidates_token_count", 0) or 0
+                     ctx.set_tokens(p, c)
+             except Exception:
+                 pass
+
+             return  # audio path done
+
+         # ---- regular chat/completion ----
+         ctx.output = self.extract_text(response)
+
+         # Extract function calls
+         calls = self.extract_tool_calls(response)
+         if calls:
+             ctx.tool_calls = calls
+
+         # Usage if available
+         try:
+             usage = getattr(response, "usage_metadata", None)
+             if usage:
+                 p = getattr(usage, "prompt_token_count", 0) or 0
+                 c = getattr(usage, "candidates_token_count", 0) or 0
+                 ctx.set_tokens(p, c)
+         except Exception:
+             pass
+
+         # Best-effort: inline images / links (image output in chat)
+         try:
+             self._extract_inline_images_and_links(response, ctx)
+         except Exception:
+             pass
+
+     def extract_text(self, response) -> str:
+         """
+         Extract output text.
+
+         Prefer response.text (Python SDK), then fall back to parts[].text.
+
+         :param response: Response object
+         :return: Extracted text
+         """
+         txt = getattr(response, "text", None) or getattr(response, "output_text", None)
+         if txt:
+             return str(txt).strip()
+         try:
+             cands = getattr(response, "candidates", None) or []
+             if cands:
+                 parts = getattr(cands[0], "content", None)
+                 parts = getattr(parts, "parts", None) or []
+                 out = []
+                 for p in parts:
+                     t = getattr(p, "text", None)
+                     if t:
+                         out.append(str(t))
+                 return "".join(out).strip()
+         except Exception:
+             pass
+         return ""
+
+     def extract_tool_calls(self, response) -> List[dict]:
+         """
+         Extract tool calls in a format compatible with the app's tool execution.
+         Prefer response.function_calls (Python SDK), then fall back to parts[].function_call.
+         Returns arguments as a Python dict (not a JSON string).
+
+         :param response: Response object
+         :return: List of tool calls
+         """
+         def _to_plain_dict(obj):
+             # Convert pydantic/genai objects to plain dicts recursively
+             try:
+                 if hasattr(obj, "to_json_dict"):
+                     return obj.to_json_dict()
+                 if hasattr(obj, "model_dump"):
+                     return obj.model_dump()  # pydantic v2
+                 if hasattr(obj, "to_dict"):
+                     return obj.to_dict()
+             except Exception:
+                 pass
+             if isinstance(obj, dict):
+                 return {k: _to_plain_dict(v) for k, v in obj.items()}
+             if isinstance(obj, (list, tuple)):
+                 return [_to_plain_dict(x) for x in obj]
+             return obj
+
+         out: List[dict] = []
+
+         # 1) response.function_calls
+         fcs = getattr(response, "function_calls", None) or []
+         for fc in fcs:
+             name = getattr(fc, "name", "") or ""
+             args_obj = getattr(fc, "args", {}) or {}
+             args_dict = _to_plain_dict(args_obj) or {}
+             # if str, try to parse
+             if isinstance(args_dict, str):
+                 try:
+                     import json
+                     args_dict = json.loads(args_dict)
+                 except Exception:
+                     args_dict = {}
+             out.append({
+                 "id": getattr(fc, "id", "") or "",
+                 "type": "function",
+                 "function": {
+                     "name": name,
+                     "arguments": args_dict,  # <--- DICT, not string
+                 }
+             })
+
+         if out:
+             return out
+
+         # 2) Fallback: candidates -> parts[].function_call
+         try:
+             cands = getattr(response, "candidates", None) or []
+             for cand in cands:
+                 parts = getattr(getattr(cand, "content", None), "parts", None) or []
+                 for part in parts:
+                     fn = getattr(part, "function_call", None)
+                     if not fn:
+                         continue
+                     name = getattr(fn, "name", "") or ""
+                     args_obj = getattr(fn, "args", {}) or {}
+                     args_dict = _to_plain_dict(args_obj) or {}
+                     if isinstance(args_dict, str):
+                         try:
+                             import json
+                             args_dict = json.loads(args_dict)
+                         except Exception:
+                             args_dict = {}
+                     out.append({
+                         "id": "",
+                         "type": "function",
+                         "function": {
+                             "name": name,
+                             "arguments": args_dict,  # <--- DICT
+                         }
+                     })
+         except Exception:
+             pass
+
+         return out
+
+     def _extract_inline_images_and_links(
+             self,
+             response, ctx: CtxItem
+     ) -> None:
+         """
+         Extract inline image parts (Gemini image output) and file links.
+
+         - Saves inline_data (image/*) bytes to files and appends paths to ctx.images.
+         - Appends HTTP(S) image URIs from file_data to ctx.urls.
+
+         :param response: Response object
+         :param ctx: CtxItem to set images and urls
+         """
+         images: list[str] = []
+         urls: list[str] = []
+
+         try:
+             cands = getattr(response, "candidates", None) or []
+             for cand in cands:
+                 content = getattr(cand, "content", None)
+                 parts = getattr(content, "parts", None) or []
+                 for p in parts:
+                     # Inline image bytes (image preview / image generation in chat)
+                     blob = getattr(p, "inline_data", None)
+                     if blob:
+                         mime = (getattr(blob, "mime_type", "") or "").lower()
+                         if mime.startswith("image/"):
+                             data = getattr(blob, "data", None)
+                             if data:
+                                 img_bytes = self._ensure_bytes(data)
+                                 if img_bytes:
+                                     img_path = self.window.core.image.gen_unique_path(ctx)
+                                     with open(img_path, "wb") as f:
+                                         f.write(img_bytes)
+                                     images.append(img_path)
+
+                     # File data URI (may contain http/https or gs://)
+                     fdata = getattr(p, "file_data", None)
+                     if fdata:
+                         uri = getattr(fdata, "file_uri", None) or getattr(fdata, "uri", None)
+                         mime = (getattr(fdata, "mime_type", "") or "").lower()
+                         if uri and mime.startswith("image/"):
+                             # Store only as URL; downloading is out of scope here.
+                             if uri.startswith("http://") or uri.startswith("https://"):
+                                 urls.append(uri)
+         except Exception:
+             # Best-effort only
+             pass
+
+         if images:
+             if not isinstance(ctx.images, list):
+                 ctx.images = []
+             ctx.images.extend(images)
+
+         if urls:
+             if ctx.urls is None:
+                 ctx.urls = []
+             ctx.urls.extend(urls)
+
+     @staticmethod
+     def _ensure_bytes(data) -> bytes | None:
+         """
+         Return raw bytes from SDK part.inline_data.data, which can be bytes or a base64 string.
+
+         :param data: bytes or str
+         :return: bytes or None
+         """
+         try:
+             if isinstance(data, (bytes, bytearray)):
+                 return bytes(data)
+             if isinstance(data, str):
+                 import base64
+                 return base64.b64decode(data)
+         except Exception:
+             return None
+         return None
+
+     def build_input(
+             self,
+             prompt: str,
+             system_prompt: str,
+             model: ModelItem,
+             history: Optional[List[CtxItem]] = None,
+             attachments: Optional[Dict[str, AttachmentItem]] = None,
+             multimodal_ctx: Optional[MultimodalContext] = None,
+     ) -> List[Content]:
+         """
+         Build the Google GenAI contents list.
+
+         :param prompt: User prompt
+         :param system_prompt: System prompt/instruction
+         :param model: ModelItem
+         :param history: List of CtxItem for history
+         :param attachments: Dict of AttachmentItem for images
+         :param multimodal_ctx: MultimodalContext for audio
+         :return: List of Content
+         """
+         contents: List[Content] = []
+
+         # System instruction is passed separately (system_instruction),
+         # so we do not build an explicit system role part here.
+
+         # Append conversation history
+         if self.window.core.config.get('use_context'):
+             items = self.window.core.ctx.get_history(
+                 history,
+                 model.id,
+                 MODE_CHAT,
+                 self.window.core.tokens.from_user(prompt, system_prompt),
+                 self._fit_ctx(model),
+             )
+             for item in items:
+                 if item.final_input:
+                     contents.append(Content(role="user", parts=[Part.from_text(text=str(item.final_input))]))
+                 if item.final_output:
+                     contents.append(Content(role="model", parts=[Part.from_text(text=str(item.final_output))]))
+
+         # Current user message with multimodal parts
+         parts = self._build_user_parts(
+             content=str(prompt),
+             attachments=attachments,
+             multimodal_ctx=multimodal_ctx,
+         )
+         contents.append(Content(role="user", parts=parts))
+
+         return contents
+
+     def _build_user_parts(
+             self,
+             content: str,
+             attachments: Optional[Dict[str, AttachmentItem]] = None,
+             multimodal_ctx: Optional[MultimodalContext] = None,
+     ) -> List[Part]:
+         """
+         Build user message parts (text + images + audio).
+
+         :param content: User text content
+         :param attachments: Dict of AttachmentItem for images
+         :param multimodal_ctx: MultimodalContext for audio
+         :return: List of Part
+         """
+         parts: List[Part] = []
+         if content:
+             parts.append(Part.from_text(text=str(content)))
+
+         if attachments:
+             img_parts = self.window.core.api.google.vision.build_parts(content, attachments)
+             parts.extend(img_parts)
+
+         if multimodal_ctx and multimodal_ctx.is_audio_input and multimodal_ctx.audio_data:
+             audio_format = (multimodal_ctx.audio_format or "wav").lower()
+             mime = f"audio/{audio_format}"
+             parts.append(Part.from_bytes(data=multimodal_ctx.audio_data, mime_type=mime))
+
+         return parts
+
+     def _fit_ctx(self, model: ModelItem) -> int:
+         """
+         Fit to max model tokens (best-effort, uses model.ctx if present).
+
+         :param model: ModelItem
+         :return: max context tokens
+         """
+         max_ctx_tokens = self.window.core.config.get('max_total_tokens')
+         if model and model.ctx and 0 < model.ctx < max_ctx_tokens:
+             max_ctx_tokens = model.ctx
+         return max_ctx_tokens
+
+     def _build_count_messages(
+             self,
+             prompt: str,
+             system_prompt: str,
+             model: ModelItem,
+             history: Optional[List[CtxItem]] = None,
+     ) -> List[dict]:
+         """
+         Build a simple messages structure for local token estimation.
+
+         :param prompt: User prompt
+         :param system_prompt: System prompt/instruction
+         :param model: ModelItem
+         :param history: List of CtxItem for history
+         :return: List of message dicts with 'role' and 'content' keys
+         """
+         messages = []
+         if system_prompt:
+             messages.append({"role": "system", "content": system_prompt})
+
+         if self.window.core.config.get('use_context'):
+             used_tokens = self.window.core.tokens.from_user(prompt, system_prompt)
+             items = self.window.core.ctx.get_history(
+                 history,
+                 model.id,
+                 MODE_CHAT,
+                 used_tokens,
+                 self._fit_ctx(model),
+             )
+             for item in items:
+                 if item.final_input:
+                     messages.append({"role": "user", "content": str(item.final_input)})
+                 if item.final_output:
+                     messages.append({"role": "assistant", "content": str(item.final_output)})
+
+         messages.append({"role": "user", "content": str(prompt)})
+         return messages
+
+     def reset_tokens(self):
+         """Reset the input tokens counter."""
+         self.input_tokens = 0
+
+     def get_used_tokens(self) -> int:
+         """
+         Get the input tokens counter (estimated before sending).
+
+         :return: input tokens count
+         """
+         return self.input_tokens
+
+     @staticmethod
+     def _supports_tts(model_id: Optional[str]) -> bool:
+         """
+         Heuristic check whether the model supports native TTS.
+
+         - Official TTS models contain '-tts' in the id (e.g. 'gemini-2.5-flash-preview-tts').
+         - Future/preview names may contain 'native-audio'.
+
+         :param model_id: Model ID
+         :return: True if supports TTS, False otherwise
+         """
+         if not model_id:
+             return False
+         mid = model_id.lower()
+         return ("-tts" in mid) or ("native-audio" in mid)