opengradient 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
opengradient/client.py CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Union, Callable
  import firebase
  import numpy as np
  import requests
+ import httpx
  from eth_account.account import LocalAccount
  from web3 import Web3
  from web3.exceptions import ContractLogicError
@@ -17,7 +18,9 @@ import urllib.parse
  import asyncio
  from x402.clients.httpx import x402HttpxClient
  from x402.clients.base import decode_x_payment_response, x402Client
+ from x402.clients.httpx import x402HttpxClient

+ from .x402_auth import X402Auth
  from .exceptions import OpenGradientError
  from .proto import infer_pb2, infer_pb2_grpc
  from .types import (
@@ -29,17 +32,22 @@ from .types import (
  LlmInferenceMode,
  ModelOutput,
  TextGenerationOutput,
+ TextGenerationStream,
  SchedulerParams,
  InferenceResult,
  ModelRepository,
  FileUploadResult,
+ StreamChunk,
  )
  from .defaults import (
- DEFAULT_IMAGE_GEN_HOST,
- DEFAULT_IMAGE_GEN_PORT,
+ DEFAULT_IMAGE_GEN_HOST,
+ DEFAULT_IMAGE_GEN_PORT,
  DEFAULT_SCHEDULER_ADDRESS,
- DEFAULT_LLM_SERVER_URL,
- DEFAULT_OPENGRADIENT_LLM_SERVER_URL)
+ DEFAULT_LLM_SERVER_URL,
+ DEFAULT_OPENGRADIENT_LLM_SERVER_URL,
+ DEFAULT_OPENGRADIENT_LLM_STREAMING_SERVER_URL,
+ DEFAULT_NETWORK_FILTER,
+ )
  from .utils import convert_array_to_model_output, convert_to_model_input, convert_to_model_output

  _FIREBASE_CONFIG = {
@@ -65,6 +73,19 @@ PRECOMPILE_CONTRACT_ADDRESS = "0x00000000000000000000000000000000000000F4"
  X402_PROCESSING_HASH_HEADER = "x-processing-hash"
  X402_PLACEHOLDER_API_KEY = "0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"

+ TIMEOUT = httpx.Timeout(
+ timeout=90.0,
+ connect=15.0,
+ read=15.0,
+ write=30.0,
+ pool=10.0,
+ )
+ LIMITS = httpx.Limits(
+ max_keepalive_connections=100,
+ max_connections=500,
+ keepalive_expiry=60 * 20, # 20 minutes
+ )
+
  class Client:
  _inference_hub_contract_address: str
  _blockchain: Web3
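
Note on the hunk above: the new module-level TIMEOUT and LIMITS constants tune httpx connection behavior for the streaming client introduced later in this diff (they are passed to the x402 streaming httpx.AsyncClient). As a minimal sketch, assuming only the public httpx API and a placeholder base URL, this is the pattern they support:

    import httpx

    # Same values as the constants added in the diff.
    TIMEOUT = httpx.Timeout(timeout=90.0, connect=15.0, read=15.0, write=30.0, pool=10.0)
    LIMITS = httpx.Limits(max_keepalive_connections=100, max_connections=500, keepalive_expiry=60 * 20)

    async def fetch(path: str) -> httpx.Response:
        # A short-lived client is shown for brevity; a long-lived client would
        # reuse keep-alive connections governed by LIMITS across many requests.
        async with httpx.AsyncClient(base_url="https://example.invalid", timeout=TIMEOUT, limits=LIMITS) as client:
            return await client.get(path)
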
@@ -76,20 +97,22 @@ class Client:
  _precompile_abi: Dict
  _llm_server_url: str
  _external_api_keys: Dict[str, str]
+
  def __init__(
- self,
- private_key: str,
- rpc_url: str,
- api_url: str,
- contract_address: str,
- email: Optional[str] = None,
- password: Optional[str] = None,
+ self,
+ private_key: str,
+ rpc_url: str,
+ api_url: str,
+ contract_address: str,
+ email: Optional[str] = None,
+ password: Optional[str] = None,
  llm_server_url: Optional[str] = DEFAULT_LLM_SERVER_URL,
  og_llm_server_url: Optional[str] = DEFAULT_OPENGRADIENT_LLM_SERVER_URL,
+ og_llm_streaming_server_url: Optional[str] = DEFAULT_OPENGRADIENT_LLM_STREAMING_SERVER_URL,
  openai_api_key: Optional[str] = None,
  anthropic_api_key: Optional[str] = None,
  google_api_key: Optional[str] = None,
- ):
+ ):
  """
  Initialize the Client with private key, RPC URL, and contract address.

@@ -120,7 +143,8 @@ class Client:

  self._llm_server_url = llm_server_url
  self._og_llm_server_url = og_llm_server_url
-
+ self._og_llm_streaming_server_url = og_llm_streaming_server_url
+
  self._external_api_keys = {}
  if openai_api_key or os.getenv("OPENAI_API_KEY"):
  self._external_api_keys["openai"] = openai_api_key or os.getenv("OPENAI_API_KEY")
@@ -132,7 +156,7 @@ class Client:
  def set_api_key(self, provider: str, api_key: str):
  """
  Set or update API key for an external provider.
-
+
  Args:
  provider: Provider name (e.g., 'openai', 'anthropic', 'google')
  api_key: The API key for the provider
@@ -142,10 +166,10 @@ class Client:
  def _is_local_model(self, model_cid: str) -> bool:
  """
  Check if a model is hosted locally on OpenGradient.
-
+
  Args:
  model_cid: Model identifier
-
+
  Returns:
  True if model is local, False if it should use external provider
  """
@@ -158,7 +182,7 @@ class Client:
  def _get_provider_from_model(self, model: str) -> str:
  """Infer provider from model name."""
  model_lower = model.lower()
-
+
  if "gpt" in model_lower or model.startswith("openai/"):
  return "openai"
  elif "claude" in model_lower or model.startswith("anthropic/"):
@@ -173,10 +197,10 @@ class Client:
  def _get_api_key_for_model(self, model: str) -> Optional[str]:
  """
  Get the appropriate API key for a model.
-
+
  Args:
  model: Model identifier
-
+
  Returns:
  API key string or None
  """
@@ -418,11 +442,11 @@ class Client:

  return run_with_retry(execute_transaction, max_retries)

- def _og_payment_selector(self, accepts, network_filter=None, scheme_filter=None, max_value=None):
- """Custom payment selector for OpenGradient network (og-devnet)."""
+ def _og_payment_selector(self, accepts, network_filter=DEFAULT_NETWORK_FILTER, scheme_filter=None, max_value=None):
+ """Custom payment selector for OpenGradient network."""
  return x402Client.default_payment_requirements_selector(
  accepts,
- network_filter="og-devnet",
+ network_filter=network_filter,
  scheme_filter=scheme_filter,
  max_value=max_value,
  )
@@ -451,11 +475,17 @@ class Client:
  temperature (float): Temperature for LLM inference, between 0 and 1. Default is 0.0.
  max_retries (int, optional): Maximum number of retry attempts for blockchain transactions.
  local_model (bool, optional): Force use of local model even if not in LLM enum.
+ x402_settlement_mode (x402SettlementMode, optional): Settlement mode for x402 payments.
+ - SETTLE: Records input/output hashes only (most privacy-preserving).
+ - SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient).
+ - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
+ Defaults to SETTLE_BATCH.

  Returns:
  TextGenerationOutput: Generated text results including:
  - Transaction hash (or "external" for external providers)
  - String of completion output
+ - Payment hash for x402 transactions (when using x402 settlement)

  Raises:
  OpenGradientError: If the inference fails.
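
The expanded docstring above documents the new x402_settlement_mode knob and the payment hash returned on x402-settled calls. A hedged usage sketch, assuming the Client constructor arguments shown in this diff, a placeholder TEE model CID, and that x402SettlementMode is importable from opengradient.types (import path not shown in this diff):

    from opengradient.client import Client
    from opengradient.types import x402SettlementMode  # assumed import path for the enum

    client = Client(
        private_key="0x<redacted>",
        rpc_url="https://<your-rpc>",
        api_url="https://<your-api>",
        contract_address="0x<hub-address>",
    )

    result = client.llm_completion(
        model_cid="<TEE_LLM model CID>",  # placeholder; must be a member of the TEE_LLM enum
        prompt="Summarize x402 settlement modes in one sentence.",
        max_tokens=64,
        x402_settlement_mode=x402SettlementMode.SETTLE,  # hash-only, most privacy-preserving
    )
    print(result.completion_output)
    print(result.payment_hash)  # populated when the x402 payment path is used
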
@@ -467,14 +497,14 @@ class Client:
  return OpenGradientError("That model CID is not supported yet for TEE inference")

  return self._external_llm_completion(
- model=model_cid.split('/')[1],
+ model=model_cid.split("/")[1],
  prompt=prompt,
  max_tokens=max_tokens,
  stop_sequence=stop_sequence,
  temperature=temperature,
  x402_settlement_mode=x402_settlement_mode,
  )
-
+
  # Original local model logic
  def execute_transaction():
  if inference_mode != LlmInferenceMode.VANILLA:
@@ -482,10 +512,10 @@ class Client:

  if model_cid not in [llm.value for llm in LLM]:
  raise OpenGradientError("That model CID is not yet supported for inference")
-
+
  model_name = model_cid
  if model_cid in [llm.value for llm in TEE_LLM]:
- model_name = model_cid.split('/')[1]
+ model_name = model_cid.split("/")[1]

  contract = self._blockchain.eth.contract(address=self._inference_hub_contract_address, abi=self._inference_abi)

@@ -523,55 +553,49 @@ class Client:
  ) -> TextGenerationOutput:
  """
  Route completion request to external LLM server with x402 payments.
-
+
  Args:
  model: Model identifier
  prompt: Input prompt
  max_tokens: Maximum tokens to generate
  stop_sequence: Stop sequences
  temperature: Sampling temperature
-
+
  Returns:
  TextGenerationOutput with completion
-
+
  Raises:
  OpenGradientError: If request fails
  """
  api_key = self._get_api_key_for_model(model)
-
+
  if api_key:
  logging.debug("External LLM completions using API key")
  url = f"{self._llm_server_url}/v1/completions"
-
- headers = {
- "Content-Type": "application/json",
- "Authorization": f"Bearer {api_key}"
- }
-
+
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+
  payload = {
  "model": model,
  "prompt": prompt,
  "max_tokens": max_tokens,
  "temperature": temperature,
  }
-
+
  if stop_sequence:
  payload["stop"] = stop_sequence
-
+
  try:
  response = requests.post(url, json=payload, headers=headers, timeout=60)
  response.raise_for_status()
-
+
  result = response.json()
-
- return TextGenerationOutput(
- transaction_hash="external",
- completion_output=result.get("completion")
- )
-
+
+ return TextGenerationOutput(transaction_hash="external", completion_output=result.get("completion"))
+
  except requests.RequestException as e:
  error_msg = f"External LLM completion failed: {str(e)}"
- if hasattr(e, 'response') and e.response is not None:
+ if hasattr(e, "response") and e.response is not None:
  try:
  error_detail = e.response.json()
  error_msg += f" - {error_detail}"
@@ -591,20 +615,20 @@ class Client:
  "Authorization": f"Bearer {X402_PLACEHOLDER_API_KEY}",
  "X-SETTLEMENT-TYPE": x402_settlement_mode,
  }
-
+
  payload = {
  "model": model,
  "prompt": prompt,
  "max_tokens": max_tokens,
  "temperature": temperature,
  }
-
+
  if stop_sequence:
  payload["stop"] = stop_sequence
-
+
  try:
  response = await client.post("/v1/completions", json=payload, headers=headers, timeout=60)
-
+
  # Read the response content
  content = await response.aread()
  result = json.loads(content.decode())
@@ -612,24 +636,22 @@ class Client:

  if X402_PROCESSING_HASH_HEADER in response.headers:
  payment_hash = response.headers[X402_PROCESSING_HASH_HEADER]
-
+
  return TextGenerationOutput(
- transaction_hash="external",
- completion_output=result.get("completion"),
- payment_hash=payment_hash
+ transaction_hash="external", completion_output=result.get("completion"), payment_hash=payment_hash
  )
-
+
  except Exception as e:
  error_msg = f"External LLM completion request failed: {str(e)}"
  logging.error(error_msg)
  raise OpenGradientError(error_msg)
-
+
  try:
  # Run the async function in a sync context
  return asyncio.run(make_request())
  except Exception as e:
  error_msg = f"External LLM completion failed: {str(e)}"
- if hasattr(e, 'response') and e.response is not None:
+ if hasattr(e, "response") and e.response is not None:
  try:
  error_detail = e.response.json()
  error_msg += f" - {error_detail}"
@@ -651,7 +673,8 @@ class Client:
  max_retries: Optional[int] = None,
  local_model: Optional[bool] = False,
  x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
- ) -> TextGenerationOutput:
+ stream: bool = False,
+ ) -> Union[TextGenerationOutput, TextGenerationStream]:
  """
  Perform inference on an LLM model using chat.

@@ -666,9 +689,17 @@ class Client:
  tool_choice (str, optional): Sets a specific tool to choose.
  max_retries (int, optional): Maximum number of retry attempts.
  local_model (bool, optional): Force use of local model.
+ x402_settlement_mode (x402SettlementMode, optional): Settlement mode for x402 payments.
+ - SETTLE: Records input/output hashes only (most privacy-preserving).
+ - SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient).
+ - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
+ Defaults to SETTLE_BATCH.
+ stream (bool, optional): Whether to stream the response. Default is False.

  Returns:
- TextGenerationOutput: Generated text results.
+ Union[TextGenerationOutput, TextGenerationStream]:
+ - If stream=False: TextGenerationOutput with chat_output, transaction_hash, finish_reason, and payment_hash
+ - If stream=True: TextGenerationStream yielding StreamChunk objects with typed deltas (true streaming via threading)

  Raises:
  OpenGradientError: If the inference fails.
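
With the new stream flag documented above, llm_chat can return a TextGenerationStream of StreamChunk objects instead of a single TextGenerationOutput. A hedged sketch of consuming it, assuming placeholder credentials and model CID; the exact StreamChunk fields are defined in opengradient.types and are not spelled out in this diff:

    from opengradient.client import Client

    client = Client(
        private_key="0x<redacted>",
        rpc_url="https://<your-rpc>",
        api_url="https://<your-api>",
        contract_address="0x<hub-address>",
    )

    stream = client.llm_chat(
        model_cid="<TEE_LLM model CID>",  # placeholder; must be a supported TEE model
        messages=[{"role": "user", "content": "Stream a short haiku."}],
        max_tokens=64,
        stream=True,
    )

    for chunk in stream:  # StreamChunk objects arrive as the server emits SSE events
        print(chunk)      # see opengradient.types.StreamChunk for the typed delta fields
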
@@ -679,28 +710,45 @@ class Client:
  if model_cid not in TEE_LLM:
  return OpenGradientError("That model CID is not supported yet for TEE inference")

- return self._external_llm_chat(
- model=model_cid.split('/')[1],
- messages=messages,
- max_tokens=max_tokens,
- stop_sequence=stop_sequence,
- temperature=temperature,
- tools=tools,
- tool_choice=tool_choice,
- x402_settlement_mode=x402_settlement_mode,
- )
-
+ if stream:
+ # Use threading bridge for true sync streaming
+ return self._external_llm_chat_stream_sync(
+ model=model_cid.split("/")[1],
+ messages=messages,
+ max_tokens=max_tokens,
+ stop_sequence=stop_sequence,
+ temperature=temperature,
+ tools=tools,
+ tool_choice=tool_choice,
+ x402_settlement_mode=x402_settlement_mode,
+ use_tee=True,
+ )
+ else:
+ # Non-streaming
+ return self._external_llm_chat(
+ model=model_cid.split("/")[1],
+ messages=messages,
+ max_tokens=max_tokens,
+ stop_sequence=stop_sequence,
+ temperature=temperature,
+ tools=tools,
+ tool_choice=tool_choice,
+ x402_settlement_mode=x402_settlement_mode,
+ stream=False,
+ use_tee=True,
+ )
+
  # Original local model logic
  def execute_transaction():
  if inference_mode != LlmInferenceMode.VANILLA:
  raise OpenGradientError("Invalid inference mode %s: Inference mode must be VANILLA or TEE" % inference_mode)
-
+
  if model_cid not in [llm.value for llm in LLM]:
  raise OpenGradientError("That model CID is not yet supported for inference")
-
+
  model_name = model_cid
  if model_cid in [llm.value for llm in TEE_LLM]:
- model_name = model_cid.split('/')[1]
+ model_name = model_cid.split("/")[1]

  contract = self._blockchain.eth.contract(address=self._inference_hub_contract_address, abi=self._inference_abi)

@@ -768,10 +816,12 @@ class Client:
  tools: Optional[List[Dict]] = None,
  tool_choice: Optional[str] = None,
  x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
- ) -> TextGenerationOutput:
+ stream: bool = False,
+ use_tee: bool = False,
+ ) -> Union[TextGenerationOutput, TextGenerationStream]:
  """
  Route chat request to external LLM server with x402 payments.
-
+
  Args:
  model: Model identifier
  messages: List of chat messages
@@ -780,53 +830,63 @@ class Client:
  temperature: Sampling temperature
  tools: Function calling tools
  tool_choice: Tool selection strategy
-
+ stream: Whether to stream the response
+ use_tee: Whether to use TEE
+
  Returns:
- TextGenerationOutput with chat completion
-
+ Union[TextGenerationOutput, TextGenerationStream]: Chat completion or TextGenerationStream
+
  Raises:
  OpenGradientError: If request fails
  """
- api_key = self._get_api_key_for_model(model)
-
+ api_key = None if use_tee else self._get_api_key_for_model(model)
+
  if api_key:
- logging.debug("External LLM completion using API key")
- url = f"{self._llm_server_url}/v1/chat/completions"
-
- headers = {
- "Content-Type": "application/json",
- "Authorization": f"Bearer {api_key}"
- }
+ logging.debug("External LLM chat using API key")

+ if stream:
+ url = f"{self._llm_server_url}/v1/chat/completions/stream"
+ else:
+ url = f"{self._llm_server_url}/v1/chat/completions"
+
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+
  payload = {
  "model": model,
  "messages": messages,
  "max_tokens": max_tokens,
  "temperature": temperature,
  }
-
+
  if stop_sequence:
  payload["stop"] = stop_sequence
-
+
  if tools:
  payload["tools"] = tools
  payload["tool_choice"] = tool_choice or "auto"
-
+
  try:
- response = requests.post(url, json=payload, headers=headers, timeout=60)
- response.raise_for_status()
-
- result = response.json()
-
- return TextGenerationOutput(
- transaction_hash="external",
- finish_reason=result.get("finish_reason"),
- chat_output=result.get("message")
- )
-
+ if stream:
+ # Return streaming response wrapped in TextGenerationStream
+ response = requests.post(url, json=payload, headers=headers, timeout=60, stream=True)
+ response.raise_for_status()
+ return TextGenerationStream(_iterator=response.iter_lines(decode_unicode=True), _is_async=False)
+ else:
+ # Non-streaming response
+ response = requests.post(url, json=payload, headers=headers, timeout=60)
+ response.raise_for_status()
+
+ result = response.json()
+
+ return TextGenerationOutput(
+ transaction_hash="external",
+ finish_reason=result.get("finish_reason"),
+ chat_output=result.get("message")
+ )
+
  except requests.RequestException as e:
  error_msg = f"External LLM chat failed: {str(e)}"
- if hasattr(e, 'response') and e.response is not None:
+ if hasattr(e, "response") and e.response is not None:
  try:
  error_detail = e.response.json()
  error_msg += f" - {error_detail}"
@@ -835,6 +895,7 @@ class Client:
  logging.error(error_msg)
  raise OpenGradientError(error_msg)

+ # x402 payment path - non-streaming only here
  async def make_request():
  async with x402HttpxClient(
  account=self._wallet_account,
@@ -844,58 +905,58 @@ class Client:
  headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {X402_PLACEHOLDER_API_KEY}",
- "X-SETTLEMENT-TYPE": x402_settlement_mode
+ "X-SETTLEMENT-TYPE": x402_settlement_mode,
  }
-
+
  payload = {
  "model": model,
  "messages": messages,
  "max_tokens": max_tokens,
  "temperature": temperature,
  }
-
+
  if stop_sequence:
  payload["stop"] = stop_sequence
-
+
  if tools:
  payload["tools"] = tools
  payload["tool_choice"] = tool_choice or "auto"
-
+
  try:
- response = await client.post("/v1/chat/completions", json=payload, headers=headers, timeout=60)
-
+ # Non-streaming with x402
+ endpoint = "/v1/chat/completions"
+ response = await client.post(endpoint, json=payload, headers=headers, timeout=60)
+
  # Read the response content
  content = await response.aread()
  result = json.loads(content.decode())
- # print(f"Response: {response}")
- # print(f"Response Headers: {response.headers}")

  payment_hash = ""
  if X402_PROCESSING_HASH_HEADER in response.headers:
  payment_hash = response.headers[X402_PROCESSING_HASH_HEADER]
-
+
  choices = result.get("choices")
  if not choices:
  raise OpenGradientError(f"Invalid response: 'choices' missing or empty in {result}")
-
+
  return TextGenerationOutput(
  transaction_hash="external",
  finish_reason=choices[0].get("finish_reason"),
  chat_output=choices[0].get("message"),
- payment_hash=payment_hash
+ payment_hash=payment_hash,
  )
-
+
  except Exception as e:
  error_msg = f"External LLM chat request failed: {str(e)}"
  logging.error(error_msg)
  raise OpenGradientError(error_msg)
-
+
  try:
  # Run the async function in a sync context
  return asyncio.run(make_request())
  except Exception as e:
  error_msg = f"External LLM chat failed: {str(e)}"
- if hasattr(e, 'response') and e.response is not None:
+ if hasattr(e, "response") and e.response is not None:
  try:
  error_detail = e.response.json()
  error_msg += f" - {error_detail}"
@@ -904,6 +965,234 @@ class Client:
  logging.error(error_msg)
  raise OpenGradientError(error_msg)

+ def _external_llm_chat_stream_sync(
+ self,
+ model: str,
+ messages: List[Dict],
+ max_tokens: int = 100,
+ stop_sequence: Optional[List[str]] = None,
+ temperature: float = 0.0,
+ tools: Optional[List[Dict]] = None,
+ tool_choice: Optional[str] = None,
+ x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+ use_tee: bool = False,
+ ):
+ """
+ Sync streaming using threading bridge - TRUE real-time streaming.
+
+ Yields StreamChunk objects as they arrive from the background thread.
+ NO buffering, NO conversion, just direct pass-through.
+ """
+ import threading
+ from queue import Queue
+
+ queue = Queue()
+ exception_holder = []
+
+ def _run_async():
+ """Run async streaming in background thread"""
+ loop = None
+ try:
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+
+ async def _stream():
+ try:
+ async for chunk in self._external_llm_chat_stream_async(
+ model=model,
+ messages=messages,
+ max_tokens=max_tokens,
+ stop_sequence=stop_sequence,
+ temperature=temperature,
+ tools=tools,
+ tool_choice=tool_choice,
+ x402_settlement_mode=x402_settlement_mode,
+ use_tee=use_tee,
+ ):
+ queue.put(chunk) # Put chunk immediately
+ except Exception as e:
+ exception_holder.append(e)
+ finally:
+ queue.put(None) # Signal completion
+
+ loop.run_until_complete(_stream())
+ except Exception as e:
+ exception_holder.append(e)
+ queue.put(None)
+ finally:
+ if loop:
+ try:
+ pending = asyncio.all_tasks(loop)
+ for task in pending:
+ task.cancel()
+ loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+ finally:
+ loop.close()
+
+ # Start background thread
+ thread = threading.Thread(target=_run_async, daemon=True)
+ thread.start()
+
+ # Yield chunks DIRECTLY as they arrive - NO buffering
+ try:
+ while True:
+ chunk = queue.get() # Blocks until chunk available
+ if chunk is None:
+ break
+ yield chunk # Yield immediately!
+
+ thread.join(timeout=5)
+
+ if exception_holder:
+ raise exception_holder[0]
+ except Exception as e:
+ thread.join(timeout=1)
+ raise
+
+
+ async def _external_llm_chat_stream_async(
+ self,
+ model: str,
+ messages: List[Dict],
+ max_tokens: int = 100,
+ stop_sequence: Optional[List[str]] = None,
+ temperature: float = 0.0,
+ tools: Optional[List[Dict]] = None,
+ tool_choice: Optional[str] = None,
+ x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+ use_tee: bool = False,
+ ):
+ """
+ Internal async streaming implementation.
+
+ Yields StreamChunk objects as they arrive from the server.
+ """
+ api_key = None if use_tee else self._get_api_key_for_model(model)
+
+ if api_key:
+ # API key path - streaming to local llm-server
+ url = f"{self._og_llm_streaming_server_url}/v1/chat/completions"
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {api_key}"
+ }
+
+ payload = {
+ "model": model,
+ "messages": messages,
+ "max_tokens": max_tokens,
+ "temperature": temperature,
+ "stream": True,
+ }
+
+ if stop_sequence:
+ payload["stop"] = stop_sequence
+ if tools:
+ payload["tools"] = tools
+ payload["tool_choice"] = tool_choice or "auto"
+
+ async with httpx.AsyncClient(verify=False, timeout=None) as client:
+ async with client.stream("POST", url, json=payload, headers=headers) as response:
+ buffer = b""
+ async for chunk in response.aiter_raw():
+ if not chunk:
+ continue
+
+ buffer += chunk
+
+ # Process all complete lines in buffer
+ while b"\n" in buffer:
+ line_bytes, buffer = buffer.split(b"\n", 1)
+
+ if not line_bytes.strip():
+ continue
+
+ try:
+ line = line_bytes.decode('utf-8').strip()
+ except UnicodeDecodeError:
+ continue
+
+ if not line.startswith("data: "):
+ continue
+
+ data_str = line[6:] # Strip "data: " prefix
+ if data_str.strip() == "[DONE]":
+ return
+
+ try:
+ data = json.loads(data_str)
+ yield StreamChunk.from_sse_data(data)
+ except json.JSONDecodeError:
+ continue
+ else:
+ # x402 payment path
+ async with httpx.AsyncClient(
+ base_url=self._og_llm_streaming_server_url,
+ headers={"Authorization": f"Bearer {X402_PLACEHOLDER_API_KEY}"},
+ timeout=TIMEOUT,
+ limits=LIMITS,
+ http2=False,
+ follow_redirects=False,
+ auth=X402Auth(account=self._wallet_account), # type: ignore
+ ) as client:
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {X402_PLACEHOLDER_API_KEY}",
+ "X-SETTLEMENT-TYPE": x402_settlement_mode,
+ }
+
+ payload = {
+ "model": model,
+ "messages": messages,
+ "max_tokens": max_tokens,
+ "temperature": temperature,
+ "stream": True,
+ }
+
+ if stop_sequence:
+ payload["stop"] = stop_sequence
+ if tools:
+ payload["tools"] = tools
+ payload["tool_choice"] = tool_choice or "auto"
+
+ async with client.stream(
+ "POST",
+ "/v1/chat/completions",
+ json=payload,
+ headers=headers,
+ ) as response:
+ buffer = b""
+ async for chunk in response.aiter_raw():
+ if not chunk:
+ continue
+
+ buffer += chunk
+
+ # Process complete lines from buffer
+ while b"\n" in buffer:
+ line_bytes, buffer = buffer.split(b"\n", 1)
+
+ if not line_bytes.strip():
+ continue
+
+ try:
+ line = line_bytes.decode('utf-8').strip()
+ except UnicodeDecodeError:
+ continue
+
+ if not line.startswith("data: "):
+ continue
+
+ data_str = line[6:]
+ if data_str.strip() == "[DONE]":
+ return
+
+ try:
+ data = json.loads(data_str)
+ yield StreamChunk.from_sse_data(data)
+ except json.JSONDecodeError:
+ continue
+
  def list_files(self, model_name: str, version: str) -> List[Dict]:
  """
  List files for a specific version of a model.
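
The new _external_llm_chat_stream_sync method above bridges the async SSE stream to a synchronous generator by running an event loop in a daemon thread and handing each chunk across a Queue. A minimal, self-contained sketch of that pattern, with a toy async producer standing in for the real SSE stream:

    import asyncio
    import threading
    from queue import Queue

    async def produce():
        for i in range(3):
            await asyncio.sleep(0.1)      # stands in for awaiting an SSE chunk
            yield f"chunk-{i}"

    def consume_sync():
        queue: Queue = Queue()

        def run():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

            async def pump():
                async for item in produce():
                    queue.put(item)       # hand each item to the sync side immediately
                queue.put(None)           # sentinel: stream finished

            try:
                loop.run_until_complete(pump())
            finally:
                loop.close()

        threading.Thread(target=run, daemon=True).start()
        while (item := queue.get()) is not None:
            yield item

    for chunk in consume_sync():
        print(chunk)
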
@@ -1104,12 +1393,12 @@ class Client:
  except ContractLogicError as e:
  try:
  run_function.call({"from": self._wallet_account.address})
-
+
  except ContractLogicError as call_err:
  raise ContractLogicError(f"simulation failed with revert reason: {call_err.args[0]}")
-
+
  raise ContractLogicError(f"simulation failed with no revert reason. Reason: {e}")
-
+
  gas_limit = int(estimated_gas * 3)

  transaction = run_function.build_transaction(
@@ -1128,10 +1417,10 @@ class Client:
  if tx_receipt["status"] == 0:
  try:
  run_function.call({"from": self._wallet_account.address})
-
+
  except ContractLogicError as call_err:
  raise ContractLogicError(f"Transaction failed with revert reason: {call_err.args[0]}")
-
+
  raise ContractLogicError(f"Transaction failed with no revert reason. Receipt: {tx_receipt}")

  return tx_hash, tx_receipt
@@ -1346,45 +1635,42 @@ class Client:
  results = contract.functions.getLastInferenceResults(num_results).call()
  return [convert_array_to_model_output(result) for result in results]

-
  def _get_inference_result_from_node(self, inference_id: str, inference_mode: InferenceMode) -> Dict:
  """
  Get the inference result from node.
-
+
  Args:
  inference_id (str): Inference id for a inference request
-
+
  Returns:
  Dict: The inference result as returned by the node
-
+
  Raises:
  OpenGradientError: If the request fails or returns an error
  """
  try:
- encoded_id = urllib.parse.quote(inference_id, safe='')
+ encoded_id = urllib.parse.quote(inference_id, safe="")
  url = f"{self._api_url}/artela-network/artela-rollkit/inference/tx/{encoded_id}"
-
+
  response = requests.get(url)
  if response.status_code == 200:
  resp = response.json()
  inference_result = resp.get("inference_results", {})
  if inference_result:
  decoded_bytes = base64.b64decode(inference_result[0])
- decoded_string = decoded_bytes.decode('utf-8')
- output = json.loads(decoded_string).get("InferenceResult",{})
+ decoded_string = decoded_bytes.decode("utf-8")
+ output = json.loads(decoded_string).get("InferenceResult", {})
  if output is None:
  raise OpenGradientError("Missing InferenceResult in inference output")
-
+
  match inference_mode:
  case InferenceMode.VANILLA:
  if "VanillaResult" not in output:
  raise OpenGradientError("Missing VanillaResult in inference output")
  if "model_output" not in output["VanillaResult"]:
  raise OpenGradientError("Missing model_output in VanillaResult")
- return {
- "output": output["VanillaResult"]["model_output"]
- }
-
+ return {"output": output["VanillaResult"]["model_output"]}
+
  case InferenceMode.TEE:
  if "TeeNodeResult" not in output:
  raise OpenGradientError("Missing TeeNodeResult in inference output")
@@ -1393,34 +1679,30 @@ class Client:
  if "VanillaResponse" in output["TeeNodeResult"]["Response"]:
  if "model_output" not in output["TeeNodeResult"]["Response"]["VanillaResponse"]:
  raise OpenGradientError("Missing model_output in VanillaResponse")
- return {
- "output": output["TeeNodeResult"]["Response"]["VanillaResponse"]["model_output"]
- }
-
+ return {"output": output["TeeNodeResult"]["Response"]["VanillaResponse"]["model_output"]}
+
  else:
  raise OpenGradientError("Missing VanillaResponse in TeeNodeResult Response")
-
+
  case InferenceMode.ZKML:
  if "ZkmlResult" not in output:
  raise OpenGradientError("Missing ZkmlResult in inference output")
  if "model_output" not in output["ZkmlResult"]:
  raise OpenGradientError("Missing model_output in ZkmlResult")
- return {
- "output": output["ZkmlResult"]["model_output"]
- }
-
+ return {"output": output["ZkmlResult"]["model_output"]}
+
  case _:
  raise OpenGradientError(f"Invalid inference mode: {inference_mode}")
  else:
  return None
-
+
  else:
  error_message = f"Failed to get inference result: HTTP {response.status_code}"
  if response.text:
  error_message += f" - {response.text}"
  logging.error(error_message)
  raise OpenGradientError(error_message)
-
+
  except requests.RequestException as e:
  logging.error(f"Request exception when getting inference result: {str(e)}")
  raise OpenGradientError(f"Failed to get inference result: {str(e)}")
@@ -1428,6 +1710,7 @@ class Client:
  logging.error(f"Unexpected error when getting inference result: {str(e)}", exc_info=True)
  raise OpenGradientError(f"Failed to get inference result: {str(e)}")

+
  def run_with_retry(txn_function: Callable, max_retries=DEFAULT_MAX_RETRY, retry_delay=DEFAULT_RETRY_DELAY_SEC):
  """
  Execute a blockchain transaction with retry logic.