opengradient-0.5.8-py3-none-any.whl → opengradient-0.5.9-py3-none-any.whl

opengradient/__init__.py CHANGED
@@ -17,6 +17,7 @@ from .types import (
     InferenceResult,
     LlmInferenceMode,
     TextGenerationOutput,
+    TextGenerationStream,
     ModelOutput,
     ModelRepository,
     FileUploadResult,
@@ -225,7 +226,8 @@ def llm_chat(
     tool_choice: Optional[str] = None,
     max_retries: Optional[int] = None,
     x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
-) -> TextGenerationOutput:
+    stream: Optional[bool] = False,
+) -> Union[TextGenerationOutput, TextGenerationStream]:
     """Have a chat conversation with an LLM.
 
     Args:
@@ -239,9 +241,10 @@ def llm_chat(
         tool_choice: Optional specific tool to use
         max_retries: Maximum number of retries for failed transactions
         x402_settlement_mode: Settlement modes for x402 payment protocol transactions (enum x402SettlementMode)
+        stream: Optional boolean to enable streaming
 
     Returns:
-        TextGenerationOutput
+        TextGenerationOutput or TextGenerationStream
 
     Raises:
         RuntimeError: If SDK is not initialized
@@ -258,7 +261,8 @@ def llm_chat(
         tools=tools,
         tool_choice=tool_choice,
         max_retries=max_retries,
-        x402_settlement_mode=x402_settlement_mode
+        x402_settlement_mode=x402_settlement_mode,
+        stream=stream,
    )
 
 
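Taken together, these changes mean the module-level `llm_chat` can now return either type. A minimal sketch of consuming the new flag (the model name and message are illustrative, and this assumes the SDK has already been initialized via `og.init` with valid credentials):

```python
import opengradient as og

# Assumes og.init(...) has been called with valid credentials.
# With stream=True, llm_chat returns a TextGenerationStream
# rather than a TextGenerationOutput.
stream = og.llm_chat(
    model_cid="anthropic/claude-3.5-haiku",
    messages=[{"role": "user", "content": "How are clouds formed?"}],
    max_tokens=250,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
    if chunk.is_final and chunk.usage:
        print(f"\nTotal tokens: {chunk.usage.total_tokens}")
```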
opengradient/cli.py CHANGED
@@ -6,6 +6,7 @@ import logging
 import webbrowser
 from pathlib import Path
 from typing import Dict, List, Optional
+import sys
 
 import click
 
@@ -557,6 +558,7 @@ def print_llm_completion_result(model_cid, tx_hash, llm_output, is_local=True):
     default="settle-batch",
     help="Settlement mode for x402 payments: settle (hashes only), settle-batch (batched, default), settle-metadata (full data)",
 )
+@click.option("--stream", is_flag=True, default=False, help="Stream the output from the LLM")
 @click.pass_context
 def chat(
     ctx,
@@ -572,6 +574,7 @@ def chat(
     tool_choice: Optional[str],
     x402_settlement_mode: Optional[str],
     local: bool,
+    stream: bool,
 ):
     """
     Run chat inference on an LLM model (local or external).
@@ -590,6 +593,9 @@ def chat(
 
     # External Anthropic model
     opengradient chat --model claude-haiku-4-5-20251001 --messages '[{"role":"user","content":"Write a poem"}]' --max-tokens 100
+
+    # Stream output
+    opengradient chat --model anthropic/claude-3.5-haiku --messages '[{"role":"user","content":"How are clouds formed?"}]' --max-tokens 250 --stream
     """
     client: Client = ctx.obj["client"]
 
@@ -656,7 +662,7 @@
         if not tools and not tools_file:
             parsed_tools = None
 
-        completion_output = client.llm_chat(
+        result = client.llm_chat(
             model_cid=model_cid,
             inference_mode=LlmInferenceModes[inference_mode],
             messages=messages,
@@ -667,11 +673,16 @@
             tool_choice=tool_choice,
             local_model=local,
             x402_settlement_mode=x402_settlement_mode,
+            stream=stream,
         )
 
-        print_llm_chat_result(
-            model_cid, completion_output.transaction_hash, completion_output.finish_reason, completion_output.chat_output, is_local
-        )
+        # Handle response based on streaming flag
+        if stream:
+            print_streaming_chat_result(model_cid, result, is_local)
+        else:
+            print_llm_chat_result(
+                model_cid, result.transaction_hash, result.finish_reason, result.chat_output, is_local
+            )
 
     except Exception as e:
         click.echo(f"Error running LLM chat inference: {str(e)}")
@@ -706,6 +717,80 @@ def print_llm_chat_result(model_cid, tx_hash, finish_reason, chat_output, is_loc
     click.echo()
 
 
+def print_streaming_chat_result(model_cid, stream, is_local=True):
+    """Handle streaming chat response with typed chunks - prints in real-time"""
+    click.secho("🌊 Streaming LLM Chat", fg="green", bold=True)
+    click.echo("──────────────────────────────────────")
+    click.echo("Model: ", nl=False)
+    click.secho(model_cid, fg="cyan", bold=True)
+
+    if is_local:
+        click.echo("Source: ", nl=False)
+        click.secho("OpenGradient TEE", fg="cyan", bold=True)
+    else:
+        click.echo("Source: ", nl=False)
+        click.secho("External Provider", fg="cyan", bold=True)
+
+    click.echo("──────────────────────────────────────")
+    click.secho("Response:", fg="yellow", bold=True)
+    click.echo()
+
+    try:
+        content_parts = []
+        chunk_count = 0
+
+        for chunk in stream:
+            chunk_count += 1
+
+            if chunk.choices[0].delta.content:
+                content = chunk.choices[0].delta.content
+                sys.stdout.write(content)
+                sys.stdout.flush()
+                content_parts.append(content)
+
+            # Handle tool calls
+            if chunk.choices[0].delta.tool_calls:
+                sys.stdout.write("\n")
+                sys.stdout.flush()
+                click.secho("Tool Calls:", fg="yellow", bold=True)
+                for tool_call in chunk.choices[0].delta.tool_calls:
+                    click.echo(f"  Function: {tool_call['function']['name']}")
+                    click.echo(f"  Arguments: {tool_call['function']['arguments']}")
+
+            # Print final info when stream completes
+            if chunk.is_final:
+                sys.stdout.write("\n\n")
+                sys.stdout.flush()
+                click.echo("──────────────────────────────────────")
+
+                if chunk.usage:
+                    click.secho("Token Usage:", fg="cyan")
+                    click.echo(f"  Prompt tokens: {chunk.usage.prompt_tokens}")
+                    click.echo(f"  Completion tokens: {chunk.usage.completion_tokens}")
+                    click.echo(f"  Total tokens: {chunk.usage.total_tokens}")
+                    click.echo()
+
+                if chunk.choices[0].finish_reason:
+                    click.echo("Finish reason: ", nl=False)
+                    click.secho(chunk.choices[0].finish_reason, fg="green")
+
+        click.echo("──────────────────────────────────────")
+        click.echo(f"Chunks received: {chunk_count}")
+        click.echo(f"Content length: {len(''.join(content_parts))} characters")
+        click.echo()
+
+    except KeyboardInterrupt:
+        sys.stdout.write("\n")
+        sys.stdout.flush()
+        click.secho("Stream interrupted by user", fg="yellow")
+        click.echo()
+    except Exception as e:
+        sys.stdout.write("\n")
+        sys.stdout.flush()
+        click.secho(f"Streaming error: {str(e)}", fg="red", bold=True)
+        click.echo()
+
+
 @cli.command()
 def create_account():
     """Create a new test account for OpenGradient inference and model management"""
opengradient/client.py CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Union, Callable
 import firebase
 import numpy as np
 import requests
+import httpx
 from eth_account.account import LocalAccount
 from web3 import Web3
 from web3.exceptions import ContractLogicError
@@ -17,7 +18,9 @@ import urllib.parse
 import asyncio
 from x402.clients.httpx import x402HttpxClient
 from x402.clients.base import decode_x_payment_response, x402Client
+from x402.clients.httpx import x402HttpxClient
 
+from .x402_auth import X402Auth
 from .exceptions import OpenGradientError
 from .proto import infer_pb2, infer_pb2_grpc
 from .types import (
@@ -29,10 +32,12 @@ from .types import (
     LlmInferenceMode,
     ModelOutput,
     TextGenerationOutput,
+    TextGenerationStream,
     SchedulerParams,
     InferenceResult,
     ModelRepository,
     FileUploadResult,
+    StreamChunk,
 )
 from .defaults import (
@@ -40,6 +45,8 @@ from .defaults import (
     DEFAULT_SCHEDULER_ADDRESS,
     DEFAULT_LLM_SERVER_URL,
     DEFAULT_OPENGRADIENT_LLM_SERVER_URL,
+    DEFAULT_OPENGRADIENT_LLM_STREAMING_SERVER_URL,
+    DEFAULT_NETWORK_FILTER,
 )
 from .utils import convert_array_to_model_output, convert_to_model_input, convert_to_model_output
 
@@ -66,6 +73,18 @@ PRECOMPILE_CONTRACT_ADDRESS = "0x00000000000000000000000000000000000000F4"
 X402_PROCESSING_HASH_HEADER = "x-processing-hash"
 X402_PLACEHOLDER_API_KEY = "0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"
 
+TIMEOUT = httpx.Timeout(
+    timeout=90.0,
+    connect=15.0,
+    read=15.0,
+    write=30.0,
+    pool=10.0,
+)
+LIMITS = httpx.Limits(
+    max_keepalive_connections=100,
+    max_connections=500,
+    keepalive_expiry=60 * 20,  # 20 minutes
+)
 
 class Client:
     _inference_hub_contract_address: str
@@ -89,6 +108,7 @@ class Client:
         password: Optional[str] = None,
         llm_server_url: Optional[str] = DEFAULT_LLM_SERVER_URL,
         og_llm_server_url: Optional[str] = DEFAULT_OPENGRADIENT_LLM_SERVER_URL,
+        og_llm_streaming_server_url: Optional[str] = DEFAULT_OPENGRADIENT_LLM_STREAMING_SERVER_URL,
         openai_api_key: Optional[str] = None,
         anthropic_api_key: Optional[str] = None,
         google_api_key: Optional[str] = None,
@@ -123,6 +143,7 @@ class Client:
 
         self._llm_server_url = llm_server_url
         self._og_llm_server_url = og_llm_server_url
+        self._og_llm_streaming_server_url = og_llm_streaming_server_url
 
         self._external_api_keys = {}
         if openai_api_key or os.getenv("OPENAI_API_KEY"):
@@ -421,11 +442,11 @@
 
         return run_with_retry(execute_transaction, max_retries)
 
-    def _og_payment_selector(self, accepts, network_filter=None, scheme_filter=None, max_value=None):
-        """Custom payment selector for OpenGradient network (og-devnet)."""
+    def _og_payment_selector(self, accepts, network_filter=DEFAULT_NETWORK_FILTER, scheme_filter=None, max_value=None):
+        """Custom payment selector for OpenGradient network."""
         return x402Client.default_payment_requirements_selector(
             accepts,
-            network_filter="og-devnet",
+            network_filter=network_filter,
             scheme_filter=scheme_filter,
             max_value=max_value,
         )
@@ -652,7 +673,8 @@
         max_retries: Optional[int] = None,
         local_model: Optional[bool] = False,
         x402_settlement_mode: Optional[x402SettlementMode] = x402SettlementMode.SETTLE_BATCH,
-    ) -> TextGenerationOutput:
+        stream: bool = False,
+    ) -> Union[TextGenerationOutput, TextGenerationStream]:
        """
        Perform inference on an LLM model using chat.
 
@@ -672,13 +694,12 @@
             - SETTLE_BATCH: Aggregates multiple inferences into batch hashes (most cost-efficient).
             - SETTLE_METADATA: Records full model info, complete input/output data, and all metadata.
             Defaults to SETTLE_BATCH.
+            stream (bool, optional): Whether to stream the response. Default is False.
 
         Returns:
-            TextGenerationOutput: Generated text results including:
-            - chat_output: Dict with role, content, and tool_calls
-            - transaction_hash: Blockchain hash (or "external" for external providers)
-            - finish_reason: Reason for completion (e.g., "stop", "tool_call")
-            - payment_hash: Payment hash for x402 transactions (when using x402 settlement)
+            Union[TextGenerationOutput, TextGenerationStream]:
+            - If stream=False: TextGenerationOutput with chat_output, transaction_hash, finish_reason, and payment_hash
+            - If stream=True: TextGenerationStream yielding StreamChunk objects with typed deltas (true streaming via threading)
 
         Raises:
             OpenGradientError: If the inference fails.
@@ -689,16 +710,33 @@
             if model_cid not in TEE_LLM:
                 return OpenGradientError("That model CID is not supported yet for TEE inference")
 
-            return self._external_llm_chat(
-                model=model_cid.split("/")[1],
-                messages=messages,
-                max_tokens=max_tokens,
-                stop_sequence=stop_sequence,
-                temperature=temperature,
-                tools=tools,
-                tool_choice=tool_choice,
-                x402_settlement_mode=x402_settlement_mode,
-            )
+            if stream:
+                # Use threading bridge for true sync streaming
+                return self._external_llm_chat_stream_sync(
+                    model=model_cid.split("/")[1],
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    stop_sequence=stop_sequence,
+                    temperature=temperature,
+                    tools=tools,
+                    tool_choice=tool_choice,
+                    x402_settlement_mode=x402_settlement_mode,
+                    use_tee=True,
+                )
+            else:
+                # Non-streaming
+                return self._external_llm_chat(
+                    model=model_cid.split("/")[1],
+                    messages=messages,
+                    max_tokens=max_tokens,
+                    stop_sequence=stop_sequence,
+                    temperature=temperature,
+                    tools=tools,
+                    tool_choice=tool_choice,
+                    x402_settlement_mode=x402_settlement_mode,
+                    stream=False,
+                    use_tee=True,
+                )
 
         # Original local model logic
         def execute_transaction():
@@ -778,7 +816,9 @@
         tools: Optional[List[Dict]] = None,
         tool_choice: Optional[str] = None,
         x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
-    ) -> TextGenerationOutput:
+        stream: bool = False,
+        use_tee: bool = False,
+    ) -> Union[TextGenerationOutput, TextGenerationStream]:
         """
         Route chat request to external LLM server with x402 payments.
 
@@ -790,18 +830,24 @@
             temperature: Sampling temperature
             tools: Function calling tools
             tool_choice: Tool selection strategy
+            stream: Whether to stream the response
+            use_tee: Whether to use TEE
 
         Returns:
-            TextGenerationOutput with chat completion
+            Union[TextGenerationOutput, TextGenerationStream]: Chat completion or TextGenerationStream
 
         Raises:
             OpenGradientError: If request fails
         """
-        api_key = self._get_api_key_for_model(model)
+        api_key = None if use_tee else self._get_api_key_for_model(model)
 
         if api_key:
-            logging.debug("External LLM completion using API key")
-            url = f"{self._llm_server_url}/v1/chat/completions"
+            logging.debug("External LLM chat using API key")
+
+            if stream:
+                url = f"{self._llm_server_url}/v1/chat/completions/stream"
+            else:
+                url = f"{self._llm_server_url}/v1/chat/completions"
 
             headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
 
@@ -820,14 +866,23 @@
                 payload["tool_choice"] = tool_choice or "auto"
 
             try:
-                response = requests.post(url, json=payload, headers=headers, timeout=60)
-                response.raise_for_status()
+                if stream:
+                    # Return streaming response wrapped in TextGenerationStream
+                    response = requests.post(url, json=payload, headers=headers, timeout=60, stream=True)
+                    response.raise_for_status()
+                    return TextGenerationStream(_iterator=response.iter_lines(decode_unicode=True), _is_async=False)
+                else:
+                    # Non-streaming response
+                    response = requests.post(url, json=payload, headers=headers, timeout=60)
+                    response.raise_for_status()
 
-                result = response.json()
+                    result = response.json()
 
-                return TextGenerationOutput(
-                    transaction_hash="external", finish_reason=result.get("finish_reason"), chat_output=result.get("message")
-                )
+                    return TextGenerationOutput(
+                        transaction_hash="external",
+                        finish_reason=result.get("finish_reason"),
+                        chat_output=result.get("message")
+                    )
 
             except requests.RequestException as e:
                 error_msg = f"External LLM chat failed: {str(e)}"
@@ -840,6 +895,7 @@
                 logging.error(error_msg)
                 raise OpenGradientError(error_msg)
 
+        # x402 payment path - non-streaming only here
         async def make_request():
             async with x402HttpxClient(
                 account=self._wallet_account,
@@ -867,13 +923,13 @@
                     payload["tool_choice"] = tool_choice or "auto"
 
                 try:
-                    response = await client.post("/v1/chat/completions", json=payload, headers=headers, timeout=60)
+                    # Non-streaming with x402
+                    endpoint = "/v1/chat/completions"
+                    response = await client.post(endpoint, json=payload, headers=headers, timeout=60)
 
                     # Read the response content
                     content = await response.aread()
                     result = json.loads(content.decode())
-                    # print(f"Response: {response}")
-                    # print(f"Response Headers: {response.headers}")
 
                     payment_hash = ""
                     if X402_PROCESSING_HASH_HEADER in response.headers:
@@ -909,6 +965,234 @@
             logging.error(error_msg)
             raise OpenGradientError(error_msg)
 
+    def _external_llm_chat_stream_sync(
+        self,
+        model: str,
+        messages: List[Dict],
+        max_tokens: int = 100,
+        stop_sequence: Optional[List[str]] = None,
+        temperature: float = 0.0,
+        tools: Optional[List[Dict]] = None,
+        tool_choice: Optional[str] = None,
+        x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        use_tee: bool = False,
+    ):
+        """
+        Sync streaming using threading bridge - TRUE real-time streaming.
+
+        Yields StreamChunk objects as they arrive from the background thread.
+        NO buffering, NO conversion, just direct pass-through.
+        """
+        import threading
+        from queue import Queue
+
+        queue = Queue()
+        exception_holder = []
+
+        def _run_async():
+            """Run async streaming in background thread"""
+            loop = None
+            try:
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+
+                async def _stream():
+                    try:
+                        async for chunk in self._external_llm_chat_stream_async(
+                            model=model,
+                            messages=messages,
+                            max_tokens=max_tokens,
+                            stop_sequence=stop_sequence,
+                            temperature=temperature,
+                            tools=tools,
+                            tool_choice=tool_choice,
+                            x402_settlement_mode=x402_settlement_mode,
+                            use_tee=use_tee,
+                        ):
+                            queue.put(chunk)  # Put chunk immediately
+                    except Exception as e:
+                        exception_holder.append(e)
+                    finally:
+                        queue.put(None)  # Signal completion
+
+                loop.run_until_complete(_stream())
+            except Exception as e:
+                exception_holder.append(e)
+                queue.put(None)
+            finally:
+                if loop:
+                    try:
+                        pending = asyncio.all_tasks(loop)
+                        for task in pending:
+                            task.cancel()
+                        loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+                    finally:
+                        loop.close()
+
+        # Start background thread
+        thread = threading.Thread(target=_run_async, daemon=True)
+        thread.start()
+
+        # Yield chunks DIRECTLY as they arrive - NO buffering
+        try:
+            while True:
+                chunk = queue.get()  # Blocks until chunk available
+                if chunk is None:
+                    break
+                yield chunk  # Yield immediately!
+
+            thread.join(timeout=5)
+
+            if exception_holder:
+                raise exception_holder[0]
+        except Exception as e:
+            thread.join(timeout=1)
+            raise
+
+
+    async def _external_llm_chat_stream_async(
+        self,
+        model: str,
+        messages: List[Dict],
+        max_tokens: int = 100,
+        stop_sequence: Optional[List[str]] = None,
+        temperature: float = 0.0,
+        tools: Optional[List[Dict]] = None,
+        tool_choice: Optional[str] = None,
+        x402_settlement_mode: x402SettlementMode = x402SettlementMode.SETTLE_BATCH,
+        use_tee: bool = False,
+    ):
+        """
+        Internal async streaming implementation.
+
+        Yields StreamChunk objects as they arrive from the server.
+        """
+        api_key = None if use_tee else self._get_api_key_for_model(model)
+
+        if api_key:
+            # API key path - streaming to local llm-server
+            url = f"{self._og_llm_streaming_server_url}/v1/chat/completions"
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {api_key}"
+            }
+
+            payload = {
+                "model": model,
+                "messages": messages,
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "stream": True,
+            }
+
+            if stop_sequence:
+                payload["stop"] = stop_sequence
+            if tools:
+                payload["tools"] = tools
+                payload["tool_choice"] = tool_choice or "auto"
+
+            async with httpx.AsyncClient(verify=False, timeout=None) as client:
+                async with client.stream("POST", url, json=payload, headers=headers) as response:
+                    buffer = b""
+                    async for chunk in response.aiter_raw():
+                        if not chunk:
+                            continue
+
+                        buffer += chunk
+
+                        # Process all complete lines in buffer
+                        while b"\n" in buffer:
+                            line_bytes, buffer = buffer.split(b"\n", 1)
+
+                            if not line_bytes.strip():
+                                continue
+
+                            try:
+                                line = line_bytes.decode('utf-8').strip()
+                            except UnicodeDecodeError:
+                                continue
+
+                            if not line.startswith("data: "):
+                                continue
+
+                            data_str = line[6:]  # Strip "data: " prefix
+                            if data_str.strip() == "[DONE]":
+                                return
+
+                            try:
+                                data = json.loads(data_str)
+                                yield StreamChunk.from_sse_data(data)
+                            except json.JSONDecodeError:
+                                continue
+        else:
+            # x402 payment path
+            async with httpx.AsyncClient(
+                base_url=self._og_llm_streaming_server_url,
+                headers={"Authorization": f"Bearer {X402_PLACEHOLDER_API_KEY}"},
+                timeout=TIMEOUT,
+                limits=LIMITS,
+                http2=False,
+                follow_redirects=False,
+                auth=X402Auth(account=self._wallet_account),  # type: ignore
+            ) as client:
+                headers = {
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {X402_PLACEHOLDER_API_KEY}",
+                    "X-SETTLEMENT-TYPE": x402_settlement_mode,
+                }
+
+                payload = {
+                    "model": model,
+                    "messages": messages,
+                    "max_tokens": max_tokens,
+                    "temperature": temperature,
+                    "stream": True,
+                }
+
+                if stop_sequence:
+                    payload["stop"] = stop_sequence
+                if tools:
+                    payload["tools"] = tools
+                    payload["tool_choice"] = tool_choice or "auto"
+
+                async with client.stream(
+                    "POST",
+                    "/v1/chat/completions",
+                    json=payload,
+                    headers=headers,
+                ) as response:
+                    buffer = b""
+                    async for chunk in response.aiter_raw():
+                        if not chunk:
+                            continue
+
+                        buffer += chunk
+
+                        # Process complete lines from buffer
+                        while b"\n" in buffer:
+                            line_bytes, buffer = buffer.split(b"\n", 1)
+
+                            if not line_bytes.strip():
+                                continue
+
+                            try:
+                                line = line_bytes.decode('utf-8').strip()
+                            except UnicodeDecodeError:
+                                continue
+
+                            if not line.startswith("data: "):
+                                continue
+
+                            data_str = line[6:]
+                            if data_str.strip() == "[DONE]":
+                                return
+
+                            try:
+                                data = json.loads(data_str)
+                                yield StreamChunk.from_sse_data(data)
+                            except json.JSONDecodeError:
+                                continue
+
     def list_files(self, model_name: str, version: str) -> List[Dict]:
         """
         List files for a specific version of a model.
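The `_external_llm_chat_stream_sync` generator above bridges an async generator onto a plain sync iterator with a background event loop and a Queue. A self-contained sketch of that same pattern, independent of this SDK (all names here are illustrative, not from the package):

```python
import asyncio
import threading
from queue import Queue
from typing import AsyncIterator, Iterator, TypeVar

T = TypeVar("T")


def sync_over_async(agen: AsyncIterator[T]) -> Iterator[T]:
    """Yield items from an async generator synchronously, as they arrive."""
    queue: Queue = Queue()
    errors: list = []

    def runner() -> None:
        loop = asyncio.new_event_loop()
        try:
            async def pump() -> None:
                try:
                    async for item in agen:
                        queue.put(item)  # hand each item off immediately
                except Exception as e:
                    errors.append(e)
                finally:
                    queue.put(None)  # sentinel: stream finished
            loop.run_until_complete(pump())
        finally:
            loop.close()

    threading.Thread(target=runner, daemon=True).start()

    while True:
        item = queue.get()  # blocks until the next item is ready
        if item is None:
            break
        yield item
    if errors:
        raise errors[0]


# Example: consume an async ticker from synchronous code.
async def ticks() -> AsyncIterator[int]:
    for i in range(3):
        await asyncio.sleep(0.1)
        yield i


for n in sync_over_async(ticks()):
    print(n)  # 0, 1, 2 printed as they are produced
```

The sentinel/queue design is what gives the CLI real-time output: each chunk crosses the thread boundary the moment the event loop produces it, with no batching.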
opengradient/defaults.py CHANGED
@@ -9,4 +9,6 @@ DEFAULT_BLOCKCHAIN_EXPLORER = "https://explorer.opengradient.ai/tx/"
 DEFAULT_IMAGE_GEN_HOST = "18.217.25.69"
 DEFAULT_IMAGE_GEN_PORT = 5125
 DEFAULT_LLM_SERVER_URL = "http://35.225.197.84:8000"
-DEFAULT_OPENGRADIENT_LLM_SERVER_URL = "https://llm.opengradient.ai"
+DEFAULT_OPENGRADIENT_LLM_SERVER_URL = "https://llmogevm.opengradient.ai"
+DEFAULT_OPENGRADIENT_LLM_STREAMING_SERVER_URL = "https://llmogevm.opengradient.ai"
+DEFAULT_NETWORK_FILTER = "og-evm"
opengradient/types.py CHANGED
@@ -1,7 +1,7 @@
 import time
 from dataclasses import dataclass
 from enum import Enum, IntEnum, StrEnum
-from typing import Dict, List, Optional, Tuple, Union, DefaultDict
+from typing import Dict, List, Optional, Tuple, Union, DefaultDict, Iterator, AsyncIterator
 import numpy as np
 
 
@@ -165,6 +165,196 @@ class InferenceResult:
     model_output: Dict[str, np.ndarray]
 
 
+@dataclass
+class StreamDelta:
+    """
+    Represents a delta (incremental change) in a streaming response.
+
+    Attributes:
+        content: Incremental text content (if any)
+        role: Message role (appears in first chunk)
+        tool_calls: Tool call information (if function calling is used)
+    """
+    content: Optional[str] = None
+    role: Optional[str] = None
+    tool_calls: Optional[List[Dict]] = None
+
+
+@dataclass
+class StreamChoice:
+    """
+    Represents a choice in a streaming response.
+
+    Attributes:
+        delta: The incremental changes in this chunk
+        index: Choice index (usually 0)
+        finish_reason: Reason for completion (appears in final chunk)
+    """
+    delta: StreamDelta
+    index: int = 0
+    finish_reason: Optional[str] = None
+
+
+@dataclass
+class StreamUsage:
+    """
+    Token usage information for a streaming response.
+
+    Attributes:
+        prompt_tokens: Number of tokens in the prompt
+        completion_tokens: Number of tokens in the completion
+        total_tokens: Total tokens used
+    """
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+@dataclass
+class StreamChunk:
+    """
+    Represents a single chunk in a streaming LLM response.
+
+    This follows the OpenAI streaming format but is provider-agnostic.
+    Each chunk contains incremental data, with the final chunk including
+    usage information.
+
+    Attributes:
+        choices: List of streaming choices (usually contains one choice)
+        model: Model identifier
+        usage: Token usage information (only in final chunk)
+        is_final: Whether this is the final chunk (before [DONE])
+    """
+    choices: List[StreamChoice]
+    model: str
+    usage: Optional[StreamUsage] = None
+    is_final: bool = False
+
+    @classmethod
+    def from_sse_data(cls, data: Dict) -> "StreamChunk":
+        """
+        Parse a StreamChunk from SSE data dictionary.
+
+        Args:
+            data: Dictionary parsed from SSE data line
+
+        Returns:
+            StreamChunk instance
+        """
+        choices = []
+        for choice_data in data.get("choices", []):
+            delta_data = choice_data.get("delta", {})
+            delta = StreamDelta(
+                content=delta_data.get("content"),
+                role=delta_data.get("role"),
+                tool_calls=delta_data.get("tool_calls")
+            )
+            choice = StreamChoice(
+                delta=delta,
+                index=choice_data.get("index", 0),
+                finish_reason=choice_data.get("finish_reason")
+            )
+            choices.append(choice)
+
+        usage = None
+        if "usage" in data:
+            usage_data = data["usage"]
+            usage = StreamUsage(
+                prompt_tokens=usage_data.get("prompt_tokens", 0),
+                completion_tokens=usage_data.get("completion_tokens", 0),
+                total_tokens=usage_data.get("total_tokens", 0)
+            )
+
+        is_final = any(c.finish_reason is not None for c in choices) or usage is not None
+
+        return cls(
+            choices=choices,
+            model=data.get("model", "unknown"),
+            usage=usage,
+            is_final=is_final
+        )
+
+
+@dataclass
+class TextGenerationStream:
+    """
+    Iterator wrapper for streaming text generation responses.
+
+    Provides a clean interface for iterating over stream chunks with
+    automatic parsing of SSE format.
+
+    Usage:
+        stream = client.llm_chat(..., stream=True)
+        for chunk in stream:
+            if chunk.choices[0].delta.content:
+                print(chunk.choices[0].delta.content, end="")
+    """
+    _iterator: Union[Iterator[str], AsyncIterator[str]]
+    _is_async: bool = False
+
+    def __iter__(self):
+        """Iterate over stream chunks."""
+        return self
+
+    def __next__(self) -> StreamChunk:
+        """Get next stream chunk."""
+        import json
+
+        while True:
+            try:
+                line = next(self._iterator)
+            except StopIteration:
+                raise
+
+            if not line or not line.strip():
+                continue
+
+            if not line.startswith("data: "):
+                continue
+
+            data_str = line[6:]  # Remove "data: " prefix
+
+            if data_str.strip() == "[DONE]":
+                raise StopIteration
+
+            try:
+                data = json.loads(data_str)
+                return StreamChunk.from_sse_data(data)
+            except json.JSONDecodeError:
+                # Skip malformed chunks
+                continue
+
+    async def __anext__(self) -> StreamChunk:
+        """Get next stream chunk (async version)."""
+        import json
+
+        if not self._is_async:
+            raise TypeError("Use __next__ for sync iterators")
+
+        while True:
+            try:
+                line = await self._iterator.__anext__()
+            except StopAsyncIteration:
+                raise
+
+            if not line or not line.strip():
+                continue
+
+            if not line.startswith("data: "):
+                continue
+
+            data_str = line[6:]
+
+            if data_str.strip() == "[DONE]":
+                raise StopAsyncIteration
+
+            try:
+                data = json.loads(data_str)
+                return StreamChunk.from_sse_data(data)
+            except json.JSONDecodeError:
+                continue
+
+
 @dataclass
 class TextGenerationOutput:
     """
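As a quick check of the parsing path above, here is `from_sse_data` applied to a hand-written final chunk (the payload values are invented for illustration):

```python
from opengradient.types import StreamChunk

# An OpenAI-style final SSE payload: finish_reason and usage are present.
data = {
    "model": "claude-3.5-haiku",
    "choices": [
        {"index": 0, "delta": {"content": " clouds form."}, "finish_reason": "stop"}
    ],
    "usage": {"prompt_tokens": 12, "completion_tokens": 48, "total_tokens": 60},
}

chunk = StreamChunk.from_sse_data(data)
assert chunk.is_final  # set because finish_reason (and usage) are present
assert chunk.choices[0].delta.content == " clouds form."
assert chunk.usage.total_tokens == 60
```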
opengradient/x402_auth.py ADDED
@@ -0,0 +1,60 @@
+import httpx
+import typing
+import logging
+
+from x402.clients.base import x402Client
+from x402.types import x402PaymentRequiredResponse, PaymentRequirements
+
+
+class X402Auth(httpx.Auth):
+    """Auth class for handling x402 payment requirements."""
+
+    def __init__(
+        self,
+        account: typing.Any,
+        max_value: typing.Optional[int] = None,
+        payment_requirements_selector: typing.Optional[
+            typing.Callable[
+                [
+                    list[PaymentRequirements],
+                    typing.Optional[str],
+                    typing.Optional[str],
+                    typing.Optional[int],
+                ],
+                PaymentRequirements,
+            ]
+        ] = None,
+    ):
+        self.x402_client = x402Client(
+            account,
+            max_value=max_value,
+            payment_requirements_selector=payment_requirements_selector,  # type: ignore
+        )
+
+    async def async_auth_flow(
+        self, request: httpx.Request
+    ) -> typing.AsyncGenerator[httpx.Request, httpx.Response]:
+        response = yield request
+
+        if response.status_code == 402:
+            try:
+                await response.aread()
+                data = response.json()
+
+                payment_response = x402PaymentRequiredResponse(**data)
+
+                selected_requirements = self.x402_client.select_payment_requirements(
+                    payment_response.accepts
+                )
+
+                payment_header = self.x402_client.create_payment_header(
+                    selected_requirements, payment_response.x402_version
+                )
+
+                request.headers["X-Payment"] = payment_header
+                request.headers["Access-Control-Expose-Headers"] = "X-Payment-Response"
+                yield request
+
+            except Exception as e:
+                logging.error(f"X402Auth: Error handling payment: {e}")
+                return
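Because `X402Auth` subclasses `httpx.Auth` with an async-only flow, it attaches directly to an `httpx.AsyncClient`. A rough usage sketch (the key, URL, and payload are placeholders, not values from this diff):

```python
import asyncio

import httpx
from eth_account import Account

from opengradient.x402_auth import X402Auth


async def main() -> None:
    account = Account.from_key("0x" + "11" * 32)  # placeholder key
    async with httpx.AsyncClient(
        base_url="https://llmogevm.opengradient.ai",  # streaming default from defaults.py
        auth=X402Auth(account=account),
    ) as client:
        # On a 402 reply, X402Auth reads the payment requirements, signs an
        # X-Payment header, and replays the request; other responses pass through.
        resp = await client.post(
            "/v1/chat/completions",
            json={
                "model": "claude-3.5-haiku",
                "messages": [{"role": "user", "content": "hi"}],
            },
        )
        print(resp.status_code)


asyncio.run(main())
```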
opengradient-0.5.9.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opengradient
-Version: 0.5.8
+Version: 0.5.9
 Summary: Python SDK for OpenGradient decentralized model management & inference services
 Author-email: OpenGradient <kyle@vannalabs.ai>
 License-Expression: MIT
@@ -23,7 +23,7 @@ Requires-Dist: requests>=2.32.3
 Requires-Dist: langchain>=0.3.7
 Requires-Dist: openai>=1.58.1
 Requires-Dist: pydantic>=2.9.2
-Requires-Dist: og-test-x402==0.0.1
+Requires-Dist: og-test-x402==0.0.9
 Dynamic: license-file
 
 # OpenGradient Python SDK
opengradient-0.5.9.dist-info/RECORD CHANGED
@@ -1,11 +1,12 @@
-opengradient/__init__.py,sha256=7UkGoQRDtSb0lh3vobxmyJct_uFfm1Re_oz5s0s9dOs,13263
+opengradient/__init__.py,sha256=1PSbDRGe4ft_0FYoPS3XpeajnRPOTkmx8aZZxcOeztQ,13455
 opengradient/account.py,sha256=5wrYpws_1lozjOFjLCTHtxgoxK-LmObDAaVy9eDcJY4,1145
-opengradient/cli.py,sha256=4IUKxecZV9la-_nEVxObOIjm6qQ9aEHhq5-m5clzzHc,29901
-opengradient/client.py,sha256=nozp80z8KSYQewKQmSVXZQIdVtsSjv53reS3TBRwlXc,63071
-opengradient/defaults.py,sha256=yiZnpIOLyHEmZhCEQXgWpT2eJin10UVsivJY6r61xmo,674
+opengradient/cli.py,sha256=pfgyLfD1MIDifKmGLFsJqBlgvqIcnsIh3zzg7PaIeH4,33670
+opengradient/client.py,sha256=KDkFxcZ-vGyriFW-ydWTnitgV6rYfxtnNzchWca-8u8,74009
+opengradient/defaults.py,sha256=YOtFDq8HiwEkgMXlV4Zf3YgkopfKUkkx0CpgNuY_Mxk,796
 opengradient/exceptions.py,sha256=88tfegboGtlehQcwhxsl6ZzhLJWZWlkf_bkHTiCtXpo,3391
-opengradient/types.py,sha256=DSkJAcD4fRQ78bG3Ny5-_OqcfptFSIpliS4qKKYE2jU,9026
+opengradient/types.py,sha256=bADakUM6WwdMORGC5HvQvWCezNwIlVc7l0zodPapbhQ,14622
 opengradient/utils.py,sha256=ZUq4OBIml2vsC0tRqus4Zwb_e3g4woo00apByrafuVw,8058
+opengradient/x402_auth.py,sha256=Jmj-40OybugOXIt_qHzN1qy4x7U3QuM1MKNmPzoEKwc,1920
 opengradient/abi/InferencePrecompile.abi,sha256=reepTHg6Q01UrFP0Gexc-JayplsvOLPfG7jrEZ-cV28,10197
 opengradient/abi/PriceHistoryInference.abi,sha256=ZB3fZdx1kaFlp2wt1vTbTZZG1k8HPvmNtkG5Q8Bnajw,5098
 opengradient/abi/WorkflowScheduler.abi,sha256=yEGs76qO4S1z980KL5hBdfyXiJ6k-kERcB1O_o73AEU,416
@@ -27,9 +28,9 @@ opengradient/workflow_models/constants.py,sha256=viIkb_LGcfVprqQNaA80gBTj6cfYam0
 opengradient/workflow_models/types.py,sha256=Z22hF6c8Y4D2GlzVEIBODGwsqSjSrQvUcpZ7R-mIJdI,409
 opengradient/workflow_models/utils.py,sha256=ySfpuiOBqLTlfto6ZxZf2vc7K6RGIja0l4eaVm5AOzY,1503
 opengradient/workflow_models/workflow_models.py,sha256=d4C_gs39DAfy4cdY9Ee6GMXpPfzwvKFpmxzK1A7LNgU,3900
-opengradient-0.5.8.dist-info/licenses/LICENSE,sha256=xEcvQ3AxZOtDkrqkys2Mm6Y9diEnaSeQRKvxi-JGnNA,1069
-opengradient-0.5.8.dist-info/METADATA,sha256=DyqayJvXV39OUn5H9jUqVsKqX9ilHMAaQ2-u4GnqIwM,4215
-opengradient-0.5.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-opengradient-0.5.8.dist-info/entry_points.txt,sha256=yUKTaJx8RXnybkob0J62wVBiCp_1agVbgw9uzsmaeJc,54
-opengradient-0.5.8.dist-info/top_level.txt,sha256=oC1zimVLa2Yi1LQz8c7x-0IQm92milb5ax8gHBHwDqU,13
-opengradient-0.5.8.dist-info/RECORD,,
+opengradient-0.5.9.dist-info/licenses/LICENSE,sha256=xEcvQ3AxZOtDkrqkys2Mm6Y9diEnaSeQRKvxi-JGnNA,1069
+opengradient-0.5.9.dist-info/METADATA,sha256=kSTyBctZ-r4h3ilq7DRgxvxQhYO4ejUl3KbwcDX1Ygs,4215
+opengradient-0.5.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+opengradient-0.5.9.dist-info/entry_points.txt,sha256=yUKTaJx8RXnybkob0J62wVBiCp_1agVbgw9uzsmaeJc,54
+opengradient-0.5.9.dist-info/top_level.txt,sha256=oC1zimVLa2Yi1LQz8c7x-0IQm92milb5ax8gHBHwDqU,13
+opengradient-0.5.9.dist-info/RECORD,,
opengradient-0.5.9.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 