sference-sdk 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ dist-sdk/
6
+ dist-cli/
7
+ build/
8
+ .venv/
9
+ *.egg
10
+ .pytest_cache/
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: sference-sdk
3
+ Version: 0.0.1
4
+ Summary: Python SDK for the sference batch API
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: httpx>=0.28.1
7
+ Requires-Dist: pydantic>=2.12.5
8
+ Description-Content-Type: text/markdown
9
+
10
+ # sference Python SDK
11
+
12
+ Installable package: `sference-sdk` (import: `sference_sdk`). Used by the `sference` CLI and your own automation.
13
+
14
+ ## Install
15
+
16
+ ```bash
17
+ pip install sference-sdk
18
+ # or: uv add sference-sdk
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ Set `SFERENCE_API_KEY` and optional `SFERENCE_BASE_URL` (default `https://api.sference.com`), or pass `api_key=` / `base_url=` to the client.
24
+
25
+ ### Batches (sync)
26
+
27
+ Best for a **fixed JSONL workload**: one submit, poll until terminal, then fetch structured results or download JSONL via the API.
28
+
29
+ ```python
30
+ from sference_sdk import SferenceClient
31
+
32
+ client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
33
+
34
+ batch = client.submit_batch(
35
+ input_file="./workload.jsonl",
36
+ model="Qwen/Qwen2.5-7B-Instruct",
37
+ window="24h",
38
+ )
39
+ done = client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
40
+ results = client.get_results(done.id)
41
+ print(results.status, results.output_url)
42
+ ```
43
+
44
+ Use a `model` supported by your sference deployment.
45
+
46
+ ### OpenAI-compatible responses (sync)
47
+
48
+ Standalone or stream-associated jobs via `POST /v1/responses`. Keys need `responses:read` and `responses:write` (default on newly issued keys).
49
+
50
+ ```python
51
+ from sference_sdk import SferenceClient
52
+
53
+ client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
54
+
55
+ created = client.create_response(
56
+ model="Qwen/Qwen2.5-7B-Instruct",
57
+ input=[{"role": "user", "content": "Hello"}],
58
+ metadata={"completion_window": "24h"},
59
+ )
60
+ row = client.get_response(created.id)
61
+ ```
62
+
63
+ For a stream, add `stream_id` inside `metadata` next to `completion_window`.
64
+
65
+ ### OpenAI Python SDK (`openai` package)
66
+
67
+ If you already use the official OpenAI client, point it at a sference-compatible **`/v1`** base URL and the same API key (with `responses:read` and `responses:write`).
68
+
69
+ ```bash
70
+ pip install openai
71
+ ```
72
+
73
+ ```python
74
+ import asyncio
75
+ import os
76
+
77
+ from openai import AsyncOpenAI
78
+
79
+
80
+ async def main() -> None:
81
+ client = AsyncOpenAI(
82
+ base_url="https://api.sference.com/v1",
83
+ api_key=os.environ["SFERENCE_API_KEY"],
84
+ )
85
+
86
+ response = await client.responses.create(
87
+ model="zai-org/GLM-5",
88
+ input=[{"role": "user", "content": "Hello, world!"}],
89
+ background=True,
90
+ )
91
+ # Poll GET /v1/responses/{id} until terminal; your openai version may expose
92
+ # something like await client.responses.retrieve(response.id), or use
93
+ # AsyncSferenceClient.get_response(response.id) with the same host and key.
94
+
95
+
96
+ asyncio.run(main())
97
+ ```
98
+
99
+ **Self-hosted** (local API): use `base_url="http://127.0.0.1:8000/v1"` (or your `SFERENCE_BASE_URL` + `"/v1"`). **`model`** must match a model your inference workers consume.
100
+
101
+ **Metadata:** to set `completion_window` or `stream_id` like the native SDK, pass them in the request body your `openai` version supports (for example `metadata=` on `create`, or `extra_body={"metadata": {...}}` if the helper does not list those fields yet).
102
+
103
+ ### Async client — batches
104
+
105
+ `AsyncSferenceClient` uses `httpx.AsyncClient` so batch polling can run alongside other async I/O without blocking threads.
106
+
107
+ **Use case:** You already know the full set of prompts (for example a JSONL file) and want one scheduled unit of work with a clear terminal state and bulk results.
108
+
109
+ **Benefits:** Simple lifecycle (submit → wait → fetch results), fits large static workloads and JSONL-heavy pipelines.
110
+
111
+ ```python
112
+ import asyncio
113
+
114
+ from sference_sdk import AsyncSferenceClient
115
+
116
+
117
+ async def main() -> None:
118
+ async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
119
+ batch = await client.submit_batch(
120
+ input_file="./workload.jsonl",
121
+ model="Qwen/Qwen2.5-7B-Instruct",
122
+ window="24h",
123
+ )
124
+ done = await client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
125
+ results = await client.get_results(done.id)
126
+ print(results.status, results.output_url)
127
+
128
+
129
+ asyncio.run(main())
130
+ ```
131
+
132
+ ### Async client — streams
133
+
134
+ Stream-associated jobs use `create_response(..., metadata={"stream_id": ..., "completion_window": "24h"})`. Consume completions with `list_stream_events` (optional `wait_ms` long-poll) or `iter_stream_events` (paged replay; optional checkpoints align with CLI `stream tail`).
135
+
136
+ **Use case:** Work arrives over time, or you want one id to group many responses and observe completions as they land.
137
+
138
+ **Benefits:** Independent submits with aggregated progress, stream-level status in the API/UI, and efficient event tailing.
139
+
140
+ ```python
141
+ import asyncio
142
+
143
+ from sference_sdk import AsyncSferenceClient
144
+
145
+
146
+ async def main() -> None:
147
+ async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
148
+ stream = await client.create_stream(name="sdk-demo", window="24h")
149
+ await client.create_response(
150
+ model="Qwen/Qwen2.5-7B-Instruct",
151
+ input=[{"role": "user", "content": "Hello"}],
152
+ metadata={"stream_id": stream.id, "completion_window": "24h"},
153
+ )
154
+ async for ev in client.iter_stream_events(stream.id, checkpoint=False):
155
+ print(ev.completion_id, ev.status)
156
+
157
+
158
+ asyncio.run(main())
159
+ ```
160
+
161
+ ## cURL (same API the SDK calls)
162
+
163
+ API keys need `responses:read` and `responses:write` (default on newly issued keys). `X-API-Key` or `Authorization: Bearer sk_...` are both accepted.
164
+
165
+ ```bash
166
+ export TOKEN=sk_...
167
+ BASE_URL=https://api.sference.com
168
+
169
+ RID=$(curl -sS -X POST "${BASE_URL}/v1/responses" \
170
+ -H "X-API-Key: $TOKEN" \
171
+ -H 'Content-Type: application/json' \
172
+ -d '{
173
+ "model": "Qwen/Qwen2.5-7B-Instruct",
174
+ "input": [{"role": "user", "content": "Hello"}],
175
+ "metadata": {"completion_window": "24h"}
176
+ }' | jq -r '.id')
177
+
178
+ curl -sS "${BASE_URL}/v1/responses/${RID}" \
179
+ -H "X-API-Key: $TOKEN"
180
+ ```
181
+
182
+ For self-hosted APIs, set `BASE_URL` to your API origin (no `/v1` suffix on `BASE_URL` here—the paths already include `/v1`). Without `jq`, read `id` from the POST JSON and substitute it in the GET URL.
183
+
184
+ ## CLI
185
+
186
+ For `sference batch …` and `sference stream …` commands, see the [CLI README](../cli/README.md).
@@ -0,0 +1,177 @@
1
+ # sference Python SDK
2
+
3
+ Installable package: `sference-sdk` (import: `sference_sdk`). Used by the `sference` CLI and your own automation.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install sference-sdk
9
+ # or: uv add sference-sdk
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ Set `SFERENCE_API_KEY` and optional `SFERENCE_BASE_URL` (default `https://api.sference.com`), or pass `api_key=` / `base_url=` to the client.
15
+
16
+ ### Batches (sync)
17
+
18
+ Best for a **fixed JSONL workload**: one submit, poll until terminal, then fetch structured results or download JSONL via the API.
19
+
20
+ ```python
21
+ from sference_sdk import SferenceClient
22
+
23
+ client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
24
+
25
+ batch = client.submit_batch(
26
+ input_file="./workload.jsonl",
27
+ model="Qwen/Qwen2.5-7B-Instruct",
28
+ window="24h",
29
+ )
30
+ done = client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
31
+ results = client.get_results(done.id)
32
+ print(results.status, results.output_url)
33
+ ```
34
+
35
+ Use a `model` supported by your sference deployment.
36
+
37
+ ### OpenAI-compatible responses (sync)
38
+
39
+ Standalone or stream-associated jobs via `POST /v1/responses`. Keys need `responses:read` and `responses:write` (default on newly issued keys).
40
+
41
+ ```python
42
+ from sference_sdk import SferenceClient
43
+
44
+ client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
45
+
46
+ created = client.create_response(
47
+ model="Qwen/Qwen2.5-7B-Instruct",
48
+ input=[{"role": "user", "content": "Hello"}],
49
+ metadata={"completion_window": "24h"},
50
+ )
51
+ row = client.get_response(created.id)
52
+ ```
53
+
54
+ For a stream, add `stream_id` inside `metadata` next to `completion_window`.
55
+
56
+ ### OpenAI Python SDK (`openai` package)
57
+
58
+ If you already use the official OpenAI client, point it at a sference-compatible **`/v1`** base URL and the same API key (with `responses:read` and `responses:write`).
59
+
60
+ ```bash
61
+ pip install openai
62
+ ```
63
+
64
+ ```python
65
+ import asyncio
66
+ import os
67
+
68
+ from openai import AsyncOpenAI
69
+
70
+
71
+ async def main() -> None:
72
+ client = AsyncOpenAI(
73
+ base_url="https://api.sference.com/v1",
74
+ api_key=os.environ["SFERENCE_API_KEY"],
75
+ )
76
+
77
+ response = await client.responses.create(
78
+ model="zai-org/GLM-5",
79
+ input=[{"role": "user", "content": "Hello, world!"}],
80
+ background=True,
81
+ )
82
+ # Poll GET /v1/responses/{id} until terminal; your openai version may expose
83
+ # something like await client.responses.retrieve(response.id), or use
84
+ # AsyncSferenceClient.get_response(response.id) with the same host and key.
85
+
86
+
87
+ asyncio.run(main())
88
+ ```
89
+
90
+ **Self-hosted** (local API): use `base_url="http://127.0.0.1:8000/v1"` (or your `SFERENCE_BASE_URL` + `"/v1"`). **`model`** must match a model your inference workers consume.
91
+
92
+ **Metadata:** to set `completion_window` or `stream_id` like the native SDK, pass them in the request body your `openai` version supports (for example `metadata=` on `create`, or `extra_body={"metadata": {...}}` if the helper does not list those fields yet).
93
+
94
+ ### Async client — batches
95
+
96
+ `AsyncSferenceClient` uses `httpx.AsyncClient` so batch polling can run alongside other async I/O without blocking threads.
97
+
98
+ **Use case:** You already know the full set of prompts (for example a JSONL file) and want one scheduled unit of work with a clear terminal state and bulk results.
99
+
100
+ **Benefits:** Simple lifecycle (submit → wait → fetch results), fits large static workloads and JSONL-heavy pipelines.
101
+
102
+ ```python
103
+ import asyncio
104
+
105
+ from sference_sdk import AsyncSferenceClient
106
+
107
+
108
+ async def main() -> None:
109
+ async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
110
+ batch = await client.submit_batch(
111
+ input_file="./workload.jsonl",
112
+ model="Qwen/Qwen2.5-7B-Instruct",
113
+ window="24h",
114
+ )
115
+ done = await client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
116
+ results = await client.get_results(done.id)
117
+ print(results.status, results.output_url)
118
+
119
+
120
+ asyncio.run(main())
121
+ ```
122
+
123
+ ### Async client — streams
124
+
125
+ Stream-associated jobs use `create_response(..., metadata={"stream_id": ..., "completion_window": "24h"})`. Consume completions with `list_stream_events` (optional `wait_ms` long-poll) or `iter_stream_events` (paged replay; optional checkpoints align with CLI `stream tail`).
126
+
127
+ **Use case:** Work arrives over time, or you want one id to group many responses and observe completions as they land.
128
+
129
+ **Benefits:** Independent submits with aggregated progress, stream-level status in the API/UI, and efficient event tailing.
130
+
131
+ ```python
132
+ import asyncio
133
+
134
+ from sference_sdk import AsyncSferenceClient
135
+
136
+
137
+ async def main() -> None:
138
+ async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
139
+ stream = await client.create_stream(name="sdk-demo", window="24h")
140
+ await client.create_response(
141
+ model="Qwen/Qwen2.5-7B-Instruct",
142
+ input=[{"role": "user", "content": "Hello"}],
143
+ metadata={"stream_id": stream.id, "completion_window": "24h"},
144
+ )
145
+ async for ev in client.iter_stream_events(stream.id, checkpoint=False):
146
+ print(ev.completion_id, ev.status)
147
+
148
+
149
+ asyncio.run(main())
150
+ ```
151
+
152
+ ## cURL (same API the SDK calls)
153
+
154
+ API keys need `responses:read` and `responses:write` (default on newly issued keys). `X-API-Key` or `Authorization: Bearer sk_...` are both accepted.
155
+
156
+ ```bash
157
+ export TOKEN=sk_...
158
+ BASE_URL=https://api.sference.com
159
+
160
+ RID=$(curl -sS -X POST "${BASE_URL}/v1/responses" \
161
+ -H "X-API-Key: $TOKEN" \
162
+ -H 'Content-Type: application/json' \
163
+ -d '{
164
+ "model": "Qwen/Qwen2.5-7B-Instruct",
165
+ "input": [{"role": "user", "content": "Hello"}],
166
+ "metadata": {"completion_window": "24h"}
167
+ }' | jq -r '.id')
168
+
169
+ curl -sS "${BASE_URL}/v1/responses/${RID}" \
170
+ -H "X-API-Key: $TOKEN"
171
+ ```
172
+
173
+ For self-hosted APIs, set `BASE_URL` to your API origin (no `/v1` suffix on `BASE_URL` here—the paths already include `/v1`). Without `jq`, read `id` from the POST JSON and substitute it in the GET URL.
174
+
175
+ ## CLI
176
+
177
+ For `sference batch …` and `sference stream …` commands, see the [CLI README](../cli/README.md).
@@ -0,0 +1,20 @@
1
+ [project]
2
+ name = "sference-sdk"
3
+ version = "0.0.1"
4
+ description = "Python SDK for the sference batch API"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "httpx>=0.28.1",
9
+ "pydantic>=2.12.5",
10
+ ]
11
+
12
+ [build-system]
13
+ requires = ["hatchling>=1.29.0"]
14
+ build-backend = "hatchling.build"
15
+
16
+ [tool.hatch.build.targets.wheel]
17
+ packages = ["sference_sdk"]
18
+
19
+ [tool.hatch.build.targets.sdist]
20
+ include = ["sference_sdk"]
@@ -0,0 +1,32 @@
1
"""Public package surface for ``sference_sdk``.

Re-exports the sync/async clients, checkpoint helpers, and the Pydantic
models users interact with. ``Response`` and ``ResponseList`` are exported
too: ``create_response``/``list_responses`` on both clients return them, so
callers need them importable from the package root.
"""

from .async_client import AsyncSferenceClient
from .checkpoint import clear_checkpoint, load_checkpoint, save_checkpoint
from .client import ApiError, SferenceClient
from .models import (
    Batch,
    BatchList,
    BatchResults,
    InferenceRequest,
    LoginResponse,
    Response,
    ResponseList,
    Stream,
    StreamEventList,
    StreamInferenceCompletionEvent,
    StreamList,
)

__all__ = [
    "ApiError",
    "AsyncSferenceClient",
    "SferenceClient",
    "Batch",
    "BatchList",
    "BatchResults",
    "LoginResponse",
    "InferenceRequest",
    "Response",
    "ResponseList",
    "Stream",
    "StreamInferenceCompletionEvent",
    "StreamEventList",
    "StreamList",
    "clear_checkpoint",
    "load_checkpoint",
    "save_checkpoint",
]
@@ -0,0 +1,295 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import time
7
+ import warnings
8
+ from pathlib import Path
9
+ from typing import Any, AsyncIterator, BinaryIO
10
+
11
+ import httpx
12
+
13
+ from .client import ApiError, SferenceClient
14
+ from .checkpoint import clear_checkpoint, load_checkpoint, save_checkpoint
15
+ from .models import (
16
+ Batch,
17
+ BatchCreatePayload,
18
+ BatchList,
19
+ BatchResults,
20
+ InferenceRequest,
21
+ LoginResponse,
22
+ Response,
23
+ ResponseList,
24
+ Stream,
25
+ StreamInferenceCompletionEvent,
26
+ StreamEventList,
27
+ StreamList,
28
+ StreamWindow,
29
+ )
30
+
31
+
32
class AsyncSferenceClient:
    """Async HTTP client for the sference batch API (``httpx.AsyncClient``).

    ``base_url`` falls back to the ``SFERENCE_BASE_URL`` environment variable
    (default ``https://api.sference.com``); ``api_key`` falls back to
    ``SFERENCE_API_KEY``. Use as an async context manager so the connection
    pool is released deterministically.
    """

    def __init__(
        self,
        base_url: str | None = None,
        api_key: str | None = None,
        transport: httpx.AsyncBaseTransport | None = None,
    ) -> None:
        self.base_url = base_url or os.getenv("SFERENCE_BASE_URL", "https://api.sference.com")
        self._token = api_key or os.getenv("SFERENCE_API_KEY")
        # `transport` is a seam for tests (e.g. httpx.MockTransport).
        self._client = httpx.AsyncClient(base_url=self.base_url, timeout=30.0, transport=transport)

    async def aclose(self) -> None:
        """Release the underlying connection pool."""
        await self._client.aclose()

    async def __aenter__(self) -> AsyncSferenceClient:
        return self

    async def __aexit__(self, *_: Any) -> None:
        await self.aclose()

    def _headers(self) -> dict[str, str]:
        """Default request headers; adds a Bearer token when one is configured."""
        headers = {"content-type": "application/json"}
        if self._token:
            headers["authorization"] = f"Bearer {self._token}"
        return headers

    async def _request_response(
        self,
        method: str,
        path: str,
        json_body: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
    ) -> httpx.Response:
        """Issue a request and return the raw ``httpx.Response``.

        Raises:
            ApiError: for any 4xx/5xx status; the message carries the decoded
                error payload (or raw text when the body is not JSON).
        """
        response = await self._client.request(
            method, path, headers=self._headers(), json=json_body, params=params
        )
        if response.status_code >= 400:
            try:
                payload = response.json()
            except Exception:
                payload = {"detail": response.text}
            raise ApiError(f"{response.status_code}: {payload}")
        return response

    async def _request(
        self,
        method: str,
        path: str,
        json_body: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
    ) -> Any:
        """Issue a request and return the decoded JSON body.

        Error handling is shared with ``_request_response`` (previously the
        two methods duplicated it).
        """
        response = await self._request_response(method, path, json_body, params)
        return response.json()

    async def login(self, username: str, password: str) -> LoginResponse:
        """Authenticate with username/password and store the returned access token."""
        payload = await self._request(
            "POST", "/v1/auth/login", {"username": username, "password": password}
        )
        result = LoginResponse.model_validate(payload)
        self._token = result.access_token
        return result

    async def get_me(self) -> dict[str, Any]:
        """Return the authenticated principal as reported by the API."""
        return await self._request("GET", "/v1/auth/me")

    async def submit_batch(
        self,
        *,
        input_file: str | None = None,
        requests: list[dict[str, Any]] | None = None,
        model: str | None = None,
        window: str = "24h",
    ) -> Batch:
        """Create a batch from a JSONL file or an in-memory request list.

        ``input_file`` takes precedence over ``requests``. Raises ValueError
        when no requests resolve, or for an unsupported ``window``.
        """
        if window != "24h":
            raise ValueError('Only window "24h" is supported in MVP.')
        resolved = requests or []
        if input_file:
            # Reuse the sync client's JSONL parser so both clients accept the
            # same file format.
            resolved = SferenceClient.parse_inference_requests_jsonl(Path(input_file), model=model)
        if not resolved:
            raise ValueError("At least one request is required (use input_file or non-empty requests)")
        if isinstance(resolved, list) and resolved and isinstance(resolved[0], dict):
            reqs: list[InferenceRequest] = []
            for r in resolved:
                if not isinstance(r, dict):
                    raise TypeError("requests must be a list of dicts or InferenceRequest objects")
                body = r.get("body")
                if isinstance(body, dict) and {"method", "url", "body"}.issubset(r.keys()):
                    # Looks like an OpenAI batch-file row ({custom_id, method,
                    # url, body}); keep only the fields our payload needs.
                    reqs.append(InferenceRequest(custom_id=r.get("custom_id"), body=body))
                else:
                    reqs.append(InferenceRequest.model_validate(r))
            resolved = reqs
        payload = BatchCreatePayload(window="24h", requests=resolved)  # type: ignore[arg-type]
        response = await self._request("POST", "/v1/batches", payload.model_dump())
        return Batch.model_validate(response)

    async def get_batch(self, batch_id: str) -> Batch:
        """Fetch a single batch by id."""
        response = await self._request("GET", f"/v1/batches/{batch_id}")
        return Batch.model_validate(response)

    async def list_batches(self) -> BatchList:
        """List batches visible to the current credentials."""
        response = await self._request("GET", "/v1/batches")
        return BatchList.model_validate(response)

    async def get_results(self, batch_id: str) -> BatchResults:
        """Fetch structured results for a batch."""
        response = await self._request("GET", f"/v1/batches/{batch_id}/results")
        return BatchResults.model_validate(response)

    async def cancel_batch(self, batch_id: str) -> Batch:
        """Request cancellation of a batch; returns the updated batch."""
        response = await self._request("POST", f"/v1/batches/{batch_id}/cancel")
        return Batch.model_validate(response)

    async def download_results_jsonl(self, batch_id: str, out: str | Path | BinaryIO) -> None:
        """Download raw JSONL results to a path or a writable binary file object."""
        resp = await self._request_response("GET", f"/v1/batches/{batch_id}/results.jsonl")
        if isinstance(out, (str, Path)):
            Path(out).write_bytes(resp.content)
        else:
            out.write(resp.content)

    async def wait_for_completion(
        self, batch_id: str, poll_interval: float = 1.0, timeout: float = 30.0
    ) -> Batch:
        """Poll a batch until it reaches a terminal status.

        Raises:
            TimeoutError: if the batch is still non-terminal after *timeout* seconds.
        """
        deadline = time.time() + timeout
        while True:
            batch = await self.get_batch(batch_id)
            if batch.status in ("completed", "failed", "cancelled"):
                return batch
            if time.time() >= deadline:
                raise TimeoutError(f"Timed out waiting for batch {batch_id}")
            await asyncio.sleep(poll_interval)

    async def create_stream(self, name: str, window: StreamWindow = "24h") -> Stream:
        """Create a named stream with the given completion window."""
        response = await self._request("POST", "/v1/streams", {"name": name, "window": window})
        return Stream.model_validate(response)

    async def list_streams(self) -> StreamList:
        """List streams visible to the current credentials."""
        response = await self._request("GET", "/v1/streams")
        return StreamList.model_validate(response)

    async def get_stream(self, stream_id: str) -> Stream:
        """Fetch a single stream by id."""
        response = await self._request("GET", f"/v1/streams/{stream_id}")
        return Stream.model_validate(response)

    # Note: POST /v1/streams/{id}/items has been removed. Use create_response()
    # with metadata.stream_id instead.

    async def create_response(
        self,
        *,
        model: str,
        input: list[dict[str, str]],  # [{"role": "user", "content": "..."}]
        instructions: str | None = None,
        max_output_tokens: int | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Response:
        """Create a standalone response or stream-associated response.

        For stream-associated, include stream_id in metadata:
        metadata={"stream_id": "uuid", "completion_window": "24h"}
        """
        payload: dict[str, Any] = {"model": model, "input": input}
        optional = {
            "instructions": instructions,
            "max_output_tokens": max_output_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "metadata": metadata,
        }
        # Only send fields the caller actually supplied.
        payload.update({key: value for key, value in optional.items() if value is not None})
        response = await self._request("POST", "/v1/responses", payload)
        return Response.model_validate(response)

    async def get_response(self, response_id: str) -> Response:
        """Get a response by ID."""
        response = await self._request("GET", f"/v1/responses/{response_id}")
        return Response.model_validate(response)

    async def cancel_response(self, response_id: str) -> Response:
        """Cancel a pending or running response."""
        response = await self._request("DELETE", f"/v1/responses/{response_id}")
        return Response.model_validate(response)

    async def list_responses(
        self,
        *,
        limit: int = 100,
        stream_id: str | None = None,
    ) -> ResponseList:
        """List responses. Optionally filter by stream_id.

        Fix: query parameters are now forwarded via ``_request(params=...)``;
        previously ``_request`` accepted no ``params`` keyword, so this call
        raised ``TypeError`` instead of listing responses.
        """
        params: dict[str, Any] = {"limit": limit}
        if stream_id is not None:
            params["stream_id"] = stream_id
        response = await self._request("GET", "/v1/responses", params=params)
        return ResponseList.model_validate(response)

    async def archive_stream(self, stream_id: str) -> Stream:
        """Mark a stream as archived."""
        response = await self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "archived"})
        return Stream.model_validate(response)

    async def cancel_stream(self, stream_id: str) -> Stream:
        """Mark a stream as cancelled."""
        response = await self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "cancelled"})
        return Stream.model_validate(response)

    async def list_stream_events(
        self,
        stream_id: str,
        *,
        limit: int = 20,
        starting_after: str | None = None,
        ending_before: str | None = None,
        wait_ms: int = 0,
    ) -> StreamEventList:
        """Fetch one page of stream completion events.

        ``wait_ms`` enables server-side long-polling; ``starting_after`` /
        ``ending_before`` are pagination cursors.
        """
        params: dict[str, Any] = {"limit": limit, "wait_ms": wait_ms}
        if starting_after is not None:
            params["starting_after"] = starting_after
        if ending_before is not None:
            params["ending_before"] = ending_before
        # Shares the common error handling instead of re-implementing it here.
        payload = await self._request("GET", f"/v1/streams/{stream_id}/events", params=params)
        return StreamEventList.model_validate(payload)

    async def iter_stream_events(
        self,
        stream_id: str,
        *,
        consumer_name: str = "default",
        starting_after: str | None = None,
        checkpoint: bool = True,
        from_latest: bool = False,
    ) -> AsyncIterator[StreamInferenceCompletionEvent]:
        """Yield stream completion events, optionally resuming from a local checkpoint.

        With ``checkpoint=True`` (default) the last consumed completion id is
        persisted per (base_url, stream, consumer) so a later call resumes
        where the previous one stopped. ``from_latest=True`` discards any
        saved checkpoint first.
        """
        if from_latest:
            clear_checkpoint(self.base_url, stream_id, consumer_name)
        cur: str | None = starting_after
        if checkpoint and cur is None:
            cur = load_checkpoint(self.base_url, stream_id, consumer_name)

        if cur is None:
            # No cursor at all: replay a single page (reversed, so events come
            # out oldest-first) and stop.
            page = await self.list_stream_events(stream_id, limit=100, wait_ms=0)
            for ev in reversed(page.data):
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
            return

        while True:
            page = await self.list_stream_events(stream_id, limit=100, starting_after=cur, wait_ms=0)
            if not page.data:
                return
            for ev in page.data:
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
                cur = ev.completion_id
            if not page.has_more:
                return
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
+ def _store_path() -> Path:
11
+ raw = os.environ.get("SFERENCE_STREAM_CHECKPOINTS")
12
+ if raw:
13
+ return Path(raw)
14
+ return Path.home() / ".sference" / "stream_checkpoints.json"
15
+
16
+
17
+ def _checkpoint_key(base_url: str, stream_id: str, consumer_name: str) -> str:
18
+ return f"{base_url.rstrip('/')}:{stream_id}:{consumer_name}"
19
+
20
+
21
+ def _load_all(path: Path) -> dict[str, Any]:
22
+ if not path.exists():
23
+ return {}
24
+ try:
25
+ data = json.loads(path.read_text(encoding="utf-8"))
26
+ return data if isinstance(data, dict) else {}
27
+ except (json.JSONDecodeError, OSError):
28
+ return {}
29
+
30
+
31
+ def _write_all(path: Path, data: dict[str, Any]) -> None:
32
+ path.parent.mkdir(parents=True, exist_ok=True)
33
+ path.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n", encoding="utf-8")
34
+
35
+
36
def load_checkpoint(base_url: str, stream_id: str, consumer_name: str) -> str | None:
    """Return the last consumed completion id for this consumer, or None."""
    store = _load_all(_store_path())
    entry = store.get(_checkpoint_key(base_url, stream_id, consumer_name))
    if not isinstance(entry, dict):
        return None
    # Accept "last_event_id" as well — presumably an older field name; verify
    # against historical store files before removing.
    last = entry.get("last_completion_id") or entry.get("last_event_id")
    return str(last) if last else None
44
+
45
+
46
def save_checkpoint(base_url: str, stream_id: str, consumer_name: str, last_completion_id: str) -> None:
    """Record *last_completion_id* for this (base_url, stream, consumer) triple."""
    path = _store_path()
    store = _load_all(path)
    store[_checkpoint_key(base_url, stream_id, consumer_name)] = {
        "last_completion_id": last_completion_id,
        # Timezone-aware timestamp so readers can compare across hosts.
        "updated_at": datetime.now(tz=timezone.utc).isoformat(),
    }
    _write_all(path, store)
55
+
56
+
57
def clear_checkpoint(base_url: str, stream_id: str, consumer_name: str) -> None:
    """Drop any stored checkpoint for this consumer; harmless when none exists."""
    path = _store_path()
    store = _load_all(path)
    # pop() with a default so a missing key is a no-op; the store is still
    # rewritten, matching the original behavior.
    store.pop(_checkpoint_key(base_url, stream_id, consumer_name), None)
    _write_all(path, store)
@@ -0,0 +1,357 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import time
6
+ import warnings
7
+ from pathlib import Path
8
+ from typing import Any, BinaryIO, Iterator
9
+
10
+ import httpx
11
+
12
+ from .checkpoint import clear_checkpoint, load_checkpoint, save_checkpoint
13
+ from .models import (
14
+ Batch,
15
+ BatchCreatePayload,
16
+ BatchList,
17
+ BatchResults,
18
+ InferenceRequest,
19
+ LoginResponse,
20
+ Response,
21
+ ResponseList,
22
+ Stream,
23
+ StreamInferenceCompletionEvent,
24
+ StreamEventList,
25
+ StreamList,
26
+ StreamWindow,
27
+ )
28
+
29
+
30
class ApiError(Exception):
    """Raised when the sference API returns an HTTP error status (4xx/5xx)."""
32
+
33
+
34
class SferenceClient:
    """Synchronous client for the sference API (batches, streams, responses).

    Credentials and the base URL come from the constructor arguments or, when
    omitted, from the SFERENCE_API_KEY / SFERENCE_BASE_URL environment
    variables. Instances own an httpx.Client and should be closed (or used as
    a context manager) when finished.
    """

    def __init__(
        self,
        base_url: str | None = None,
        api_key: str | None = None,
        transport: httpx.BaseTransport | None = None,
    ) -> None:
        self.base_url = base_url or os.getenv("SFERENCE_BASE_URL", "https://api.sference.com")
        self._token = api_key or os.getenv("SFERENCE_API_KEY")
        # `transport` is injectable for testing (e.g. httpx.MockTransport).
        self._client = httpx.Client(base_url=self.base_url, timeout=30.0, transport=transport)

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self._client.close()

    def __enter__(self) -> "SferenceClient":
        return self

    def __exit__(self, *_: Any) -> None:
        self.close()

    def _headers(self) -> dict[str, str]:
        """Default request headers; attaches the bearer token when configured."""
        headers = {"content-type": "application/json"}
        if self._token:
            headers["authorization"] = f"Bearer {self._token}"
        return headers

    def _request_response(
        self,
        method: str,
        path: str,
        json_body: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
    ) -> httpx.Response:
        """Send a request and return the raw httpx.Response.

        Raises:
            ApiError: for any 4xx/5xx status; the message carries the decoded
                JSON error payload (or the raw text when the body is not JSON).
        """
        response = self._client.request(
            method, path, headers=self._headers(), json=json_body, params=params
        )
        if response.status_code >= 400:
            try:
                payload = response.json()
            except Exception:
                payload = {"detail": response.text}
            raise ApiError(f"{response.status_code}: {payload}")
        return response

    def _request(
        self,
        method: str,
        path: str,
        json_body: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
    ) -> Any:
        """Send a request and return the decoded JSON body (see _request_response)."""
        return self._request_response(method, path, json_body, params=params).json()

    def login(self, username: str, password: str) -> LoginResponse:
        """Exchange credentials for an access token and remember it on the client."""
        payload = self._request("POST", "/v1/auth/login", {"username": username, "password": password})
        result = LoginResponse.model_validate(payload)
        self._token = result.access_token
        return result

    def get_me(self) -> dict[str, Any]:
        """Return the authenticated principal as reported by /v1/auth/me."""
        return self._request("GET", "/v1/auth/me")

    def submit_batch(
        self,
        *,
        input_file: str | None = None,
        requests: list[dict[str, Any]] | None = None,
        model: str | None = None,
        window: str = "24h",
    ) -> Batch:
        """Create a batch from a JSONL file or an in-memory request list.

        Args:
            input_file: Path to a JSONL workload; takes precedence over `requests`.
            requests: OpenAI batch-style envelope dicts or InferenceRequest-shaped dicts.
            model: Default model for content-only JSONL lines.
            window: Completion window; only "24h" is supported in the MVP.

        Raises:
            ValueError: on an unsupported window or an empty workload.
        """
        if window != "24h":
            raise ValueError('Only window "24h" is supported in MVP.')
        resolved: list[InferenceRequest] | list[dict[str, Any]] = requests or []
        if input_file:
            resolved = self.parse_inference_requests_jsonl(Path(input_file), model=model)
        if not resolved:
            raise ValueError("At least one request is required (use input_file or non-empty requests)")
        if isinstance(resolved, list) and resolved and isinstance(resolved[0], dict):
            resolved = self._coerce_request_dicts(resolved)
        payload = BatchCreatePayload(window="24h", requests=resolved)  # type: ignore[arg-type]
        response = self._request("POST", "/v1/batches", payload.model_dump())
        return Batch.model_validate(response)

    @staticmethod
    def _coerce_request_dicts(raw: list[dict[str, Any]]) -> list[InferenceRequest]:
        """Back-compat: accept OpenAI batch-style envelopes in `requests=[...]`."""
        coerced: list[InferenceRequest] = []
        for item in raw:
            if not isinstance(item, dict):
                raise TypeError("requests must be a list of dicts or InferenceRequest objects")
            body = item.get("body")
            if isinstance(body, dict) and {"method", "url", "body"}.issubset(item.keys()):
                coerced.append(InferenceRequest(custom_id=item.get("custom_id"), body=body))
            else:
                coerced.append(InferenceRequest.model_validate(item))
        return coerced

    def get_batch(self, batch_id: str) -> Batch:
        """Fetch a single batch by id."""
        return Batch.model_validate(self._request("GET", f"/v1/batches/{batch_id}"))

    def list_batches(self) -> BatchList:
        """List all batches visible to the caller."""
        return BatchList.model_validate(self._request("GET", "/v1/batches"))

    def get_results(self, batch_id: str) -> BatchResults:
        """Fetch structured results for a batch."""
        return BatchResults.model_validate(self._request("GET", f"/v1/batches/{batch_id}/results"))

    def cancel_batch(self, batch_id: str) -> Batch:
        """Request cancellation of a batch and return its updated state."""
        return Batch.model_validate(self._request("POST", f"/v1/batches/{batch_id}/cancel"))

    def download_results_jsonl(self, batch_id: str, out: str | Path | BinaryIO) -> None:
        """Download batch results as JSONL to a path or a writable binary file object."""
        resp = self._request_response("GET", f"/v1/batches/{batch_id}/results.jsonl")
        if isinstance(out, (str, Path)):
            Path(out).write_bytes(resp.content)
            return
        out.write(resp.content)

    def wait_for_completion(self, batch_id: str, poll_interval: float = 1.0, timeout: float = 30.0) -> Batch:
        """Poll until the batch reaches a terminal status and return it.

        Raises:
            TimeoutError: if no terminal status is observed within `timeout` seconds.
        """
        deadline = time.time() + timeout
        while True:
            batch = self.get_batch(batch_id)
            if batch.status in ("completed", "failed", "cancelled"):
                return batch
            if time.time() >= deadline:
                raise TimeoutError(f"Timed out waiting for batch {batch_id}")
            time.sleep(poll_interval)

    def create_stream(self, name: str, window: StreamWindow = "24h") -> Stream:
        """Create a stream."""
        return Stream.model_validate(self._request("POST", "/v1/streams", {"name": name, "window": window}))

    def list_streams(self) -> StreamList:
        """List streams."""
        return StreamList.model_validate(self._request("GET", "/v1/streams"))

    def get_stream(self, stream_id: str) -> Stream:
        """Fetch a single stream by id."""
        return Stream.model_validate(self._request("GET", f"/v1/streams/{stream_id}"))

    # Note: POST /v1/streams/{id}/items has been removed. Use create_response() with metadata.stream_id instead.

    def create_response(
        self,
        *,
        model: str,
        input: list[dict[str, str]],  # [{"role": "user", "content": "..."}]
        instructions: str | None = None,
        max_output_tokens: int | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Response:
        """Create a standalone response or stream-associated response.

        For stream-associated, include stream_id in metadata:
            metadata={"stream_id": "uuid", "completion_window": "24h"}
        """
        payload: dict[str, Any] = {"model": model, "input": input}
        # Only include optional fields that were actually provided.
        optional = {
            "instructions": instructions,
            "max_output_tokens": max_output_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "metadata": metadata,
        }
        payload.update({k: v for k, v in optional.items() if v is not None})
        return Response.model_validate(self._request("POST", "/v1/responses", payload))

    def get_response(self, response_id: str) -> Response:
        """Get a response by ID."""
        return Response.model_validate(self._request("GET", f"/v1/responses/{response_id}"))

    def cancel_response(self, response_id: str) -> Response:
        """Cancel a pending or running response."""
        return Response.model_validate(self._request("DELETE", f"/v1/responses/{response_id}"))

    def list_responses(
        self,
        *,
        limit: int = 100,
        stream_id: str | None = None,
    ) -> ResponseList:
        """List responses, optionally filtered by stream_id.

        Fix: query parameters are now sent as URL params via `_request`; the
        previous implementation passed an unsupported `params=` keyword and
        raised TypeError before any request was made.
        """
        params: dict[str, Any] = {"limit": limit}
        if stream_id is not None:
            params["stream_id"] = stream_id
        return ResponseList.model_validate(self._request("GET", "/v1/responses", params=params))

    def archive_stream(self, stream_id: str) -> Stream:
        """Transition a stream to the "archived" status."""
        return Stream.model_validate(self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "archived"}))

    def cancel_stream(self, stream_id: str) -> Stream:
        """Transition a stream to the "cancelled" status."""
        return Stream.model_validate(self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "cancelled"}))

    def list_stream_events(
        self,
        stream_id: str,
        *,
        limit: int = 20,
        starting_after: str | None = None,
        ending_before: str | None = None,
        wait_ms: int = 0,
    ) -> StreamEventList:
        """Fetch one page of completion events for a stream.

        Cursors are completion ids: `starting_after` pages forward,
        `ending_before` pages backward; `wait_ms` is forwarded to the server.
        """
        params: dict[str, Any] = {"limit": limit, "wait_ms": wait_ms}
        if starting_after is not None:
            params["starting_after"] = starting_after
        if ending_before is not None:
            params["ending_before"] = ending_before
        response = self._request("GET", f"/v1/streams/{stream_id}/events", params=params)
        return StreamEventList.model_validate(response)

    def iter_stream_events(
        self,
        stream_id: str,
        *,
        consumer_name: str = "default",
        starting_after: str | None = None,
        checkpoint: bool = True,
        from_latest: bool = False,
    ) -> Iterator[StreamInferenceCompletionEvent]:
        """Yield completion events, resuming from a per-consumer checkpoint.

        The resume cursor is, in priority order: `starting_after`, then (when
        `checkpoint` is true) the locally stored checkpoint. With
        `from_latest=True` the stored checkpoint is cleared first. When
        `checkpoint` is true, the checkpoint is advanced after each yielded
        event so a restarted consumer continues where it left off.
        """
        if from_latest:
            clear_checkpoint(self.base_url, stream_id, consumer_name)
        cur: str | None = starting_after
        if checkpoint and cur is None:
            cur = load_checkpoint(self.base_url, stream_id, consumer_name)

        if cur is None:
            # No resume point: emit a single snapshot page and stop.
            # NOTE(review): the page appears to be newest-first (reversed()
            # emits oldest-first) — confirm against the server's ordering.
            page = self.list_stream_events(stream_id, limit=100, wait_ms=0)
            for ev in reversed(page.data):
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
            return

        while True:
            page = self.list_stream_events(stream_id, limit=100, starting_after=cur, wait_ms=0)
            if not page.data:
                return
            for ev in page.data:
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
                cur = ev.completion_id
            if not page.has_more:
                return

    @staticmethod
    def parse_inference_requests_jsonl(path: Path, model: str | None = None) -> list[InferenceRequest]:
        """Parse a JSONL workload file into InferenceRequest objects.

        Two line formats are accepted:
          * OpenAI batch-style envelope: {"custom_id", "method", "url", "body"}
            — body.model is required and the `model` argument is ignored;
          * content-only: {"content": "..."} — requires the `model` argument.

        Raises:
            ValueError: on an unsupported line format or a missing model.
        """
        parsed: list[InferenceRequest] = []
        warn_model_ignored = False
        with path.open("r", encoding="utf-8") as f:
            for idx, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                # OpenAI batch-style envelope: {"custom_id", "method", "url", "body"}
                if isinstance(obj, dict) and {"method", "url", "body"}.issubset(obj.keys()):
                    if model:
                        warn_model_ignored = True
                    body = obj.get("body")
                    if not isinstance(body, dict) or not body.get("model"):
                        raise ValueError(f"Invalid JSONL line {idx}: body.model is required")
                    parsed.append(InferenceRequest(custom_id=obj.get("custom_id"), body=body))
                    continue
                # Content-only format: {"content": "..."}
                if isinstance(obj, dict) and "content" in obj:
                    if not model:
                        raise ValueError("model is required for content-only JSONL format")
                    parsed.append(
                        InferenceRequest(
                            custom_id=obj.get("custom_id") or f"request-{idx}",
                            body={
                                "model": model,
                                "messages": [{"role": "user", "content": obj["content"]}],
                            },
                        )
                    )
                    continue
                raise ValueError(f"Unsupported JSONL line format at line {idx}")
        if warn_model_ignored:
            # Warn once per file rather than once per envelope line.
            warnings.warn(
                "model argument is ignored for OpenAI-compatible JSONL lines",
                stacklevel=2,
            )
        return parsed

    @staticmethod
    def _parse_jsonl(path: Path, model: str | None = None) -> list[dict[str, Any]]:
        """Backwards-compatible JSONL parser (returns plain dicts).

        Prefer `parse_inference_requests_jsonl`, which returns `InferenceRequest` objects.
        """
        return [r.model_dump() for r in SferenceClient.parse_inference_requests_jsonl(path, model=model)]
@@ -0,0 +1,185 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Literal
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
# Lifecycle states for a batch; "completed", "failed" and "cancelled" are the
# terminal states the client's wait_for_completion loop stops on.
BatchStatus = Literal["pending", "running", "completed", "failed", "cancelled"]


class InferenceRequest(BaseModel):
    """Unified inference request schema for batches and streams.

    custom_id is user-provided and can be used for correlation.
    """

    # extra="forbid" rejects unknown fields so malformed envelopes fail fast.
    model_config = ConfigDict(extra="forbid")

    custom_id: str | None = Field(default=None, description="User-provided identifier for correlation.")
    body: dict[str, Any] = Field(description="OpenAI-style request body. Must include `model`.")
21
+
22
+
23
class LoginResponse(BaseModel):
    """Payload of POST /v1/auth/login; access_token becomes the bearer token."""

    access_token: str
    token_type: str
    user: dict[str, str]


class Batch(BaseModel):
    """Batch resource as returned by the /v1/batches endpoints."""

    id: str
    status: BatchStatus
    window: Literal["24h"]
    request_count: int
    created_at: str
    updated_at: str
    completed_at: str | None = None
    # Usage totals default to 0 for payloads that omit them.
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    total_tokens: int = 0


class BatchList(BaseModel):
    """Envelope for GET /v1/batches."""

    items: list[Batch]


class BatchResults(BaseModel):
    """Result summary for GET /v1/batches/{id}/results."""

    batch_id: str
    status: BatchStatus
    output_url: str | None = None
    completed_at: str | None = None
    results: list[dict[str, Any]] | None = None


class BatchCreatePayload(BaseModel):
    """Request body for POST /v1/batches; at least one request is required."""

    window: Literal["24h"] = "24h"
    requests: list[InferenceRequest] = Field(min_length=1)
57
+
58
+
59
StreamStatus = Literal["open", "cancelled", "archived"]
StreamWindow = Literal["1h", "24h"]


class Stream(BaseModel):
    """Stream detail (GET /v1/streams/{id}); list responses omit counter fields (defaults apply)."""

    id: str
    name: str
    window: StreamWindow
    status: StreamStatus
    created_at: str
    updated_at: str
    cancelled_at: str | None = None
    archived_at: str | None = None
    # Counters default so list payloads (which omit them) still validate.
    total_items: int = 0
    pending_items: int = 0
    running_items: int = 0
    completed_items: int = 0
    failed_items: int = 0
    cancelled_items: int = 0
    completion_ratio: float = 0.0
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    total_tokens: int = 0
    last_completion_at: str | None = None
    latest_completion_id: str | None = None


class StreamInferenceCompletionEvent(BaseModel):
    """One completion event from GET /v1/streams/{id}/events."""

    # completion_id doubles as the pagination cursor (starting_after/ending_before).
    completion_id: str
    custom_id: str | None = None
    status: str
    result: dict[str, Any] | None = None
    error: dict[str, Any] | None = None
    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    total_tokens: int | None = None
    completed_at: str | None = None


class StreamEventList(BaseModel):
    """Page of stream events; has_more signals that further pages exist."""

    object: str = "list"
    data: list[StreamInferenceCompletionEvent]
    has_more: bool


class StreamList(BaseModel):
    """Envelope for GET /v1/streams."""

    items: list[Stream]
108
+
109
+
110
# Response (OpenAI-compatible) models
ResponseStatus = Literal["in_progress", "completed", "failed", "cancelled"]


class ResponseInputMessage(BaseModel):
    """Message in the input array for a response."""

    role: Literal["user", "assistant", "system", "developer"]
    content: str


class ResponseCreatePayload(BaseModel):
    """Request body for POST /v1/responses."""

    # extra="forbid" rejects unknown fields so client typos fail fast.
    model_config = ConfigDict(extra="forbid")

    model: str = Field(description='Model identifier, e.g. "zai-org/GLM-5"')
    input: list[ResponseInputMessage] = Field(min_length=1)
    instructions: str | None = None
    max_output_tokens: int | None = None
    # Sampling knobs are range-validated (temperature 0–2, top_p 0–1).
    temperature: float | None = Field(default=None, ge=0.0, le=2.0)
    top_p: float | None = Field(default=None, ge=0.0, le=1.0)
    metadata: dict[str, Any] = Field(default_factory=dict)
133
+
134
+
135
class ResponseOutputContent(BaseModel):
    """Content item in a response output."""

    type: Literal["output_text"] = "output_text"
    text: str


class ResponseUsage(BaseModel):
    """Token usage for a response."""

    input_tokens: int
    output_tokens: int
    total_tokens: int


class ResponseError(BaseModel):
    """Error information for a failed response."""

    code: str
    message: str


class Response(BaseModel):
    """OpenAI-compatible response object."""

    id: str
    object: Literal["response"] = "response"
    created_at: int
    model: str
    status: ResponseStatus
    # output/error/usage are populated according to status; all optional.
    output: list[ResponseOutputContent] | None = None
    error: ResponseError | None = None
    usage: ResponseUsage | None = None


class ResponseListItem(BaseModel):
    """Item in a response list (summary fields only; no output/usage)."""

    id: str
    object: Literal["response"] = "response"
    created_at: int
    model: str
    status: ResponseStatus


class ResponseList(BaseModel):
    """Paginated list of responses."""

    object: Literal["list"] = "list"
    data: list[ResponseListItem]
    has_more: bool = False