sference-sdk 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sference_sdk-0.0.1/.gitignore +10 -0
- sference_sdk-0.0.1/PKG-INFO +186 -0
- sference_sdk-0.0.1/README.md +177 -0
- sference_sdk-0.0.1/pyproject.toml +20 -0
- sference_sdk-0.0.1/sference_sdk/__init__.py +32 -0
- sference_sdk-0.0.1/sference_sdk/async_client.py +295 -0
- sference_sdk-0.0.1/sference_sdk/checkpoint.py +62 -0
- sference_sdk-0.0.1/sference_sdk/client.py +357 -0
- sference_sdk-0.0.1/sference_sdk/models.py +185 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sference-sdk
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Python SDK for the sference batch API
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: httpx>=0.28.1
|
|
7
|
+
Requires-Dist: pydantic>=2.12.5
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# sference Python SDK
|
|
11
|
+
|
|
12
|
+
Installable package: `sference-sdk` (import: `sference_sdk`). Used by the `sference` CLI and your own automation.
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install sference-sdk
|
|
18
|
+
# or: uv add sference-sdk
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
Set `SFERENCE_API_KEY` and optional `SFERENCE_BASE_URL` (default `https://api.sference.com`), or pass `api_key=` / `base_url=` to the client.
|
|
24
|
+
|
|
25
|
+
### Batches (sync)
|
|
26
|
+
|
|
27
|
+
Best for a **fixed JSONL workload**: one submit, poll until terminal, then fetch structured results or download JSONL via the API.
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from sference_sdk import SferenceClient
|
|
31
|
+
|
|
32
|
+
client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
|
|
33
|
+
|
|
34
|
+
batch = client.submit_batch(
|
|
35
|
+
input_file="./workload.jsonl",
|
|
36
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
37
|
+
window="24h",
|
|
38
|
+
)
|
|
39
|
+
done = client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
|
|
40
|
+
results = client.get_results(done.id)
|
|
41
|
+
print(results.status, results.output_url)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Use a `model` supported by your sference deployment.
|
|
45
|
+
|
|
46
|
+
### OpenAI-compatible responses (sync)
|
|
47
|
+
|
|
48
|
+
Standalone or stream-associated jobs via `POST /v1/responses`. Keys need `responses:read` and `responses:write` (default on newly issued keys).
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from sference_sdk import SferenceClient
|
|
52
|
+
|
|
53
|
+
client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
|
|
54
|
+
|
|
55
|
+
created = client.create_response(
|
|
56
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
57
|
+
input=[{"role": "user", "content": "Hello"}],
|
|
58
|
+
metadata={"completion_window": "24h"},
|
|
59
|
+
)
|
|
60
|
+
row = client.get_response(created.id)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
For a stream, add `stream_id` inside `metadata` next to `completion_window`.
|
|
64
|
+
|
|
65
|
+
### OpenAI Python SDK (`openai` package)
|
|
66
|
+
|
|
67
|
+
If you already use the official OpenAI client, point it at a sference-compatible **`/v1`** base URL and the same API key (with `responses:read` and `responses:write`).
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install openai
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import asyncio
|
|
75
|
+
import os
|
|
76
|
+
|
|
77
|
+
from openai import AsyncOpenAI
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def main() -> None:
|
|
81
|
+
client = AsyncOpenAI(
|
|
82
|
+
base_url="https://api.sference.com/v1",
|
|
83
|
+
api_key=os.environ["SFERENCE_API_KEY"],
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
response = await client.responses.create(
|
|
87
|
+
model="zai-org/GLM-5",
|
|
88
|
+
input=[{"role": "user", "content": "Hello, world!"}],
|
|
89
|
+
background=True,
|
|
90
|
+
)
|
|
91
|
+
# Poll GET /v1/responses/{id} until terminal; your openai version may expose
|
|
92
|
+
# something like await client.responses.retrieve(response.id), or use
|
|
93
|
+
# AsyncSferenceClient.get_response(response.id) with the same host and key.
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
asyncio.run(main())
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Self-hosted** (local API): use `base_url="http://127.0.0.1:8000/v1"` (or your `SFERENCE_BASE_URL` + `"/v1"`). **`model`** must match a model your inference workers consume.
|
|
100
|
+
|
|
101
|
+
**Metadata:** to set `completion_window` or `stream_id` like the native SDK, pass them in the request body your `openai` version supports (for example `metadata=` on `create`, or `extra_body={"metadata": {...}}` if the helper does not list those fields yet).
|
|
102
|
+
|
|
103
|
+
### Async client — batches
|
|
104
|
+
|
|
105
|
+
`AsyncSferenceClient` uses `httpx.AsyncClient` so batch polling can run alongside other async I/O without blocking threads.
|
|
106
|
+
|
|
107
|
+
**Use case:** You already know the full set of prompts (for example a JSONL file) and want one scheduled unit of work with a clear terminal state and bulk results.
|
|
108
|
+
|
|
109
|
+
**Benefits:** Simple lifecycle (submit → wait → fetch results), fits large static workloads and JSONL-heavy pipelines.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import asyncio
|
|
113
|
+
|
|
114
|
+
from sference_sdk import AsyncSferenceClient
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
async def main() -> None:
|
|
118
|
+
async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
|
|
119
|
+
batch = await client.submit_batch(
|
|
120
|
+
input_file="./workload.jsonl",
|
|
121
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
122
|
+
window="24h",
|
|
123
|
+
)
|
|
124
|
+
done = await client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
|
|
125
|
+
results = await client.get_results(done.id)
|
|
126
|
+
print(results.status, results.output_url)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
asyncio.run(main())
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Async client — streams
|
|
133
|
+
|
|
134
|
+
Stream-associated jobs use `create_response(..., metadata={"stream_id": ..., "completion_window": "24h"})`. Consume completions with `list_stream_events` (optional `wait_ms` long-poll) or `iter_stream_events` (paged replay; optional checkpoints align with CLI `stream tail`).
|
|
135
|
+
|
|
136
|
+
**Use case:** Work arrives over time, or you want one id to group many responses and observe completions as they land.
|
|
137
|
+
|
|
138
|
+
**Benefits:** Independent submits with aggregated progress, stream-level status in the API/UI, and efficient event tailing.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
import asyncio
|
|
142
|
+
|
|
143
|
+
from sference_sdk import AsyncSferenceClient
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
async def main() -> None:
|
|
147
|
+
async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
|
|
148
|
+
stream = await client.create_stream(name="sdk-demo", window="24h")
|
|
149
|
+
await client.create_response(
|
|
150
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
151
|
+
input=[{"role": "user", "content": "Hello"}],
|
|
152
|
+
metadata={"stream_id": stream.id, "completion_window": "24h"},
|
|
153
|
+
)
|
|
154
|
+
async for ev in client.iter_stream_events(stream.id, checkpoint=False):
|
|
155
|
+
print(ev.completion_id, ev.status)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
asyncio.run(main())
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## cURL (same API the SDK calls)
|
|
162
|
+
|
|
163
|
+
API keys need `responses:read` and `responses:write` (default on newly issued keys). `X-API-Key` or `Authorization: Bearer sk_...` are both accepted.
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
export TOKEN=sk_...
|
|
167
|
+
BASE_URL=https://api.sference.com
|
|
168
|
+
|
|
169
|
+
RID=$(curl -sS -X POST "${BASE_URL}/v1/responses" \
|
|
170
|
+
-H "X-API-Key: $TOKEN" \
|
|
171
|
+
-H 'Content-Type: application/json' \
|
|
172
|
+
-d '{
|
|
173
|
+
"model": "Qwen/Qwen2.5-7B-Instruct",
|
|
174
|
+
"input": [{"role": "user", "content": "Hello"}],
|
|
175
|
+
"metadata": {"completion_window": "24h"}
|
|
176
|
+
}' | jq -r '.id')
|
|
177
|
+
|
|
178
|
+
curl -sS "${BASE_URL}/v1/responses/${RID}" \
|
|
179
|
+
-H "X-API-Key: $TOKEN"
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
For self-hosted APIs, set `BASE_URL` to your API origin (no `/v1` suffix on `BASE_URL` here—the paths already include `/v1`). Without `jq`, read `id` from the POST JSON and substitute it in the GET URL.
|
|
183
|
+
|
|
184
|
+
## CLI
|
|
185
|
+
|
|
186
|
+
For `sference batch …` and `sference stream …` commands, see the [CLI README](../cli/README.md).
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# sference Python SDK
|
|
2
|
+
|
|
3
|
+
Installable package: `sference-sdk` (import: `sference_sdk`). Used by the `sference` CLI and your own automation.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install sference-sdk
|
|
9
|
+
# or: uv add sference-sdk
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Usage
|
|
13
|
+
|
|
14
|
+
Set `SFERENCE_API_KEY` and optional `SFERENCE_BASE_URL` (default `https://api.sference.com`), or pass `api_key=` / `base_url=` to the client.
|
|
15
|
+
|
|
16
|
+
### Batches (sync)
|
|
17
|
+
|
|
18
|
+
Best for a **fixed JSONL workload**: one submit, poll until terminal, then fetch structured results or download JSONL via the API.
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from sference_sdk import SferenceClient
|
|
22
|
+
|
|
23
|
+
client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
|
|
24
|
+
|
|
25
|
+
batch = client.submit_batch(
|
|
26
|
+
input_file="./workload.jsonl",
|
|
27
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
28
|
+
window="24h",
|
|
29
|
+
)
|
|
30
|
+
done = client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
|
|
31
|
+
results = client.get_results(done.id)
|
|
32
|
+
print(results.status, results.output_url)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Use a `model` supported by your sference deployment.
|
|
36
|
+
|
|
37
|
+
### OpenAI-compatible responses (sync)
|
|
38
|
+
|
|
39
|
+
Standalone or stream-associated jobs via `POST /v1/responses`. Keys need `responses:read` and `responses:write` (default on newly issued keys).
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from sference_sdk import SferenceClient
|
|
43
|
+
|
|
44
|
+
client = SferenceClient(api_key="sk_...", base_url="https://api.sference.com")
|
|
45
|
+
|
|
46
|
+
created = client.create_response(
|
|
47
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
48
|
+
input=[{"role": "user", "content": "Hello"}],
|
|
49
|
+
metadata={"completion_window": "24h"},
|
|
50
|
+
)
|
|
51
|
+
row = client.get_response(created.id)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
For a stream, add `stream_id` inside `metadata` next to `completion_window`.
|
|
55
|
+
|
|
56
|
+
### OpenAI Python SDK (`openai` package)
|
|
57
|
+
|
|
58
|
+
If you already use the official OpenAI client, point it at a sference-compatible **`/v1`** base URL and the same API key (with `responses:read` and `responses:write`).
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install openai
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import asyncio
|
|
66
|
+
import os
|
|
67
|
+
|
|
68
|
+
from openai import AsyncOpenAI
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
async def main() -> None:
|
|
72
|
+
client = AsyncOpenAI(
|
|
73
|
+
base_url="https://api.sference.com/v1",
|
|
74
|
+
api_key=os.environ["SFERENCE_API_KEY"],
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
response = await client.responses.create(
|
|
78
|
+
model="zai-org/GLM-5",
|
|
79
|
+
input=[{"role": "user", "content": "Hello, world!"}],
|
|
80
|
+
background=True,
|
|
81
|
+
)
|
|
82
|
+
# Poll GET /v1/responses/{id} until terminal; your openai version may expose
|
|
83
|
+
# something like await client.responses.retrieve(response.id), or use
|
|
84
|
+
# AsyncSferenceClient.get_response(response.id) with the same host and key.
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
asyncio.run(main())
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Self-hosted** (local API): use `base_url="http://127.0.0.1:8000/v1"` (or your `SFERENCE_BASE_URL` + `"/v1"`). **`model`** must match a model your inference workers consume.
|
|
91
|
+
|
|
92
|
+
**Metadata:** to set `completion_window` or `stream_id` like the native SDK, pass them in the request body your `openai` version supports (for example `metadata=` on `create`, or `extra_body={"metadata": {...}}` if the helper does not list those fields yet).
|
|
93
|
+
|
|
94
|
+
### Async client — batches
|
|
95
|
+
|
|
96
|
+
`AsyncSferenceClient` uses `httpx.AsyncClient` so batch polling can run alongside other async I/O without blocking threads.
|
|
97
|
+
|
|
98
|
+
**Use case:** You already know the full set of prompts (for example a JSONL file) and want one scheduled unit of work with a clear terminal state and bulk results.
|
|
99
|
+
|
|
100
|
+
**Benefits:** Simple lifecycle (submit → wait → fetch results), fits large static workloads and JSONL-heavy pipelines.
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
import asyncio
|
|
104
|
+
|
|
105
|
+
from sference_sdk import AsyncSferenceClient
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def main() -> None:
|
|
109
|
+
async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
|
|
110
|
+
batch = await client.submit_batch(
|
|
111
|
+
input_file="./workload.jsonl",
|
|
112
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
113
|
+
window="24h",
|
|
114
|
+
)
|
|
115
|
+
done = await client.wait_for_completion(batch.id, poll_interval=2.0, timeout=3600.0)
|
|
116
|
+
results = await client.get_results(done.id)
|
|
117
|
+
print(results.status, results.output_url)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
asyncio.run(main())
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Async client — streams
|
|
124
|
+
|
|
125
|
+
Stream-associated jobs use `create_response(..., metadata={"stream_id": ..., "completion_window": "24h"})`. Consume completions with `list_stream_events` (optional `wait_ms` long-poll) or `iter_stream_events` (paged replay; optional checkpoints align with CLI `stream tail`).
|
|
126
|
+
|
|
127
|
+
**Use case:** Work arrives over time, or you want one id to group many responses and observe completions as they land.
|
|
128
|
+
|
|
129
|
+
**Benefits:** Independent submits with aggregated progress, stream-level status in the API/UI, and efficient event tailing.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
import asyncio
|
|
133
|
+
|
|
134
|
+
from sference_sdk import AsyncSferenceClient
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
async def main() -> None:
|
|
138
|
+
async with AsyncSferenceClient(api_key="sk_...", base_url="https://api.sference.com") as client:
|
|
139
|
+
stream = await client.create_stream(name="sdk-demo", window="24h")
|
|
140
|
+
await client.create_response(
|
|
141
|
+
model="Qwen/Qwen2.5-7B-Instruct",
|
|
142
|
+
input=[{"role": "user", "content": "Hello"}],
|
|
143
|
+
metadata={"stream_id": stream.id, "completion_window": "24h"},
|
|
144
|
+
)
|
|
145
|
+
async for ev in client.iter_stream_events(stream.id, checkpoint=False):
|
|
146
|
+
print(ev.completion_id, ev.status)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
asyncio.run(main())
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## cURL (same API the SDK calls)
|
|
153
|
+
|
|
154
|
+
API keys need `responses:read` and `responses:write` (default on newly issued keys). `X-API-Key` or `Authorization: Bearer sk_...` are both accepted.
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
export TOKEN=sk_...
|
|
158
|
+
BASE_URL=https://api.sference.com
|
|
159
|
+
|
|
160
|
+
RID=$(curl -sS -X POST "${BASE_URL}/v1/responses" \
|
|
161
|
+
-H "X-API-Key: $TOKEN" \
|
|
162
|
+
-H 'Content-Type: application/json' \
|
|
163
|
+
-d '{
|
|
164
|
+
"model": "Qwen/Qwen2.5-7B-Instruct",
|
|
165
|
+
"input": [{"role": "user", "content": "Hello"}],
|
|
166
|
+
"metadata": {"completion_window": "24h"}
|
|
167
|
+
}' | jq -r '.id')
|
|
168
|
+
|
|
169
|
+
curl -sS "${BASE_URL}/v1/responses/${RID}" \
|
|
170
|
+
-H "X-API-Key: $TOKEN"
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
For self-hosted APIs, set `BASE_URL` to your API origin (no `/v1` suffix on `BASE_URL` here—the paths already include `/v1`). Without `jq`, read `id` from the POST JSON and substitute it in the GET URL.
|
|
174
|
+
|
|
175
|
+
## CLI
|
|
176
|
+
|
|
177
|
+
For `sference batch …` and `sference stream …` commands, see the [CLI README](../cli/README.md).
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Packaging metadata for the sference Python SDK (PEP 621).
[project]
name = "sference-sdk"
version = "0.0.1"
description = "Python SDK for the sference batch API"
readme = "README.md"
requires-python = ">=3.12"
# Runtime dependencies: httpx for HTTP transport, pydantic for response models.
dependencies = [
    "httpx>=0.28.1",
    "pydantic>=2.12.5",
]

# Build with hatchling (PEP 517 backend).
[build-system]
requires = ["hatchling>=1.29.0"]
build-backend = "hatchling.build"

# Ship only the sference_sdk package in wheels and sdists.
[tool.hatch.build.targets.wheel]
packages = ["sference_sdk"]

[tool.hatch.build.targets.sdist]
include = ["sference_sdk"]
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Public surface of the ``sference_sdk`` package.

Re-exports the sync and async clients, the stream-checkpoint helpers, and
the response models so callers can import everything from the package root,
e.g. ``from sference_sdk import SferenceClient``.
"""

from .async_client import AsyncSferenceClient
from .checkpoint import clear_checkpoint, load_checkpoint, save_checkpoint
from .client import ApiError, SferenceClient
from .models import (
    Batch,
    BatchList,
    BatchResults,
    InferenceRequest,
    LoginResponse,
    Stream,
    StreamInferenceCompletionEvent,
    StreamEventList,
    StreamList,
)

# Explicit public API; keep in sync with the imports above.
__all__ = [
    "ApiError",
    "AsyncSferenceClient",
    "SferenceClient",
    "Batch",
    "BatchList",
    "BatchResults",
    "LoginResponse",
    "InferenceRequest",
    "Stream",
    "StreamInferenceCompletionEvent",
    "StreamEventList",
    "StreamList",
    "clear_checkpoint",
    "load_checkpoint",
    "save_checkpoint",
]
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
import warnings
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, AsyncIterator, BinaryIO
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from .client import ApiError, SferenceClient
|
|
14
|
+
from .checkpoint import clear_checkpoint, load_checkpoint, save_checkpoint
|
|
15
|
+
from .models import (
|
|
16
|
+
Batch,
|
|
17
|
+
BatchCreatePayload,
|
|
18
|
+
BatchList,
|
|
19
|
+
BatchResults,
|
|
20
|
+
InferenceRequest,
|
|
21
|
+
LoginResponse,
|
|
22
|
+
Response,
|
|
23
|
+
ResponseList,
|
|
24
|
+
Stream,
|
|
25
|
+
StreamInferenceCompletionEvent,
|
|
26
|
+
StreamEventList,
|
|
27
|
+
StreamList,
|
|
28
|
+
StreamWindow,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class AsyncSferenceClient:
    """Async HTTP client for the sference batch API (``httpx.AsyncClient``).

    ``base_url`` and ``api_key`` fall back to the ``SFERENCE_BASE_URL`` and
    ``SFERENCE_API_KEY`` environment variables when not passed explicitly.
    Use as an async context manager (or call :meth:`aclose`) so the
    underlying connection pool is released.
    """

    def __init__(
        self,
        base_url: str | None = None,
        api_key: str | None = None,
        transport: httpx.AsyncBaseTransport | None = None,
    ) -> None:
        self.base_url = base_url or os.getenv("SFERENCE_BASE_URL", "https://api.sference.com")
        self._token = api_key or os.getenv("SFERENCE_API_KEY")
        # ``transport`` is exposed mainly for tests (e.g. httpx.MockTransport).
        self._client = httpx.AsyncClient(base_url=self.base_url, timeout=30.0, transport=transport)

    async def aclose(self) -> None:
        """Close the underlying ``httpx.AsyncClient`` and its connection pool."""
        await self._client.aclose()

    async def __aenter__(self) -> AsyncSferenceClient:
        return self

    async def __aexit__(self, *_: Any) -> None:
        await self.aclose()

    def _headers(self) -> dict[str, str]:
        """Build per-request headers; attaches the bearer token once one is known."""
        headers = {"content-type": "application/json"}
        if self._token:
            headers["authorization"] = f"Bearer {self._token}"
        return headers

    @staticmethod
    def _raise_for_error(response: httpx.Response) -> None:
        """Raise :class:`ApiError` for 4xx/5xx responses, embedding the body.

        Falls back to the raw text when the error body is not valid JSON.
        """
        if response.status_code >= 400:
            try:
                payload = response.json()
            except Exception:
                payload = {"detail": response.text}
            raise ApiError(f"{response.status_code}: {payload}")

    async def _request_response(
        self,
        method: str,
        path: str,
        json_body: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
    ) -> httpx.Response:
        """Issue a request and return the raw response.

        Raises:
            ApiError: for any 4xx/5xx status.
        """
        response = await self._client.request(
            method, path, headers=self._headers(), json=json_body, params=params
        )
        self._raise_for_error(response)
        return response

    async def _request(
        self,
        method: str,
        path: str,
        json_body: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
    ) -> Any:
        """Issue a request and return the decoded JSON body.

        ``params`` (query parameters) is new and backward-compatible; it was
        previously unsupported, which made :meth:`list_responses` raise
        ``TypeError`` before the request ever left the process.
        """
        response = await self._request_response(method, path, json_body, params)
        return response.json()

    async def login(self, username: str, password: str) -> LoginResponse:
        """Authenticate and remember the returned access token for later calls."""
        payload = await self._request("POST", "/v1/auth/login", {"username": username, "password": password})
        result = LoginResponse.model_validate(payload)
        self._token = result.access_token
        return result

    async def get_me(self) -> dict[str, Any]:
        """Return the authenticated principal as reported by the API."""
        return await self._request("GET", "/v1/auth/me")

    async def submit_batch(
        self,
        *,
        input_file: str | None = None,
        requests: list[dict[str, Any]] | None = None,
        model: str | None = None,
        window: str = "24h",
    ) -> Batch:
        """Create a batch from a JSONL file or an in-memory list of requests.

        ``input_file`` takes precedence over ``requests`` when both are given.
        Only the "24h" completion window is accepted for now.

        Raises:
            ValueError: if ``window`` is not "24h" or no requests resolve.
            TypeError: if ``requests`` mixes dicts with other object types.
        """
        if window != "24h":
            raise ValueError('Only window "24h" is supported in MVP.')
        resolved = requests or []
        if input_file:
            # Reuse the sync client's JSONL parser so both clients accept
            # identical input files.
            resolved = SferenceClient.parse_inference_requests_jsonl(Path(input_file), model=model)
        if not resolved:
            raise ValueError("At least one request is required (use input_file or non-empty requests)")
        if isinstance(resolved, list) and resolved and isinstance(resolved[0], dict):
            reqs: list[InferenceRequest] = []
            for r in resolved:
                if not isinstance(r, dict):
                    raise TypeError("requests must be a list of dicts or InferenceRequest objects")
                body = r.get("body")
                # OpenAI batch-file rows carry method/url/body envelopes;
                # unwrap those, otherwise validate the dict directly.
                if isinstance(body, dict) and {"method", "url", "body"}.issubset(r.keys()):
                    reqs.append(InferenceRequest(custom_id=r.get("custom_id"), body=body))
                else:
                    reqs.append(InferenceRequest.model_validate(r))
            resolved = reqs
        payload = BatchCreatePayload(window="24h", requests=resolved)  # type: ignore[arg-type]
        response = await self._request("POST", "/v1/batches", payload.model_dump())
        return Batch.model_validate(response)

    async def get_batch(self, batch_id: str) -> Batch:
        """Fetch a single batch by id."""
        response = await self._request("GET", f"/v1/batches/{batch_id}")
        return Batch.model_validate(response)

    async def list_batches(self) -> BatchList:
        """List batches visible to the current credentials."""
        response = await self._request("GET", "/v1/batches")
        return BatchList.model_validate(response)

    async def get_results(self, batch_id: str) -> BatchResults:
        """Fetch structured results for a batch."""
        response = await self._request("GET", f"/v1/batches/{batch_id}/results")
        return BatchResults.model_validate(response)

    async def cancel_batch(self, batch_id: str) -> Batch:
        """Request cancellation of a batch; returns the updated batch."""
        response = await self._request("POST", f"/v1/batches/{batch_id}/cancel")
        return Batch.model_validate(response)

    async def download_results_jsonl(self, batch_id: str, out: str | Path | BinaryIO) -> None:
        """Download the raw JSONL results to a path or writable binary stream."""
        resp = await self._request_response("GET", f"/v1/batches/{batch_id}/results.jsonl")
        if isinstance(out, (str, Path)):
            Path(out).write_bytes(resp.content)
            return
        out.write(resp.content)

    async def wait_for_completion(self, batch_id: str, poll_interval: float = 1.0, timeout: float = 30.0) -> Batch:
        """Poll a batch until it reaches a terminal state.

        Raises:
            TimeoutError: if the batch is still running after ``timeout`` seconds.
        """
        # monotonic() is immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while True:
            batch = await self.get_batch(batch_id)
            if batch.status in ("completed", "failed", "cancelled"):
                return batch
            if time.monotonic() >= deadline:
                raise TimeoutError(f"Timed out waiting for batch {batch_id}")
            await asyncio.sleep(poll_interval)

    async def create_stream(self, name: str, window: StreamWindow = "24h") -> Stream:
        """Create a named stream with the given completion window."""
        response = await self._request("POST", "/v1/streams", {"name": name, "window": window})
        return Stream.model_validate(response)

    async def list_streams(self) -> StreamList:
        """List streams visible to the current credentials."""
        response = await self._request("GET", "/v1/streams")
        return StreamList.model_validate(response)

    async def get_stream(self, stream_id: str) -> Stream:
        """Fetch a single stream by id."""
        response = await self._request("GET", f"/v1/streams/{stream_id}")
        return Stream.model_validate(response)

    # Note: POST /v1/streams/{id}/items has been removed. Use create_response() with metadata.stream_id instead.

    async def create_response(
        self,
        *,
        model: str,
        input: list[dict[str, str]],  # [{"role": "user", "content": "..."}]
        instructions: str | None = None,
        max_output_tokens: int | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Response:
        """Create a standalone response or stream-associated response.

        For stream-associated, include stream_id in metadata:
        metadata={"stream_id": "uuid", "completion_window": "24h"}
        """
        payload: dict[str, Any] = {
            "model": model,
            "input": input,
        }
        # Only send optional fields the caller actually set, so server-side
        # defaults apply otherwise.
        if instructions is not None:
            payload["instructions"] = instructions
        if max_output_tokens is not None:
            payload["max_output_tokens"] = max_output_tokens
        if temperature is not None:
            payload["temperature"] = temperature
        if top_p is not None:
            payload["top_p"] = top_p
        if metadata is not None:
            payload["metadata"] = metadata

        response = await self._request("POST", "/v1/responses", payload)
        return Response.model_validate(response)

    async def get_response(self, response_id: str) -> Response:
        """Get a response by ID."""
        response = await self._request("GET", f"/v1/responses/{response_id}")
        return Response.model_validate(response)

    async def cancel_response(self, response_id: str) -> Response:
        """Cancel a pending or running response."""
        response = await self._request("DELETE", f"/v1/responses/{response_id}")
        return Response.model_validate(response)

    async def list_responses(
        self,
        *,
        limit: int = 100,
        stream_id: str | None = None,
    ) -> ResponseList:
        """List responses. Optionally filter by stream_id.

        Bug fix: this previously passed ``params=`` to ``_request``, which did
        not accept it, so every call raised ``TypeError``. ``_request`` now
        forwards query parameters.
        """
        params: dict[str, Any] = {"limit": limit}
        if stream_id is not None:
            params["stream_id"] = stream_id

        response = await self._request("GET", "/v1/responses", params=params)
        return ResponseList.model_validate(response)

    async def archive_stream(self, stream_id: str) -> Stream:
        """Mark a stream archived; returns the updated stream."""
        response = await self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "archived"})
        return Stream.model_validate(response)

    async def cancel_stream(self, stream_id: str) -> Stream:
        """Mark a stream cancelled; returns the updated stream."""
        response = await self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "cancelled"})
        return Stream.model_validate(response)

    async def list_stream_events(
        self,
        stream_id: str,
        *,
        limit: int = 20,
        starting_after: str | None = None,
        ending_before: str | None = None,
        wait_ms: int = 0,
    ) -> StreamEventList:
        """Fetch one page of completion events for a stream.

        ``wait_ms`` > 0 asks the server to long-poll before answering an
        otherwise-empty page.
        """
        params: dict[str, Any] = {"limit": limit, "wait_ms": wait_ms}
        if starting_after is not None:
            params["starting_after"] = starting_after
        if ending_before is not None:
            params["ending_before"] = ending_before
        response = await self._request("GET", f"/v1/streams/{stream_id}/events", params=params)
        return StreamEventList.model_validate(response)

    async def iter_stream_events(
        self,
        stream_id: str,
        *,
        consumer_name: str = "default",
        starting_after: str | None = None,
        checkpoint: bool = True,
        from_latest: bool = False,
    ) -> AsyncIterator[StreamInferenceCompletionEvent]:
        """Replay completion events, optionally resuming from a local checkpoint.

        Resume order: explicit ``starting_after``, then the on-disk checkpoint
        (unless ``checkpoint`` is False or ``from_latest`` cleared it), then a
        single most-recent page.
        """
        if from_latest:
            clear_checkpoint(self.base_url, stream_id, consumer_name)
        cur: str | None = starting_after
        if checkpoint and cur is None:
            cur = load_checkpoint(self.base_url, stream_id, consumer_name)

        if cur is None:
            # No cursor: yield one page. The page is reversed — presumably the
            # API returns newest-first and callers want chronological order
            # (TODO confirm against the events endpoint).
            page = await self.list_stream_events(stream_id, limit=100, wait_ms=0)
            for ev in reversed(page.data):
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
            return

        while True:
            page = await self.list_stream_events(stream_id, limit=100, starting_after=cur, wait_ms=0)
            if not page.data:
                return
            for ev in page.data:
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
                # Advance the cursor per event so an interrupted replay resumes
                # exactly after the last yielded completion.
                cur = ev.completion_id
            if not page.has_more:
                return
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _store_path() -> Path:
|
|
11
|
+
raw = os.environ.get("SFERENCE_STREAM_CHECKPOINTS")
|
|
12
|
+
if raw:
|
|
13
|
+
return Path(raw)
|
|
14
|
+
return Path.home() / ".sference" / "stream_checkpoints.json"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _checkpoint_key(base_url: str, stream_id: str, consumer_name: str) -> str:
|
|
18
|
+
return f"{base_url.rstrip('/')}:{stream_id}:{consumer_name}"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_all(path: Path) -> dict[str, Any]:
|
|
22
|
+
if not path.exists():
|
|
23
|
+
return {}
|
|
24
|
+
try:
|
|
25
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
26
|
+
return data if isinstance(data, dict) else {}
|
|
27
|
+
except (json.JSONDecodeError, OSError):
|
|
28
|
+
return {}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _write_all(path: Path, data: dict[str, Any]) -> None:
|
|
32
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
33
|
+
path.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def load_checkpoint(base_url: str, stream_id: str, consumer_name: str) -> str | None:
    """Return the last checkpointed completion id, or None when absent.

    Also reads "last_event_id" -- presumably a field written by an older
    store format; confirm before removing.
    """
    record = _load_all(_store_path()).get(_checkpoint_key(base_url, stream_id, consumer_name))
    if not isinstance(record, dict):
        return None
    value = record.get("last_completion_id") or record.get("last_event_id")
    return str(value) if value else None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def save_checkpoint(base_url: str, stream_id: str, consumer_name: str, last_completion_id: str) -> None:
    """Record *last_completion_id* for the given consumer, with a UTC timestamp."""
    store_path = _store_path()
    store = _load_all(store_path)
    store[_checkpoint_key(base_url, stream_id, consumer_name)] = {
        "last_completion_id": last_completion_id,
        "updated_at": datetime.now(tz=timezone.utc).isoformat(),
    }
    _write_all(store_path, store)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def clear_checkpoint(base_url: str, stream_id: str, consumer_name: str) -> None:
    """Forget the stored checkpoint for the given consumer, if any.

    Skips the write entirely when the key is absent, so clearing a
    checkpoint that was never saved does not create or rewrite the store
    file (the previous version always wrote it back).
    """
    path = _store_path()
    key = _checkpoint_key(base_url, stream_id, consumer_name)
    data = _load_all(path)
    if key in data:
        del data[key]
        _write_all(path, data)
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import warnings
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, BinaryIO, Iterator
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from .checkpoint import clear_checkpoint, load_checkpoint, save_checkpoint
|
|
13
|
+
from .models import (
|
|
14
|
+
Batch,
|
|
15
|
+
BatchCreatePayload,
|
|
16
|
+
BatchList,
|
|
17
|
+
BatchResults,
|
|
18
|
+
InferenceRequest,
|
|
19
|
+
LoginResponse,
|
|
20
|
+
Response,
|
|
21
|
+
ResponseList,
|
|
22
|
+
Stream,
|
|
23
|
+
StreamInferenceCompletionEvent,
|
|
24
|
+
StreamEventList,
|
|
25
|
+
StreamList,
|
|
26
|
+
StreamWindow,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ApiError(Exception):
    """Raised for any API response with status >= 400; the message carries the status code and payload."""

    pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SferenceClient:
    """Synchronous client for the sference batch/stream HTTP API."""

    def __init__(
        self,
        base_url: str | None = None,
        api_key: str | None = None,
        transport: httpx.BaseTransport | None = None,
    ) -> None:
        """Create a client.

        Args:
            base_url: API root; falls back to SFERENCE_BASE_URL, then the
                public default.
            api_key: bearer token; falls back to SFERENCE_API_KEY. May also
                be obtained later via login().
            transport: optional httpx transport override (useful in tests).
        """
        self.base_url = base_url or os.getenv("SFERENCE_BASE_URL", "https://api.sference.com")
        self._token = api_key or os.getenv("SFERENCE_API_KEY")
        self._client = httpx.Client(base_url=self.base_url, timeout=30.0, transport=transport)
|
|
44
|
+
|
|
45
|
+
    def close(self) -> None:
        """Release the underlying httpx connection pool."""
        self._client.close()

    def __enter__(self) -> "SferenceClient":
        """Support `with SferenceClient(...) as client:` usage."""
        return self

    def __exit__(self, *_: Any) -> None:
        """Close the client on context exit, regardless of exceptions."""
        self.close()
|
|
53
|
+
|
|
54
|
+
def _headers(self) -> dict[str, str]:
|
|
55
|
+
headers = {"content-type": "application/json"}
|
|
56
|
+
if self._token:
|
|
57
|
+
headers["authorization"] = f"Bearer {self._token}"
|
|
58
|
+
return headers
|
|
59
|
+
|
|
60
|
+
def _request(self, method: str, path: str, json_body: dict[str, Any] | None = None) -> Any:
|
|
61
|
+
response = self._client.request(method, path, headers=self._headers(), json=json_body)
|
|
62
|
+
if response.status_code >= 400:
|
|
63
|
+
try:
|
|
64
|
+
payload = response.json()
|
|
65
|
+
except Exception:
|
|
66
|
+
payload = {"detail": response.text}
|
|
67
|
+
raise ApiError(f"{response.status_code}: {payload}")
|
|
68
|
+
return response.json()
|
|
69
|
+
|
|
70
|
+
def _request_response(self, method: str, path: str, json_body: dict[str, Any] | None = None) -> httpx.Response:
|
|
71
|
+
response = self._client.request(method, path, headers=self._headers(), json=json_body)
|
|
72
|
+
if response.status_code >= 400:
|
|
73
|
+
try:
|
|
74
|
+
payload = response.json()
|
|
75
|
+
except Exception:
|
|
76
|
+
payload = {"detail": response.text}
|
|
77
|
+
raise ApiError(f"{response.status_code}: {payload}")
|
|
78
|
+
return response
|
|
79
|
+
|
|
80
|
+
def login(self, username: str, password: str) -> LoginResponse:
|
|
81
|
+
payload = self._request("POST", "/v1/auth/login", {"username": username, "password": password})
|
|
82
|
+
result = LoginResponse.model_validate(payload)
|
|
83
|
+
self._token = result.access_token
|
|
84
|
+
return result
|
|
85
|
+
|
|
86
|
+
    def get_me(self) -> dict[str, Any]:
        """Fetch the current authenticated identity from /v1/auth/me (raw JSON dict)."""
        return self._request("GET", "/v1/auth/me")
|
|
88
|
+
|
|
89
|
+
    def submit_batch(
        self,
        *,
        input_file: str | None = None,
        requests: list[dict[str, Any]] | None = None,
        model: str | None = None,
        window: str = "24h",
    ) -> Batch:
        """Submit a batch of inference requests.

        `input_file` (a JSONL path, parsed via parse_inference_requests_jsonl)
        takes precedence over the `requests` list when both are given.

        Args:
            input_file: path to a JSONL file of requests.
            requests: list of dicts -- either OpenAI batch envelopes
                ({"custom_id", "method", "url", "body"}) or
                InferenceRequest-shaped dicts.
            model: default model for content-only JSONL lines.
            window: completion window; only "24h" is accepted.

        Raises:
            ValueError: if window != "24h" or no requests were provided.
            TypeError: if `requests` mixes dicts with non-dict entries.
        """
        if window != "24h":
            raise ValueError('Only window "24h" is supported in MVP.')
        resolved: list[InferenceRequest] | list[dict[str, Any]] = requests or []
        if input_file:
            resolved = self.parse_inference_requests_jsonl(Path(input_file), model=model)
        if not resolved:
            raise ValueError("At least one request is required (use input_file or non-empty requests)")
        # Only the first element's type is checked here; a mixed list is
        # rejected inside the loop below.
        if isinstance(resolved, list) and resolved and isinstance(resolved[0], dict):
            # Back-compat: accept OpenAI batch-style envelopes in `requests=[...]`.
            reqs: list[InferenceRequest] = []
            for r in resolved:
                if not isinstance(r, dict):
                    raise TypeError("requests must be a list of dicts or InferenceRequest objects")
                body = r.get("body")
                if isinstance(body, dict) and {"method", "url", "body"}.issubset(r.keys()):
                    # Envelope form: lift custom_id and body out of the wrapper.
                    reqs.append(InferenceRequest(custom_id=r.get("custom_id"), body=body))
                else:
                    # Already InferenceRequest-shaped; validate directly.
                    reqs.append(InferenceRequest.model_validate(r))
            resolved = reqs
        payload = BatchCreatePayload(window="24h", requests=resolved)  # type: ignore[arg-type]
        response = self._request("POST", "/v1/batches", payload.model_dump())
        return Batch.model_validate(response)
|
|
119
|
+
|
|
120
|
+
    def get_batch(self, batch_id: str) -> Batch:
        """Fetch a single batch by id."""
        response = self._request("GET", f"/v1/batches/{batch_id}")
        return Batch.model_validate(response)
|
|
123
|
+
|
|
124
|
+
    def list_batches(self) -> BatchList:
        """List the caller's batches (no pagination parameters exposed here)."""
        response = self._request("GET", "/v1/batches")
        return BatchList.model_validate(response)
|
|
127
|
+
|
|
128
|
+
    def get_results(self, batch_id: str) -> BatchResults:
        """Fetch a batch's structured results."""
        response = self._request("GET", f"/v1/batches/{batch_id}/results")
        return BatchResults.model_validate(response)
|
|
131
|
+
|
|
132
|
+
    def cancel_batch(self, batch_id: str) -> Batch:
        """Request cancellation of a batch and return its updated state."""
        response = self._request("POST", f"/v1/batches/{batch_id}/cancel")
        return Batch.model_validate(response)
|
|
135
|
+
|
|
136
|
+
def download_results_jsonl(self, batch_id: str, out: str | Path | BinaryIO) -> None:
|
|
137
|
+
resp = self._request_response("GET", f"/v1/batches/{batch_id}/results.jsonl")
|
|
138
|
+
if isinstance(out, (str, Path)):
|
|
139
|
+
Path(out).write_bytes(resp.content)
|
|
140
|
+
return
|
|
141
|
+
out.write(resp.content)
|
|
142
|
+
|
|
143
|
+
def wait_for_completion(self, batch_id: str, poll_interval: float = 1.0, timeout: float = 30.0) -> Batch:
|
|
144
|
+
deadline = time.time() + timeout
|
|
145
|
+
while True:
|
|
146
|
+
batch = self.get_batch(batch_id)
|
|
147
|
+
if batch.status in ("completed", "failed", "cancelled"):
|
|
148
|
+
return batch
|
|
149
|
+
if time.time() >= deadline:
|
|
150
|
+
raise TimeoutError(f"Timed out waiting for batch {batch_id}")
|
|
151
|
+
time.sleep(poll_interval)
|
|
152
|
+
|
|
153
|
+
    def create_stream(self, name: str, window: StreamWindow = "24h") -> Stream:
        """Create a new stream with the given display name and completion window."""
        response = self._request("POST", "/v1/streams", {"name": name, "window": window})
        return Stream.model_validate(response)
|
|
156
|
+
|
|
157
|
+
    def list_streams(self) -> StreamList:
        """List the caller's streams."""
        response = self._request("GET", "/v1/streams")
        return StreamList.model_validate(response)
|
|
160
|
+
|
|
161
|
+
    def get_stream(self, stream_id: str) -> Stream:
        """Fetch a single stream, including its counter fields."""
        response = self._request("GET", f"/v1/streams/{stream_id}")
        return Stream.model_validate(response)
|
|
164
|
+
|
|
165
|
+
# Note: POST /v1/streams/{id}/items has been removed. Use create_response() with metadata.stream_id instead.
|
|
166
|
+
|
|
167
|
+
def create_response(
|
|
168
|
+
self,
|
|
169
|
+
*,
|
|
170
|
+
model: str,
|
|
171
|
+
input: list[dict[str, str]], # [{"role": "user", "content": "..."}]
|
|
172
|
+
instructions: str | None = None,
|
|
173
|
+
max_output_tokens: int | None = None,
|
|
174
|
+
temperature: float | None = None,
|
|
175
|
+
top_p: float | None = None,
|
|
176
|
+
metadata: dict[str, Any] | None = None,
|
|
177
|
+
) -> Response:
|
|
178
|
+
"""Create a standalone response or stream-associated response.
|
|
179
|
+
|
|
180
|
+
For stream-associated, include stream_id in metadata:
|
|
181
|
+
metadata={"stream_id": "uuid", "completion_window": "24h"}
|
|
182
|
+
"""
|
|
183
|
+
from sference_sdk.models import Response, ResponseCreatePayload
|
|
184
|
+
|
|
185
|
+
payload: dict[str, Any] = {
|
|
186
|
+
"model": model,
|
|
187
|
+
"input": input,
|
|
188
|
+
}
|
|
189
|
+
if instructions is not None:
|
|
190
|
+
payload["instructions"] = instructions
|
|
191
|
+
if max_output_tokens is not None:
|
|
192
|
+
payload["max_output_tokens"] = max_output_tokens
|
|
193
|
+
if temperature is not None:
|
|
194
|
+
payload["temperature"] = temperature
|
|
195
|
+
if top_p is not None:
|
|
196
|
+
payload["top_p"] = top_p
|
|
197
|
+
if metadata is not None:
|
|
198
|
+
payload["metadata"] = metadata
|
|
199
|
+
|
|
200
|
+
response = self._request("POST", "/v1/responses", payload)
|
|
201
|
+
return Response.model_validate(response)
|
|
202
|
+
|
|
203
|
+
def get_response(self, response_id: str) -> Response:
|
|
204
|
+
"""Get a response by ID."""
|
|
205
|
+
from sference_sdk.models import Response
|
|
206
|
+
|
|
207
|
+
response = self._request("GET", f"/v1/responses/{response_id}")
|
|
208
|
+
return Response.model_validate(response)
|
|
209
|
+
|
|
210
|
+
def cancel_response(self, response_id: str) -> Response:
|
|
211
|
+
"""Cancel a pending or running response."""
|
|
212
|
+
from sference_sdk.models import Response
|
|
213
|
+
|
|
214
|
+
response = self._request("DELETE", f"/v1/responses/{response_id}")
|
|
215
|
+
return Response.model_validate(response)
|
|
216
|
+
|
|
217
|
+
def list_responses(
|
|
218
|
+
self,
|
|
219
|
+
*,
|
|
220
|
+
limit: int = 100,
|
|
221
|
+
stream_id: str | None = None,
|
|
222
|
+
) -> ResponseList:
|
|
223
|
+
"""List responses. Optionally filter by stream_id."""
|
|
224
|
+
from sference_sdk.models import ResponseList
|
|
225
|
+
|
|
226
|
+
params: dict[str, Any] = {"limit": limit}
|
|
227
|
+
if stream_id is not None:
|
|
228
|
+
params["stream_id"] = stream_id
|
|
229
|
+
|
|
230
|
+
response = self._request("GET", "/v1/responses", params=params)
|
|
231
|
+
return ResponseList.model_validate(response)
|
|
232
|
+
|
|
233
|
+
    def archive_stream(self, stream_id: str) -> Stream:
        """Move a stream to the "archived" status and return its updated state."""
        response = self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "archived"})
        return Stream.model_validate(response)
|
|
236
|
+
|
|
237
|
+
    def cancel_stream(self, stream_id: str) -> Stream:
        """Move a stream to the "cancelled" status and return its updated state."""
        response = self._request("PATCH", f"/v1/streams/{stream_id}", {"status": "cancelled"})
        return Stream.model_validate(response)
|
|
240
|
+
|
|
241
|
+
    def list_stream_events(
        self,
        stream_id: str,
        *,
        limit: int = 20,
        starting_after: str | None = None,
        ending_before: str | None = None,
        wait_ms: int = 0,
    ) -> StreamEventList:
        """Fetch one page of a stream's completion events.

        Args:
            stream_id: stream to read.
            limit: maximum events per page.
            starting_after: cursor -- return events after this completion id.
            ending_before: cursor -- return events before this completion id.
            wait_ms: passed through to the server; presumably a long-poll
                budget in milliseconds -- confirm against the API docs.

        Raises:
            ApiError: for any 4xx/5xx response.
        """
        params: dict[str, Any] = {"limit": limit, "wait_ms": wait_ms}
        if starting_after is not None:
            params["starting_after"] = starting_after
        if ending_before is not None:
            params["ending_before"] = ending_before
        # Issued via the raw httpx client (not _request) because this call
        # needs query parameters; error handling matches _request.
        response = self._client.request(
            "GET",
            f"/v1/streams/{stream_id}/events",
            headers=self._headers(),
            params=params,
        )
        if response.status_code >= 400:
            try:
                payload = response.json()
            except Exception:
                payload = {"detail": response.text}
            raise ApiError(f"{response.status_code}: {payload}")
        return StreamEventList.model_validate(response.json())
|
|
268
|
+
|
|
269
|
+
    def iter_stream_events(
        self,
        stream_id: str,
        *,
        consumer_name: str = "default",
        starting_after: str | None = None,
        checkpoint: bool = True,
        from_latest: bool = False,
    ) -> Iterator[StreamInferenceCompletionEvent]:
        """Iterate a stream's events, resuming from a locally stored checkpoint.

        Args:
            stream_id: stream to consume.
            consumer_name: namespace for the local checkpoint so multiple
                consumers can track independent positions.
            starting_after: explicit cursor; takes precedence over any
                saved checkpoint.
            checkpoint: persist the position after each yielded event.
            from_latest: drop any saved checkpoint before starting.
        """
        if from_latest:
            clear_checkpoint(self.base_url, stream_id, consumer_name)
        cur: str | None = starting_after
        if checkpoint and cur is None:
            cur = load_checkpoint(self.base_url, stream_id, consumer_name)

        if cur is None:
            # No cursor at all: emit a single page and stop. The reversed()
            # suggests the API returns this page newest-first -- confirm.
            page = self.list_stream_events(stream_id, limit=100, wait_ms=0)
            for ev in reversed(page.data):
                yield ev
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
            return

        while True:
            page = self.list_stream_events(stream_id, limit=100, starting_after=cur, wait_ms=0)
            if not page.data:
                return
            for ev in page.data:
                yield ev
                # Checkpoint after the consumer has processed the event, so a
                # crash mid-iteration resumes at the next unseen event.
                if checkpoint:
                    save_checkpoint(self.base_url, stream_id, consumer_name, ev.completion_id)
                cur = ev.completion_id
            if not page.has_more:
                return
|
|
303
|
+
|
|
304
|
+
    @staticmethod
    def parse_inference_requests_jsonl(path: Path, model: str | None = None) -> list[InferenceRequest]:
        """Parse a JSONL file into InferenceRequest objects.

        Two line formats are accepted:
          * OpenAI batch envelope: {"custom_id", "method", "url", "body"} --
            `body.model` is required and the `model` argument is ignored
            (one warning is emitted after parsing).
          * Content-only: {"content": "..."} -- requires the `model`
            argument; a chat body with a single user message is synthesized.

        Blank lines are skipped. Line numbers in errors are 1-based.

        Raises:
            ValueError: on an unsupported line format or a missing model.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        parsed: list[InferenceRequest] = []
        warn_model_ignored = False
        with path.open("r", encoding="utf-8") as f:
            for idx, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                # OpenAI batch-style envelope: {"custom_id", "method", "url", "body"}
                if isinstance(obj, dict) and {"method", "url", "body"}.issubset(obj.keys()):
                    if model:
                        warn_model_ignored = True
                    body = obj.get("body")
                    if not isinstance(body, dict) or not body.get("model"):
                        raise ValueError(f"Invalid JSONL line {idx}: body.model is required")
                    parsed.append(
                        InferenceRequest(
                            custom_id=obj.get("custom_id"),
                            body=body,
                        )
                    )
                    continue
                # Content-only format: {"content": "..."}
                if isinstance(obj, dict) and "content" in obj:
                    if not model:
                        raise ValueError("model is required for content-only JSONL format")
                    parsed.append(
                        InferenceRequest(
                            custom_id=obj.get("custom_id") or f"request-{idx}",
                            body={
                                "model": model,
                                "messages": [{"role": "user", "content": obj["content"]}],
                            },
                        )
                    )
                    continue
                raise ValueError(f"Unsupported JSONL line format at line {idx}")
        if warn_model_ignored:
            # One warning for the whole file, not one per envelope line.
            warnings.warn(
                "model argument is ignored for OpenAI-compatible JSONL lines",
                stacklevel=2,
            )
        return parsed
|
|
349
|
+
|
|
350
|
+
@staticmethod
|
|
351
|
+
def _parse_jsonl(path: Path, model: str | None = None) -> list[dict[str, Any]]:
|
|
352
|
+
"""Backwards-compatible JSONL parser (returns plain dicts).
|
|
353
|
+
|
|
354
|
+
Prefer `parse_inference_requests_jsonl`, which returns `InferenceRequest` objects.
|
|
355
|
+
"""
|
|
356
|
+
|
|
357
|
+
return [r.model_dump() for r in SferenceClient.parse_inference_requests_jsonl(path, model=model)]
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Batch lifecycle; client.wait_for_completion treats the last three as terminal.
BatchStatus = Literal["pending", "running", "completed", "failed", "cancelled"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class InferenceRequest(BaseModel):
    """Unified inference request schema for batches and streams.

    custom_id is user-provided and can be used for correlation.
    Unknown fields are rejected (extra="forbid").
    """

    model_config = ConfigDict(extra="forbid")

    custom_id: str | None = Field(default=None, description="User-provided identifier for correlation.")
    body: dict[str, Any] = Field(description="OpenAI-style request body. Must include `model`.")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LoginResponse(BaseModel):
    """Payload returned by POST /v1/auth/login; access_token becomes the bearer token."""

    access_token: str
    token_type: str
    user: dict[str, str]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Batch(BaseModel):
    """A submitted batch and its aggregate progress/usage counters."""

    id: str
    status: BatchStatus
    window: Literal["24h"]  # only the 24h window exists for batches
    request_count: int
    # Timestamps are plain strings (presumably ISO-8601 -- confirm).
    created_at: str
    updated_at: str
    completed_at: str | None = None
    # Aggregate token usage; defaults to 0 when the server omits the fields.
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    total_tokens: int = 0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class BatchList(BaseModel):
    """Envelope for GET /v1/batches."""

    items: list[Batch]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BatchResults(BaseModel):
    """Envelope for GET /v1/batches/{id}/results; optional fields may be absent until completion."""

    batch_id: str
    status: BatchStatus
    output_url: str | None = None
    completed_at: str | None = None
    results: list[dict[str, Any]] | None = None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class BatchCreatePayload(BaseModel):
    """Request body for POST /v1/batches; at least one request is required."""

    window: Literal["24h"] = "24h"
    requests: list[InferenceRequest] = Field(min_length=1)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Stream lifecycle and allowed completion windows (streams support 1h, batches do not).
StreamStatus = Literal["open", "cancelled", "archived"]
StreamWindow = Literal["1h", "24h"]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Stream(BaseModel):
    """Stream detail (GET /v1/streams/{id}); list responses omit counter fields (defaults apply)."""

    id: str
    name: str
    window: StreamWindow
    status: StreamStatus
    # Timestamps are plain strings (presumably ISO-8601 -- confirm).
    created_at: str
    updated_at: str
    cancelled_at: str | None = None
    archived_at: str | None = None
    # Per-state item counters; default to 0 when the server omits them.
    total_items: int = 0
    pending_items: int = 0
    running_items: int = 0
    completed_items: int = 0
    failed_items: int = 0
    cancelled_items: int = 0
    completion_ratio: float = 0.0
    # Aggregate token usage across the stream's completions.
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    total_tokens: int = 0
    last_completion_at: str | None = None
    latest_completion_id: str | None = None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class StreamInferenceCompletionEvent(BaseModel):
    """One completion event on a stream; completion_id doubles as the pagination cursor."""

    completion_id: str
    custom_id: str | None = None
    status: str
    result: dict[str, Any] | None = None  # presumably set on success -- confirm
    error: dict[str, Any] | None = None  # presumably set on failure -- confirm
    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    total_tokens: int | None = None
    completed_at: str | None = None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class StreamEventList(BaseModel):
    """One page of stream events; has_more signals that further pages exist."""

    object: str = "list"
    data: list[StreamInferenceCompletionEvent]
    has_more: bool
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class StreamList(BaseModel):
    """Envelope for GET /v1/streams."""

    items: list[Stream]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Response (OpenAI-compatible) models
# States a response can report; cancel_response moves it to "cancelled".
ResponseStatus = Literal["in_progress", "completed", "failed", "cancelled"]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class ResponseInputMessage(BaseModel):
    """Message in the input array for a response (chat-style role + text content)."""

    role: Literal["user", "assistant", "system", "developer"]
    content: str
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class ResponseCreatePayload(BaseModel):
    """Request body for POST /v1/responses. Unknown fields are rejected (extra="forbid")."""

    model_config = ConfigDict(extra="forbid")

    model: str = Field(description='Model identifier, e.g. "zai-org/GLM-5"')
    input: list[ResponseInputMessage] = Field(min_length=1)
    instructions: str | None = None
    max_output_tokens: int | None = None
    # Sampling bounds enforced here, not server-side only.
    temperature: float | None = Field(default=None, ge=0.0, le=2.0)
    top_p: float | None = Field(default=None, ge=0.0, le=1.0)
    # Free-form; stream association uses metadata["stream_id"].
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class ResponseOutputContent(BaseModel):
    """Content item in a response output; only "output_text" is modeled."""

    type: Literal["output_text"] = "output_text"
    text: str
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class ResponseUsage(BaseModel):
    """Token usage for a response (OpenAI responses-style field names)."""

    input_tokens: int
    output_tokens: int
    total_tokens: int
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class ResponseError(BaseModel):
    """Error information for a failed response."""

    code: str
    message: str
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class Response(BaseModel):
    """OpenAI-compatible response object."""

    id: str
    object: Literal["response"] = "response"
    created_at: int  # unlike Batch/Stream, this is an integer (presumably a unix timestamp -- confirm)
    model: str
    status: ResponseStatus
    output: list[ResponseOutputContent] | None = None  # absent until completed
    error: ResponseError | None = None
    usage: ResponseUsage | None = None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class ResponseListItem(BaseModel):
    """Item in a response list: the Response summary without output/error/usage."""

    id: str
    object: Literal["response"] = "response"
    created_at: int
    model: str
    status: ResponseStatus
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class ResponseList(BaseModel):
    """Paginated list of responses."""

    object: Literal["list"] = "list"
    data: list[ResponseListItem]
    has_more: bool = False
|