flexinference 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ node_modules/
2
+ dist/
3
+ build/
4
+ *.egg-info/
5
+ __pycache__/
6
+ .venv/
7
+ .mypy_cache/
8
+ .ruff_cache/
9
+ .pytest_cache/
10
+ .env*
11
+ *.log
12
+ .DS_Store
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aditya Perswal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,120 @@
1
+ Metadata-Version: 2.4
2
+ Name: flexinference
3
+ Version: 0.1.0
4
+ Summary: Official Python SDK for FlexInference - a deadline-aware, OpenAI-compatible inference router.
5
+ Project-URL: Homepage, https://flexinference.com
6
+ Project-URL: Documentation, https://flexinference.com/docs
7
+ Author-email: Aditya Perswal <adityaperswal@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: ai,flexinference,gpt,inference,llm,openai
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Typing :: Typed
15
+ Requires-Python: >=3.12
16
+ Requires-Dist: httpx>=0.27
17
+ Requires-Dist: typing-extensions>=4.12
18
+ Provides-Extra: dev
19
+ Requires-Dist: mypy>=1.13; extra == 'dev'
20
+ Requires-Dist: pytest>=8; extra == 'dev'
21
+ Requires-Dist: ruff>=0.7; extra == 'dev'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # FlexInference (Python)
25
+
26
+ The official Python SDK for [FlexInference](https://flexinference.com) - a deadline-aware, OpenAI-compatible inference router. Send the OpenAI requests you already send, bring your own OpenAI key, and add one field - `start_within` - to trade latency for cost.
27
+
28
+ ```bash
29
+ pip install flexinference
30
+ ```
31
+
32
+ ## Quickstart
33
+
34
+ ```python
35
+ from flexinference import FlexInference
36
+
37
+ client = FlexInference(api_key="flex_live_...")
38
+
39
+ res = client.responses.create({
40
+ "model": "gpt-5.5",
41
+ "input": "Write a haiku about cheap GPUs.",
42
+ "start_within": "00h-00m-30s",
43
+ })
44
+
45
+ print(res["output_text"])
46
+ ```
47
+
48
+ `start_within` takes `"priority"`, `"standard"`, or a duration `"HHh-MMm-SSs"` (5s-10m) that races OpenAI's flex tier and falls back to standard if it can't start in time. See the [docs](https://flexinference.com/docs/deadline-routing).
49
+
50
+ ## Streaming
51
+
52
+ ```python
53
+ stream = client.responses.create(
54
+ {"model": "gpt-5-nano", "input": "Count to ten.", "start_within": "00h-00m-20s"},
55
+ stream=True,
56
+ )
57
+ for event in stream:
58
+ if event.get("type") == "response.output_text.delta":
59
+ print(event["delta"], end="")
60
+ ```
61
+
62
+ ## Chat Completions
63
+
64
+ ```python
65
+ res = client.chat.completions.create({
66
+ "model": "gpt-5.5",
67
+ "messages": [{"role": "user", "content": "Hello!"}],
68
+ "start_within": "standard",
69
+ })
70
+ print(res["choices"][0]["message"]["content"])
71
+ ```
72
+
73
+ ## Closing the client
74
+
75
+ The client holds a pooled `httpx.Client`, so close it when you're done to release connections. Use it as a context manager:
76
+
77
+ ```python
78
+ with FlexInference(api_key="flex_live_...") as client:
79
+ res = client.responses.create({"model": "gpt-5.5", "input": "Hi."})
80
+ print(res["output_text"])
81
+ # connections are released on exit
82
+ ```
83
+
84
+ Or close it yourself:
85
+
86
+ ```python
87
+ client = FlexInference(api_key="flex_live_...")
88
+ try:
89
+ ...
90
+ finally:
91
+ client.close()
92
+ ```
93
+
94
+ ## Errors
95
+
96
+ Responses with an HTTP status of 400 or higher raise `FlexInferenceError`, carrying the OpenAI-shaped `status`, `type`, `code`, and `param`:
97
+
98
+ ```python
99
+ from flexinference import FlexInferenceError
100
+
101
+ try:
102
+ client.responses.create({"model": "gpt-5.5", "input": "hi", "start_within": "priority"})
103
+ except FlexInferenceError as err:
104
+ if err.code == "no_byok_key":
105
+ print("Add your OpenAI key in the dashboard.")
106
+ else:
107
+ raise
108
+ ```
109
+
110
+ ## Configuration
111
+
112
+ | Argument | Default | Description |
113
+ | --- | --- | --- |
114
+ | `api_key` | (required) | Your `flex_live_` key. |
115
+ | `base_url` | `https://api.flexinference.com/v1` | Override the router endpoint. |
116
+ | `client` | `httpx.Client` with a 600s read timeout | Provide your own `httpx.Client`. |
117
+
118
+ ## License
119
+
120
+ MIT
@@ -0,0 +1,97 @@
1
+ # FlexInference (Python)
2
+
3
+ The official Python SDK for [FlexInference](https://flexinference.com) - a deadline-aware, OpenAI-compatible inference router. Send the OpenAI requests you already send, bring your own OpenAI key, and add one field - `start_within` - to trade latency for cost.
4
+
5
+ ```bash
6
+ pip install flexinference
7
+ ```
8
+
9
+ ## Quickstart
10
+
11
+ ```python
12
+ from flexinference import FlexInference
13
+
14
+ client = FlexInference(api_key="flex_live_...")
15
+
16
+ res = client.responses.create({
17
+ "model": "gpt-5.5",
18
+ "input": "Write a haiku about cheap GPUs.",
19
+ "start_within": "00h-00m-30s",
20
+ })
21
+
22
+ print(res["output_text"])
23
+ ```
24
+
25
+ `start_within` takes `"priority"`, `"standard"`, or a duration `"HHh-MMm-SSs"` (5s-10m) that races OpenAI's flex tier and falls back to standard if it can't start in time. See the [docs](https://flexinference.com/docs/deadline-routing).
26
+
27
+ ## Streaming
28
+
29
+ ```python
30
+ stream = client.responses.create(
31
+ {"model": "gpt-5-nano", "input": "Count to ten.", "start_within": "00h-00m-20s"},
32
+ stream=True,
33
+ )
34
+ for event in stream:
35
+ if event.get("type") == "response.output_text.delta":
36
+ print(event["delta"], end="")
37
+ ```
38
+
39
+ ## Chat Completions
40
+
41
+ ```python
42
+ res = client.chat.completions.create({
43
+ "model": "gpt-5.5",
44
+ "messages": [{"role": "user", "content": "Hello!"}],
45
+ "start_within": "standard",
46
+ })
47
+ print(res["choices"][0]["message"]["content"])
48
+ ```
49
+
50
+ ## Closing the client
51
+
52
+ The client holds a pooled `httpx.Client`, so close it when you're done to release connections. Use it as a context manager:
53
+
54
+ ```python
55
+ with FlexInference(api_key="flex_live_...") as client:
56
+ res = client.responses.create({"model": "gpt-5.5", "input": "Hi."})
57
+ print(res["output_text"])
58
+ # connections are released on exit
59
+ ```
60
+
61
+ Or close it yourself:
62
+
63
+ ```python
64
+ client = FlexInference(api_key="flex_live_...")
65
+ try:
66
+ ...
67
+ finally:
68
+ client.close()
69
+ ```
70
+
71
+ ## Errors
72
+
73
+ Responses with an HTTP status of 400 or higher raise `FlexInferenceError`, carrying the OpenAI-shaped `status`, `type`, `code`, and `param`:
74
+
75
+ ```python
76
+ from flexinference import FlexInferenceError
77
+
78
+ try:
79
+ client.responses.create({"model": "gpt-5.5", "input": "hi", "start_within": "priority"})
80
+ except FlexInferenceError as err:
81
+ if err.code == "no_byok_key":
82
+ print("Add your OpenAI key in the dashboard.")
83
+ else:
84
+ raise
85
+ ```
86
+
87
+ ## Configuration
88
+
89
+ | Argument | Default | Description |
90
+ | --- | --- | --- |
91
+ | `api_key` | (required) | Your `flex_live_` key. |
92
+ | `base_url` | `https://api.flexinference.com/v1` | Override the router endpoint. |
93
+ | `client` | `httpx.Client` with a 600s read timeout | Provide your own `httpx.Client`. |
94
+
95
+ ## License
96
+
97
+ MIT
@@ -0,0 +1,79 @@
1
+ [project]
2
+ name = "flexinference"
3
+ version = "0.1.0"
4
+ description = "Official Python SDK for FlexInference - a deadline-aware, OpenAI-compatible inference router."
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [{ name = "Aditya Perswal", email = "adityaperswal@gmail.com" }]
8
+ requires-python = ">=3.12"
9
+ keywords = ["flexinference", "openai", "llm", "inference", "ai", "gpt"]
10
+ classifiers = [
11
+ "Programming Language :: Python :: 3",
12
+ "License :: OSI Approved :: MIT License",
13
+ "Operating System :: OS Independent",
14
+ "Typing :: Typed",
15
+ ]
16
+ dependencies = [
17
+ "httpx>=0.27",
18
+ "typing-extensions>=4.12",
19
+ ]
20
+
21
+ [project.urls]
22
+ Homepage = "https://flexinference.com"
23
+ Documentation = "https://flexinference.com/docs"
24
+
25
+ [project.optional-dependencies]
26
+ dev = [
27
+ "mypy>=1.13",
28
+ "ruff>=0.7",
29
+ "pytest>=8",
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["hatchling"]
34
+ build-backend = "hatchling.build"
35
+
36
+ [tool.hatch.build.targets.wheel]
37
+ packages = ["src/flexinference"]
38
+
39
+ [tool.mypy]
40
+ strict = true
41
+ python_version = "3.12"
42
+
43
+ # models.py is generated by datamodel-code-generator; its types are used to check
44
+ # our code, but its own TypedDict-extension quirks are not ours to fix.
45
+ [[tool.mypy.overrides]]
46
+ module = "flexinference.models"
47
+ ignore_errors = true
48
+
49
+ [tool.ruff]
50
+ line-length = 100
51
+ extend-exclude = ["src/flexinference/models.py"]
52
+
53
+ [tool.ruff.lint]
54
+ # A strong, library-appropriate rule set for a typed HTTP client. No ANN: mypy
55
+ # strict already enforces annotations, so ANN would be redundant noise. No app
56
+ # rules (FastAPI/SQLAlchemy/web) - this is just a typed client.
57
+ select = [
58
+ "E", # pycodestyle errors
59
+ "F", # pyflakes
60
+ "W", # pycodestyle warnings
61
+ "I", # isort
62
+ "UP", # pyupgrade
63
+ "B", # flake8-bugbear
64
+ "SIM", # flake8-simplify
65
+ "N", # pep8-naming
66
+ "C4", # flake8-comprehensions
67
+ "PT", # flake8-pytest-style
68
+ "RUF", # ruff-specific rules
69
+ "PIE", # flake8-pie
70
+ "TID", # flake8-tidy-imports
71
+ ]
72
+
73
+ [tool.ruff.lint.per-file-ignores]
74
+ # Tests assert freely and may carry long literal SSE/fixture lines.
75
+ "tests/**" = ["S101", "E501"]
76
+ # models.py is generated by datamodel-code-generator.
77
+ "src/flexinference/models.py" = ["ALL"]
78
+
79
+ [tool.ruff.format]
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from . import models
4
+ from ._client import DEFAULT_BASE_URL, FlexInference, FlexInferenceError
5
+
6
+ __all__ = ["DEFAULT_BASE_URL", "FlexInference", "FlexInferenceError", "models"]
@@ -0,0 +1,156 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections.abc import Iterator, Mapping
5
+ from typing import Any, Literal, cast, overload
6
+
7
+ import httpx
8
+
9
+ from .models import (
10
+ CreateChatCompletionRequest,
11
+ CreateChatCompletionResponse,
12
+ CreateChatCompletionStreamResponse,
13
+ CreateResponse,
14
+ Response,
15
+ ResponseStreamEvent,
16
+ )
17
+
18
+ DEFAULT_BASE_URL = "https://api.flexinference.com/v1"
19
+
20
+
21
class FlexInferenceError(Exception):
    """Raised for an error response from FlexInference or the upstream provider.

    The constructor looks for an ``{"error": {...}}`` mapping in the response body and
    exposes its ``type``, ``code`` and ``param`` entries as attributes. Each attribute
    is ``None`` when the entry (or the whole error object) is absent.
    """

    def __init__(self, status: int, body: Mapping[str, Any] | None, fallback: str) -> None:
        """Build the exception from an HTTP status and a decoded response body.

        Args:
            status: HTTP status code of the failed response.
            body: Decoded response body, or ``None`` when none is available.
            fallback: Message used when the body carries no non-empty ``message``.
        """
        detail: Mapping[str, Any] = {}
        if isinstance(body, Mapping):
            candidate = body.get("error")
            # Anything other than a mapping under "error" is treated as absent.
            if isinstance(candidate, Mapping):
                detail = candidate
        message = detail.get("message")
        super().__init__(message if message else fallback)
        self.status = status
        self.type: str | None = detail.get("type")
        self.code: str | None = detail.get("code")
        self.param: str | None = detail.get("param")
32
+
33
+
34
class FlexInference:
    """Synchronous client for the FlexInference HTTP API.

    Exposes ``responses`` and ``chat`` namespaces whose ``create`` methods POST JSON to
    paths under ``base_url``. The instance can be used as a context manager; leaving
    the ``with`` block calls :meth:`close`.
    """

    def __init__(
        self,
        *,
        api_key: str,
        base_url: str = DEFAULT_BASE_URL,
        client: httpx.Client | None = None,
    ) -> None:
        """Configure the client.

        Args:
            api_key: Key sent as a ``Bearer`` token on every request.
            base_url: Root URL that request paths are appended to.
            client: Optional ``httpx.Client`` to send requests with. When omitted, one
                is created with a 600s default timeout and a 10s connect timeout.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("FlexInference: `api_key` is required.")
        # Request paths start with "/", so strip trailing slashes to avoid "//".
        self._base_url = base_url.rstrip("/")
        if client:
            self._client = client
        else:
            # Inference calls routinely outlive httpx's 5s default, so allow a generous
            # overall budget while still failing fast on connect.
            self._client = httpx.Client(timeout=httpx.Timeout(600.0, connect=10.0))
        self._headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "Accept": "text/event-stream, application/json",
        }
        self.responses = _Responses(self)
        self.chat = _Chat(self)

    def close(self) -> None:
        """Close the underlying ``httpx.Client`` (including one passed in by the caller)."""
        self._client.close()

    def __enter__(self) -> FlexInference:
        """Return the client itself for use inside a ``with`` block."""
        return self

    def __exit__(self, *exc: object) -> None:
        """Close the client on exit; exceptions from the block are not suppressed."""
        self.close()

    def _post_json(self, path: str, payload: Mapping[str, Any]) -> Any:
        """POST ``payload`` as JSON to ``path`` and return the decoded JSON reply.

        Raises:
            FlexInferenceError: If the response status is 400 or higher.
        """
        url = f"{self._base_url}{path}"
        resp = self._client.post(url, headers=self._headers, content=json.dumps(payload))
        status = resp.status_code
        if status < 400:
            return resp.json()
        raise FlexInferenceError(status, _safe_json(resp), f"HTTP {status}")

    def _post_stream(self, path: str, payload: Mapping[str, Any]) -> Iterator[Any]:
        """POST ``payload`` to ``path`` and yield each decoded server-sent event.

        This is a generator function, so the request is only sent once the returned
        iterator is first advanced; errors surface at that point too.

        Raises:
            FlexInferenceError: If the response status is 400 or higher.
        """
        url = f"{self._base_url}{path}"
        encoded = json.dumps(payload)
        with self._client.stream("POST", url, headers=self._headers, content=encoded) as resp:
            status = resp.status_code
            if status >= 400:
                # A streamed body must be read before it can be decoded for the error.
                resp.read()
                raise FlexInferenceError(status, _safe_json(resp), f"HTTP {status}")
            yield from _parse_sse(resp)
81
+
82
+
83
def _safe_json(r: httpx.Response) -> Mapping[str, Any] | None:
    """Decode ``r`` as JSON, returning the result only when it is a mapping.

    Returns ``None`` when decoding fails or when the decoded value is something other
    than a mapping (for example a list or a bare string).
    """
    try:
        decoded = r.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if isinstance(decoded, Mapping):
        return decoded
    return None
89
+
90
+
91
def _parse_sse(r: httpx.Response) -> Iterator[Any]:
    """Yield the JSON-decoded ``data`` payload of each server-sent event in ``r``.

    Consecutive ``data:`` lines are joined with newlines and dispatched at the next
    blank line. Lines that do not start with ``data:`` (comments and other fields)
    are ignored. A payload of ``[DONE]`` ends the stream without being yielded.
    Events whose data is empty are skipped rather than handed to ``json.loads``,
    matching the SSE processing model, in which an empty data buffer is not
    dispatched. A final event that is not followed by a blank line is still yielded.

    Assumes ``r.iter_lines()`` yields lines without their terminators, so a blank
    line arrives as ``""``.

    Raises:
        json.JSONDecodeError: If a non-empty payload is not valid JSON.
    """
    data_lines: list[str] = []
    for line in r.iter_lines():
        if line.startswith("data:"):
            data_lines.append(line[5:].lstrip(" "))
            continue
        if line != "" or not data_lines:
            # Either a non-data field/comment, or a blank line with nothing buffered.
            continue
        data = "\n".join(data_lines)
        data_lines = []
        if data == "[DONE]":
            return
        if data:
            yield json.loads(data)
    # Flush an event left unterminated when the stream closed.
    data = "\n".join(data_lines)
    if data and data != "[DONE]":
        yield json.loads(data)
108
+
109
+
110
class _Responses:
    """The ``client.responses`` namespace; sends requests to ``/responses``."""

    def __init__(self, parent: FlexInference) -> None:
        """Keep a reference to the client that performs the HTTP calls."""
        self._parent = parent

    @overload
    def create(self, body: CreateResponse, *, stream: Literal[False] = False) -> Response: ...
    @overload
    def create(
        self, body: CreateResponse, *, stream: Literal[True]
    ) -> Iterator[ResponseStreamEvent]: ...
    def create(
        self, body: CreateResponse, *, stream: bool = False
    ) -> Response | Iterator[ResponseStreamEvent]:
        """Send ``body`` to ``/responses``.

        Args:
            body: Request fields; copied, never mutated.
            stream: When true, return an iterator of stream events instead of the
                decoded response. This always overrides any ``"stream"`` key in
                ``body``.
        """
        # Build a fresh dict so the caller's mapping is left untouched.
        payload = {**body, "stream": stream}
        if not stream:
            return cast(Response, self._parent._post_json("/responses", payload))
        return self._parent._post_stream("/responses", payload)
128
+
129
+
130
class _Chat:
    """The ``client.chat`` namespace; its only member is ``completions``."""

    def __init__(self, parent: FlexInference) -> None:
        """Create the ``completions`` sub-namespace bound to ``parent``."""
        completions = _ChatCompletions(parent)
        self.completions = completions
133
+
134
+
135
class _ChatCompletions:
    """The ``client.chat.completions`` namespace; sends to ``/chat/completions``."""

    def __init__(self, parent: FlexInference) -> None:
        """Keep a reference to the client that performs the HTTP calls."""
        self._parent = parent

    @overload
    def create(
        self, body: CreateChatCompletionRequest, *, stream: Literal[False] = False
    ) -> CreateChatCompletionResponse: ...
    @overload
    def create(
        self, body: CreateChatCompletionRequest, *, stream: Literal[True]
    ) -> Iterator[CreateChatCompletionStreamResponse]: ...
    def create(
        self, body: CreateChatCompletionRequest, *, stream: bool = False
    ) -> CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]:
        """Send ``body`` to ``/chat/completions``.

        Args:
            body: Request fields; copied, never mutated.
            stream: When true, return an iterator of stream chunks instead of the
                decoded completion. This always overrides any ``"stream"`` key in
                ``body``.
        """
        # Build a fresh dict so the caller's mapping is left untouched.
        payload = {**body, "stream": stream}
        if not stream:
            result = self._parent._post_json("/chat/completions", payload)
            return cast(CreateChatCompletionResponse, result)
        return self._parent._post_stream("/chat/completions", payload)