lmstd 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lmstd-0.1.0 → lmstd-0.2.0}/LICENSE +1 -1
- {lmstd-0.1.0 → lmstd-0.2.0}/PKG-INFO +5 -3
- {lmstd-0.1.0 → lmstd-0.2.0}/README.md +3 -1
- {lmstd-0.1.0 → lmstd-0.2.0}/lmstd.egg-info/PKG-INFO +5 -3
- lmstd-0.2.0/lmstd.py +612 -0
- {lmstd-0.1.0 → lmstd-0.2.0}/pyproject.toml +2 -2
- lmstd-0.1.0/lmstd.py +0 -316
- {lmstd-0.1.0 → lmstd-0.2.0}/lmstd.egg-info/SOURCES.txt +0 -0
- {lmstd-0.1.0 → lmstd-0.2.0}/lmstd.egg-info/dependency_links.txt +0 -0
- {lmstd-0.1.0 → lmstd-0.2.0}/lmstd.egg-info/requires.txt +0 -0
- {lmstd-0.1.0 → lmstd-0.2.0}/lmstd.egg-info/top_level.txt +0 -0
- {lmstd-0.1.0 → lmstd-0.2.0}/setup.cfg +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lmstd
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: LM Studio v1 REST API Client Library
|
|
5
5
|
Author: LM Studio User
|
|
6
|
-
License: MIT
|
|
6
|
+
License-Expression: MIT
|
|
7
7
|
Requires-Python: >=3.7
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
@@ -51,4 +51,6 @@ print(models)
|
|
|
51
51
|
|
|
52
52
|
## License
|
|
53
53
|
|
|
54
|
-
|
|
54
|
+
Copyright (c) 2026 EMuVi (emuvi@outlook.com.br)
|
|
55
|
+
|
|
56
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lmstd
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: LM Studio v1 REST API Client Library
|
|
5
5
|
Author: LM Studio User
|
|
6
|
-
License: MIT
|
|
6
|
+
License-Expression: MIT
|
|
7
7
|
Requires-Python: >=3.7
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
@@ -51,4 +51,6 @@ print(models)
|
|
|
51
51
|
|
|
52
52
|
## License
|
|
53
53
|
|
|
54
|
-
|
|
54
|
+
Copyright (c) 2026 EMuVi (emuvi@outlook.com.br)
|
|
55
|
+
|
|
56
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
lmstd-0.2.0/lmstd.py
ADDED
|
@@ -0,0 +1,612 @@
|
|
|
1
|
+
# Copyright (c) 2026 EMuVi (emuvi@outlook.com.br)
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
LM Studio v1 REST API Client Library
|
|
6
|
+
|
|
7
|
+
This single-file library provides a clean, fully documented Python interface
|
|
8
|
+
to interact with an LM Studio local server based on the v1 REST API endpoints.
|
|
9
|
+
|
|
10
|
+
Features supported natively via the v1 API:
|
|
11
|
+
- Stateful chats
|
|
12
|
+
- Model Context Protocol (MCP) integrations via API
|
|
13
|
+
- Authentication configuration with API tokens
|
|
14
|
+
- Advanced model lifecycle management (download, load, unload)
|
|
15
|
+
|
|
16
|
+
Dependencies:
|
|
17
|
+
requests
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
from typing import Any, Dict, Iterator, List, Optional, Union, Literal, TypedDict
|
|
23
|
+
|
|
24
|
+
class TextInput(TypedDict):
    """Chat input item whose ``type`` is "message"; the text goes in ``content``."""
    type: Literal["message"]
    content: str

class ImageInput(TypedDict):
    """Chat input item whose ``type`` is "image"; the image is supplied as ``data_url``."""
    type: Literal["image"]
    data_url: str

# One element of the list form of a chat input (text or image).
InputItem = Union[TextInput, ImageInput]

class PluginIntegrationBase(TypedDict):
    """Required keys of a "plugin" integration entry."""
    type: Literal["plugin"]
    id: str

class PluginIntegration(PluginIntegrationBase, total=False):
    """Plugin integration entry; ``allowed_tools`` may be omitted (total=False)."""
    allowed_tools: List[str]

class EphemeralMCPIntegrationBase(TypedDict):
    """Required keys of an "ephemeral_mcp" integration entry."""
    type: Literal["ephemeral_mcp"]
    server_label: str
    server_url: str

class EphemeralMCPIntegration(EphemeralMCPIntegrationBase, total=False):
    """Ephemeral MCP integration entry; ``allowed_tools`` may be omitted (total=False)."""
    allowed_tools: List[str]

# An integration entry: either a bare string or one of the two dict forms above.
# NOTE(review): the meaning of the bare-string form is not shown in this file — confirm against the API docs.
Integration = Union[str, PluginIntegration, EphemeralMCPIntegration]
|
|
50
|
+
|
|
51
|
+
class ProviderInfoPlugin(TypedDict):
    """Provider descriptor of type "plugin", identified by ``plugin_id``."""
    type: Literal["plugin"]
    plugin_id: str

class ProviderInfoEphemeralMCP(TypedDict):
    """Provider descriptor of type "ephemeral_mcp", identified by ``server_label``."""
    type: Literal["ephemeral_mcp"]
    server_label: str

# Value type of the ``provider_info`` key used by the tool-call structures in this module.
ProviderInfo = Union[ProviderInfoPlugin, ProviderInfoEphemeralMCP]

class MessageOutput(TypedDict):
    """Output item of type "message" with its text in ``content``."""
    type: Literal["message"]
    content: str

class ToolCallOutput(TypedDict, total=False):
    """Output item of type "tool_call". Declared total=False, so every key is optional to a type checker."""
    type: Literal["tool_call"]
    tool: str
    arguments: Dict[str, Any]
    output: str
    provider_info: ProviderInfo

class ReasoningOutput(TypedDict):
    """Output item of type "reasoning" with its text in ``content``."""
    type: Literal["reasoning"]
    content: str

class InvalidToolCallMetadata(TypedDict, total=False):
    """Details attached to an invalid tool call; ``type`` is "invalid_name" or "invalid_arguments"."""
    type: Literal["invalid_name", "invalid_arguments"]
    tool_name: str
    arguments: Dict[str, Any]
    provider_info: ProviderInfo

class InvalidToolCallOutput(TypedDict, total=False):
    """Output item of type "invalid_tool_call" carrying a ``reason`` and ``metadata``."""
    type: Literal["invalid_tool_call"]
    reason: str
    metadata: InvalidToolCallMetadata

# Element type of ``ChatResponse["output"]``.
OutputItem = Union[MessageOutput, ToolCallOutput, ReasoningOutput, InvalidToolCallOutput]
|
|
88
|
+
|
|
89
|
+
class ChatStats(TypedDict, total=False):
    """Token counts and timing figures reported under ``ChatResponse["stats"]``; all keys optional."""
    input_tokens: int
    total_output_tokens: int
    reasoning_output_tokens: int
    tokens_per_second: float
    time_to_first_token_seconds: float
    model_load_time_seconds: float

class ChatResponse(TypedDict, total=False):
    """Declared return type of ``LMStd.chat`` and payload of the "chat.end" stream event; all keys optional."""
    model_instance_id: str
    output: List[OutputItem]
    stats: ChatStats
    response_id: str
|
|
102
|
+
|
|
103
|
+
# Stream event shapes. Each class is tagged by the literal value of its ``type`` key.

class ChatStartEvent(TypedDict):
    """Event "chat.start"."""
    type: Literal["chat.start"]
    model_instance_id: str

class ModelLoadStartEvent(TypedDict):
    """Event "model_load.start"."""
    type: Literal["model_load.start"]
    model_instance_id: str

class ModelLoadProgressEvent(TypedDict):
    """Event "model_load.progress" with a ``progress`` value."""
    type: Literal["model_load.progress"]
    model_instance_id: str
    progress: float  # NOTE(review): presumably a 0..1 fraction — confirm against the API docs

class ModelLoadEndEvent(TypedDict):
    """Event "model_load.end" with the measured ``load_time_seconds``."""
    type: Literal["model_load.end"]
    model_instance_id: str
    load_time_seconds: float

class PromptProcessingStartEvent(TypedDict):
    """Event "prompt_processing.start"."""
    type: Literal["prompt_processing.start"]

class PromptProcessingProgressEvent(TypedDict):
    """Event "prompt_processing.progress" with a ``progress`` value."""
    type: Literal["prompt_processing.progress"]
    progress: float  # NOTE(review): presumably a 0..1 fraction — confirm against the API docs

class PromptProcessingEndEvent(TypedDict):
    """Event "prompt_processing.end"."""
    type: Literal["prompt_processing.end"]

class ReasoningStartEvent(TypedDict):
    """Event "reasoning.start"."""
    type: Literal["reasoning.start"]

class ReasoningDeltaEvent(TypedDict):
    """Event "reasoning.delta" carrying a piece of text in ``content``."""
    type: Literal["reasoning.delta"]
    content: str

class ReasoningEndEvent(TypedDict):
    """Event "reasoning.end"."""
    type: Literal["reasoning.end"]

class ToolCallStartEvent(TypedDict):
    """Event "tool_call.start" naming the ``tool`` and its provider."""
    type: Literal["tool_call.start"]
    tool: str
    provider_info: ProviderInfo

class ToolCallArgumentsEvent(TypedDict):
    """Event "tool_call.arguments" carrying the tool ``arguments``."""
    type: Literal["tool_call.arguments"]
    tool: str
    arguments: Dict[str, Any]
    provider_info: ProviderInfo

class ToolCallSuccessEvent(TypedDict):
    """Event "tool_call.success" carrying the tool ``arguments`` and its ``output``."""
    type: Literal["tool_call.success"]
    tool: str
    arguments: Dict[str, Any]
    output: str
    provider_info: ProviderInfo

class ToolCallFailureEvent(TypedDict, total=False):
    """Event "tool_call.failure" with a ``reason`` and ``metadata``; all keys optional (total=False)."""
    type: Literal["tool_call.failure"]
    reason: str
    metadata: InvalidToolCallMetadata

class MessageStartEvent(TypedDict):
    """Event "message.start"."""
    type: Literal["message.start"]

class MessageDeltaEvent(TypedDict):
    """Event "message.delta" carrying a piece of text in ``content``."""
    type: Literal["message.delta"]
    content: str

class MessageEndEvent(TypedDict):
    """Event "message.end"."""
    type: Literal["message.end"]

class ErrorInfo(TypedDict, total=False):
    """Error details: a ``type`` from a fixed set of literals plus optional message, code and param."""
    type: Literal["invalid_request", "unknown", "mcp_connection_error", "plugin_connection_error", "not_implemented", "model_not_found", "job_not_found", "internal_error"]
    message: str
    code: str
    param: str

class ErrorEvent(TypedDict):
    """Event "error" wrapping an ``ErrorInfo``."""
    type: Literal["error"]
    error: ErrorInfo

class ChatEndEvent(TypedDict):
    """Event "chat.end" carrying the complete ``ChatResponse`` in ``result``."""
    type: Literal["chat.end"]
    result: ChatResponse

# Declared element type of the iterator returned by ``LMStd.chat_stream``.
ChatStreamEvent = Union[
    ChatStartEvent,
    ModelLoadStartEvent,
    ModelLoadProgressEvent,
    ModelLoadEndEvent,
    PromptProcessingStartEvent,
    PromptProcessingProgressEvent,
    PromptProcessingEndEvent,
    ReasoningStartEvent,
    ReasoningDeltaEvent,
    ReasoningEndEvent,
    ToolCallStartEvent,
    ToolCallArgumentsEvent,
    ToolCallSuccessEvent,
    ToolCallFailureEvent,
    MessageStartEvent,
    MessageDeltaEvent,
    MessageEndEvent,
    ErrorEvent,
    ChatEndEvent
]
|
|
209
|
+
|
|
210
|
+
class ModelQuantization(TypedDict, total=False):
    """Quantization descriptor of a model; both values may be None."""
    name: Optional[str]
    bits_per_weight: Optional[int]

class ModelConfig(TypedDict, total=False):
    """Configuration reported for a loaded model instance; all keys optional."""
    context_length: int
    eval_batch_size: int
    parallel: int
    flash_attention: bool
    num_experts: int
    offload_kv_cache_to_gpu: bool

class LoadedInstance(TypedDict):
    """A loaded instance of a model: its ``id`` and its ``config``."""
    id: str
    config: ModelConfig

class ModelCapabilities(TypedDict, total=False):
    """Capability flags of a model."""
    vision: bool
    trained_for_tool_use: bool

class ModelReasoning(TypedDict):
    """Reasoning settings a model accepts (``allowed_options``) and its ``default``."""
    allowed_options: List[Literal["off", "on", "low", "medium", "high"]]
    default: Literal["off", "on", "low", "medium", "high"]

class ModelInfo(TypedDict, total=False):
    """One entry of ``ListModelsResponse["models"]``; all keys optional (total=False)."""
    type: Literal["llm", "embedding"]
    publisher: str
    key: str
    display_name: str
    architecture: Optional[str]
    quantization: Optional[ModelQuantization]
    size_bytes: int
    params_string: Optional[str]
    loaded_instances: List[LoadedInstance]
    max_context_length: int
    format: Optional[Literal["gguf", "mlx"]]
    capabilities: ModelCapabilities
    reasoning: ModelReasoning
    description: Optional[str]
    variants: List[str]
    selected_variant: str

class ListModelsResponse(TypedDict):
    """Declared return type of ``LMStd.list_models``."""
    models: List[ModelInfo]
|
|
254
|
+
|
|
255
|
+
class LoadConfig(TypedDict, total=False):
    """Load configuration found under ``LoadModelResponse["load_config"]``; all keys optional."""
    context_length: int
    eval_batch_size: int
    flash_attention: bool
    num_experts: int
    offload_kv_cache_to_gpu: bool

class LoadModelResponse(TypedDict, total=False):
    """Declared return type of ``LMStd.load_model``; all keys optional (total=False)."""
    type: Literal["llm", "embedding"]
    instance_id: str
    load_time_seconds: float
    status: Literal["loaded"]
    load_config: LoadConfig

class UnloadModelResponse(TypedDict):
    """Declared return type of ``LMStd.unload_model``."""
    instance_id: str

class DownloadStatusResponse(TypedDict, total=False):
    """Declared return type of ``LMStd.download_model`` and ``LMStd.get_download_status``.

    All keys are optional (total=False).
    """
    job_id: str
    status: Literal["downloading", "paused", "completed", "failed", "already_downloaded"]
    bytes_per_second: int
    estimated_completion: str  # NOTE(review): presumably a timestamp string — confirm format
    completed_at: str  # NOTE(review): presumably a timestamp string — confirm format
    total_size_bytes: int
    downloaded_bytes: int
    started_at: str  # NOTE(review): presumably a timestamp string — confirm format
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class LMStdError(Exception):
    """Exception raised for errors returned by the LM Studio API.

    Attributes:
        status_code: HTTP status code the exception was created with.
        response_text: Response body text the exception was created with.
    """

    def __init__(self, status_code: int, response_text: str):
        # Build the human-readable message first, then keep the raw parts
        # available as attributes for programmatic handling.
        message = "API Error {}: {}".format(status_code, response_text)
        super().__init__(message)
        self.response_text = response_text
        self.status_code = status_code
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class LMStd:
|
|
293
|
+
"""
|
|
294
|
+
A client library for interacting with LM Studio's native v1 REST API.
|
|
295
|
+
"""
|
|
296
|
+
|
|
297
|
+
def __init__(self, base_url: str = "http://localhost:1234", api_token: Optional[str] = None):
|
|
298
|
+
"""
|
|
299
|
+
Initializes the LM Studio API Client.
|
|
300
|
+
|
|
301
|
+
Args:
|
|
302
|
+
base_url (str): The base URL where your LM Studio local server is running.
|
|
303
|
+
By default, the server is available at http://localhost:1234.
|
|
304
|
+
api_token (str, optional): The LM_API_TOKEN authorization bearer token if required.
|
|
305
|
+
Passed as an Authorization header[cite: 57, 58].
|
|
306
|
+
"""
|
|
307
|
+
import requests
|
|
308
|
+
self.base_url = base_url.rstrip('/')
|
|
309
|
+
self.session = requests.Session()
|
|
310
|
+
|
|
311
|
+
self.session.headers.update({
|
|
312
|
+
"Content-Type": "application/json"
|
|
313
|
+
})
|
|
314
|
+
if api_token:
|
|
315
|
+
self.session.headers.update({
|
|
316
|
+
"Authorization": f"Bearer {api_token}"
|
|
317
|
+
})
|
|
318
|
+
|
|
319
|
+
def _request(self, method: str, endpoint: str, json_data: Optional[Dict[str, Any]] = None) -> Any:
|
|
320
|
+
"""Internal helper to process HTTP requests cleanly."""
|
|
321
|
+
url = f"{self.base_url}{endpoint}"
|
|
322
|
+
try:
|
|
323
|
+
response = self.session.request(method=method, url=url, json=json_data)
|
|
324
|
+
if response.status_code not in (200, 201):
|
|
325
|
+
raise LMStdError(response.status_code, response.text)
|
|
326
|
+
return response.json()
|
|
327
|
+
except Exception as e:
|
|
328
|
+
if isinstance(e, LMStdError):
|
|
329
|
+
raise e
|
|
330
|
+
raise RuntimeError(f"Failed to connect or process request to {url}: {e}")
|
|
331
|
+
|
|
332
|
+
def chat(
|
|
333
|
+
self,
|
|
334
|
+
model: Optional[str] = None,
|
|
335
|
+
input_data: Optional[Union[str, List[InputItem]]] = None,
|
|
336
|
+
system_prompt: Optional[str] = None,
|
|
337
|
+
integrations: Optional[List[Integration]] = None,
|
|
338
|
+
headers: Optional[Dict[str, str]] = None,
|
|
339
|
+
temperature: Optional[float] = None,
|
|
340
|
+
top_p: Optional[float] = None,
|
|
341
|
+
top_k: Optional[int] = None,
|
|
342
|
+
min_p: Optional[float] = None,
|
|
343
|
+
repeat_penalty: Optional[float] = None,
|
|
344
|
+
max_output_tokens: Optional[int] = None,
|
|
345
|
+
reasoning: Optional[Literal["off", "low", "medium", "high", "on"]] = None,
|
|
346
|
+
context_length: Optional[int] = None,
|
|
347
|
+
store: Optional[bool] = True,
|
|
348
|
+
previous_response_id: Optional[str] = None
|
|
349
|
+
) -> ChatResponse:
|
|
350
|
+
"""
|
|
351
|
+
POST /api/v1/chat
|
|
352
|
+
Send a message to a model and receive a full response.
|
|
353
|
+
The /api/v1/chat endpoint is stateful by default, storing and managing context automatically.
|
|
354
|
+
|
|
355
|
+
Args:
|
|
356
|
+
model (str): Unique identifier for the model to use.
|
|
357
|
+
input_data (str or list): Text message string or an array of input items (messages/images).
|
|
358
|
+
Images can be passed using 'type': 'image' and 'data_url'[cite: 642, 656, 660].
|
|
359
|
+
system_prompt (str, optional): System message that sets model behavior or instructions.
|
|
360
|
+
integrations (list, optional): List of integrations (plugins, ephemeral MCP servers) to enable for this request.
|
|
361
|
+
headers (dict, optional): Custom HTTP headers to send with requests to the server.
|
|
362
|
+
temperature (float, optional): Randomness in token selection (0 is deterministic, [0,1]).
|
|
363
|
+
top_p (float, optional): Minimum cumulative probability for the possible next tokens [0,1].
|
|
364
|
+
top_k (int, optional): Limits next token selection to top-k most probable tokens.
|
|
365
|
+
min_p (float, optional): Minimum base probability for a token to be selected for output [0,1].
|
|
366
|
+
repeat_penalty (float, optional): Penalty for repeating token sequences. 1 is no penalty.
|
|
367
|
+
max_output_tokens (int, optional): Maximum number of tokens to generate.
|
|
368
|
+
reasoning (str, optional): Reasoning setting ('off', 'low', 'medium', 'high', 'on').
|
|
369
|
+
context_length (int, optional): Number of tokens to consider as context. Higher values recommended for MCP usage.
|
|
370
|
+
store (bool, optional): Whether to store the chat. If set to true, response will return a 'response_id' field.
|
|
371
|
+
previous_response_id (str, optional): Identifier of existing response to append to. Must start with "resp_".
|
|
372
|
+
|
|
373
|
+
Returns:
|
|
374
|
+
ChatResponse: Response fields containing 'model_instance_id', an 'output' array (messages, tool_calls, reasoning),
|
|
375
|
+
'stats' (token usage/metrics), and an optional 'response_id'[cite: 753, 756, 804, 837].
|
|
376
|
+
"""
|
|
377
|
+
model = model or os.environ.get("LMSTD_MODEL")
|
|
378
|
+
if not model:
|
|
379
|
+
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
380
|
+
if input_data is None:
|
|
381
|
+
raise ValueError("input_data must be provided.")
|
|
382
|
+
|
|
383
|
+
payload = {
|
|
384
|
+
"model": model,
|
|
385
|
+
"input": input_data,
|
|
386
|
+
"stream": False,
|
|
387
|
+
"store": store
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
if system_prompt is not None: payload["system_prompt"] = system_prompt
|
|
391
|
+
if integrations is not None: payload["integrations"] = integrations
|
|
392
|
+
if headers is not None: payload["headers"] = headers
|
|
393
|
+
if temperature is not None: payload["temperature"] = temperature
|
|
394
|
+
if top_p is not None: payload["top_p"] = top_p
|
|
395
|
+
if top_k is not None: payload["top_k"] = top_k
|
|
396
|
+
if min_p is not None: payload["min_p"] = min_p
|
|
397
|
+
if repeat_penalty is not None: payload["repeat_penalty"] = repeat_penalty
|
|
398
|
+
if max_output_tokens is not None: payload["max_output_tokens"] = max_output_tokens
|
|
399
|
+
if reasoning is not None: payload["reasoning"] = reasoning
|
|
400
|
+
if context_length is not None: payload["context_length"] = context_length
|
|
401
|
+
if previous_response_id is not None: payload["previous_response_id"] = previous_response_id
|
|
402
|
+
|
|
403
|
+
return self._request("POST", "/api/v1/chat", json_data=payload)
|
|
404
|
+
|
|
405
|
+
def chat_stream(
|
|
406
|
+
self,
|
|
407
|
+
model: Optional[str] = None,
|
|
408
|
+
input_data: Optional[Union[str, List[InputItem]]] = None,
|
|
409
|
+
system_prompt: Optional[str] = None,
|
|
410
|
+
integrations: Optional[List[Integration]] = None,
|
|
411
|
+
headers: Optional[Dict[str, str]] = None,
|
|
412
|
+
temperature: Optional[float] = None,
|
|
413
|
+
top_p: Optional[float] = None,
|
|
414
|
+
top_k: Optional[int] = None,
|
|
415
|
+
min_p: Optional[float] = None,
|
|
416
|
+
repeat_penalty: Optional[float] = None,
|
|
417
|
+
max_output_tokens: Optional[int] = None,
|
|
418
|
+
reasoning: Optional[Literal["off", "low", "medium", "high", "on"]] = None,
|
|
419
|
+
context_length: Optional[int] = None,
|
|
420
|
+
store: Optional[bool] = True,
|
|
421
|
+
previous_response_id: Optional[str] = None
|
|
422
|
+
) -> Iterator[ChatStreamEvent]:
|
|
423
|
+
"""
|
|
424
|
+
POST /api/v1/chat (Streaming)
|
|
425
|
+
Send a message to a model with `stream` set to true. The response is sent as a stream of events using Server-Sent Events (SSE).
|
|
426
|
+
|
|
427
|
+
Args:
|
|
428
|
+
model (str, optional): Unique identifier for the model to use. Can be an LLM or embedding model.
|
|
429
|
+
input_data (str | List[InputItem], optional): Message to send to the model. Text message string or an array of InputItem objects.
|
|
430
|
+
system_prompt (str, optional): System message that sets model behavior or instructions.
|
|
431
|
+
integrations (List[Integration], optional): List of integrations (plugins, ephemeral MCP servers, etc...) to enable for this request.
|
|
432
|
+
headers (dict, optional): Custom HTTP headers to send with requests to the server.
|
|
433
|
+
temperature (float, optional): Randomness in token selection. 0 is deterministic, higher values increase creativity [0,1].
|
|
434
|
+
top_p (float, optional): Minimum cumulative probability for the possible next tokens [0,1].
|
|
435
|
+
top_k (int, optional): Limits next token selection to top-k most probable tokens.
|
|
436
|
+
min_p (float, optional): Minimum base probability for a token to be selected for output [0,1].
|
|
437
|
+
repeat_penalty (float, optional): Penalty for repeating token sequences. 1 is no penalty, higher values discourage repetition.
|
|
438
|
+
max_output_tokens (int, optional): Maximum number of tokens to generate.
|
|
439
|
+
reasoning (Literal["off", "low", "medium", "high", "on"], optional): Reasoning setting. Will error if the model being used does not support the reasoning setting using. Defaults to the automatically chosen setting for the model.
|
|
440
|
+
context_length (int, optional): Number of tokens to consider as context. Higher values recommended for MCP usage.
|
|
441
|
+
store (bool, optional): Whether to store the chat. If set, response will return a 'response_id' field. Default true.
|
|
442
|
+
previous_response_id (str, optional): Identifier of existing response to append to. Must start with "resp_".
|
|
443
|
+
|
|
444
|
+
Yields:
|
|
445
|
+
ChatStreamEvent: Parsed JSON objects corresponding to streaming events. Events arrive in order and may include multiple deltas.
|
|
446
|
+
Events: 'chat.start', 'model_load.start', 'model_load.progress', 'model_load.end',
|
|
447
|
+
'prompt_processing.start', 'prompt_processing.progress', 'prompt_processing.end',
|
|
448
|
+
'reasoning.start', 'reasoning.delta', 'reasoning.end',
|
|
449
|
+
'tool_call.start', 'tool_call.arguments', 'tool_call.success', 'tool_call.failure',
|
|
450
|
+
'message.start', 'message.delta', 'message.end', 'error', 'chat.end'.
|
|
451
|
+
"""
|
|
452
|
+
model = model or os.environ.get("LMSTD_MODEL")
|
|
453
|
+
if not model:
|
|
454
|
+
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
455
|
+
if input_data is None:
|
|
456
|
+
raise ValueError("input_data must be provided.")
|
|
457
|
+
|
|
458
|
+
url = f"{self.base_url}/api/v1/chat"
|
|
459
|
+
payload = {
|
|
460
|
+
"model": model,
|
|
461
|
+
"input": input_data,
|
|
462
|
+
"stream": True,
|
|
463
|
+
"store": store
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
if system_prompt is not None: payload["system_prompt"] = system_prompt
|
|
467
|
+
if integrations is not None: payload["integrations"] = integrations
|
|
468
|
+
if headers is not None: payload["headers"] = headers
|
|
469
|
+
if temperature is not None: payload["temperature"] = temperature
|
|
470
|
+
if top_p is not None: payload["top_p"] = top_p
|
|
471
|
+
if top_k is not None: payload["top_k"] = top_k
|
|
472
|
+
if min_p is not None: payload["min_p"] = min_p
|
|
473
|
+
if repeat_penalty is not None: payload["repeat_penalty"] = repeat_penalty
|
|
474
|
+
if max_output_tokens is not None: payload["max_output_tokens"] = max_output_tokens
|
|
475
|
+
if reasoning is not None: payload["reasoning"] = reasoning
|
|
476
|
+
if context_length is not None: payload["context_length"] = context_length
|
|
477
|
+
if previous_response_id is not None: payload["previous_response_id"] = previous_response_id
|
|
478
|
+
|
|
479
|
+
try:
|
|
480
|
+
response = self.session.post(url, json=payload, stream=True)
|
|
481
|
+
if response.status_code not in (200, 201):
|
|
482
|
+
raise LMStdError(response.status_code, response.text)
|
|
483
|
+
|
|
484
|
+
for line in response.iter_lines():
|
|
485
|
+
if line:
|
|
486
|
+
decoded_line = line.decode('utf-8')
|
|
487
|
+
if decoded_line.startswith('data: '):
|
|
488
|
+
data_str = decoded_line[6:].strip()
|
|
489
|
+
if data_str:
|
|
490
|
+
yield json.loads(data_str)
|
|
491
|
+
|
|
492
|
+
except Exception as e:
|
|
493
|
+
if isinstance(e, LMStdError):
|
|
494
|
+
raise e
|
|
495
|
+
raise RuntimeError(f"Failed to connect or stream request to {url}: {e}")
|
|
496
|
+
|
|
497
|
+
def list_models(self) -> ListModelsResponse:
|
|
498
|
+
"""
|
|
499
|
+
GET /api/v1/models
|
|
500
|
+
Get a list of available models on your system, including both LLMs and embedding models.
|
|
501
|
+
|
|
502
|
+
Returns:
|
|
503
|
+
ListModelsResponse: JSON object containing a list of available models, their configs (context_length,
|
|
504
|
+
architecture, format), and currently loaded instances[cite: 1102, 1118, 1130, 1134].
|
|
505
|
+
"""
|
|
506
|
+
return self._request("GET", "/api/v1/models")
|
|
507
|
+
|
|
508
|
+
def load_model(
|
|
509
|
+
self,
|
|
510
|
+
model: Optional[str] = None,
|
|
511
|
+
context_length: Optional[int] = None,
|
|
512
|
+
eval_batch_size: Optional[int] = None,
|
|
513
|
+
flash_attention: Optional[bool] = None,
|
|
514
|
+
num_experts: Optional[int] = None,
|
|
515
|
+
offload_kv_cache_to_gpu: Optional[bool] = None,
|
|
516
|
+
echo_load_config: Optional[bool] = False
|
|
517
|
+
) -> LoadModelResponse:
|
|
518
|
+
"""
|
|
519
|
+
POST /api/v1/models/load
|
|
520
|
+
Load an LLM or Embedding model into memory with custom configuration for inference.
|
|
521
|
+
|
|
522
|
+
Args:
|
|
523
|
+
model (str): Unique identifier for the model to load.
|
|
524
|
+
context_length (int, optional): Maximum number of tokens that the model will consider.
|
|
525
|
+
eval_batch_size (int, optional): Number of input tokens to process together in a single batch during evaluation.
|
|
526
|
+
flash_attention (bool, optional): Whether to optimize attention computation. Can decrease memory usage and improve speed.
|
|
527
|
+
num_experts (int, optional): Number of experts to use during inference for MoE (Mixture of Experts) models.
|
|
528
|
+
offload_kv_cache_to_gpu (bool, optional): Whether KV cache is offloaded to GPU memory.
|
|
529
|
+
echo_load_config (bool, optional): If true, echoes the final load configuration in the response.
|
|
530
|
+
|
|
531
|
+
Returns:
|
|
532
|
+
LoadModelResponse: Response featuring 'type', 'instance_id', 'load_time_seconds', 'status', and optionally 'load_config'[cite: 1251, 1261].
|
|
533
|
+
"""
|
|
534
|
+
model = model or os.environ.get("LMSTD_MODEL")
|
|
535
|
+
if not model:
|
|
536
|
+
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
537
|
+
|
|
538
|
+
payload = {
|
|
539
|
+
"model": model,
|
|
540
|
+
"echo_load_config": echo_load_config
|
|
541
|
+
}
|
|
542
|
+
if context_length is not None: payload["context_length"] = context_length
|
|
543
|
+
if eval_batch_size is not None: payload["eval_batch_size"] = eval_batch_size
|
|
544
|
+
if flash_attention is not None: payload["flash_attention"] = flash_attention
|
|
545
|
+
if num_experts is not None: payload["num_experts"] = num_experts
|
|
546
|
+
if offload_kv_cache_to_gpu is not None: payload["offload_kv_cache_to_gpu"] = offload_kv_cache_to_gpu
|
|
547
|
+
|
|
548
|
+
return self._request("POST", "/api/v1/models/load", json_data=payload)
|
|
549
|
+
|
|
550
|
+
def unload_model(self, instance_id: str) -> UnloadModelResponse:
|
|
551
|
+
"""
|
|
552
|
+
POST /api/v1/models/unload
|
|
553
|
+
Unload a loaded model from memory.
|
|
554
|
+
|
|
555
|
+
Args:
|
|
556
|
+
instance_id (str): Unique identifier of the model instance to unload.
|
|
557
|
+
|
|
558
|
+
Returns:
|
|
559
|
+
UnloadModelResponse: Response containing the 'instance_id' of the unloaded model instance.
|
|
560
|
+
"""
|
|
561
|
+
payload = {"instance_id": instance_id}
|
|
562
|
+
return self._request("POST", "/api/v1/models/unload", json_data=payload)
|
|
563
|
+
|
|
564
|
+
def download_model(self, model: Optional[str] = None, quantization: Optional[str] = None) -> DownloadStatusResponse:
|
|
565
|
+
"""
|
|
566
|
+
POST /api/v1/models/download
|
|
567
|
+
Download LLMs and embedding models.
|
|
568
|
+
|
|
569
|
+
Args:
|
|
570
|
+
model (str, optional): The model to download. Accepts model catalog identifiers (e.g., openai/gpt-oss-20b) and exact Hugging Face links (e.g., https://huggingface.co/lmstudio-community/gpt-oss-20b-GGUF).
|
|
571
|
+
quantization (str, optional): Quantization level of the model to download (e.g., Q4_K_M). Only supported for Hugging Face links.
|
|
572
|
+
|
|
573
|
+
Returns:
|
|
574
|
+
DownloadStatusResponse: Returns a download job status object. The response varies based on the download status ('downloading', 'paused', 'completed', 'failed', 'already_downloaded').
|
|
575
|
+
"""
|
|
576
|
+
model = model or os.environ.get("LMSTD_MODEL")
|
|
577
|
+
if not model:
|
|
578
|
+
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
579
|
+
|
|
580
|
+
payload = {"model": model}
|
|
581
|
+
if quantization is not None:
|
|
582
|
+
payload["quantization"] = quantization
|
|
583
|
+
return self._request("POST", "/api/v1/models/download", json_data=payload)
|
|
584
|
+
|
|
585
|
+
def get_download_status(self, job_id: str) -> DownloadStatusResponse:
|
|
586
|
+
"""
|
|
587
|
+
GET /api/v1/models/download/status/:job_id
|
|
588
|
+
Get the status of model downloads.
|
|
589
|
+
|
|
590
|
+
Args:
|
|
591
|
+
job_id (str): The unique identifier of the download job.
|
|
592
|
+
|
|
593
|
+
Returns:
|
|
594
|
+
DownloadStatusResponse: Download job status object including 'status', 'bytes_per_second', 'total_size_bytes',
|
|
595
|
+
'downloaded_bytes', 'estimated_completion', etc[cite: 1391, 1394, 1396, 1400, 1402].
|
|
596
|
+
"""
|
|
597
|
+
return self._request("GET", f"/api/v1/models/download/status/{job_id}")
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
# --- Basic Usage Verification Example ---
|
|
601
|
+
if __name__ == "__main__":
    # Manual smoke check: connect to the default local server, using the
    # token from LMSTD_APIKEY when that variable is set, and dump the model list.
    api_client = LMStd(api_token=os.environ.get("LMSTD_APIKEY"))

    print("1. Listing system models...")
    try:
        print(json.dumps(api_client.list_models(), indent=2))

    except Exception as error:
        print(f"Server communication failed: {error}")

    # Block until a line is read from stdin before the script exits.
    input()
|
|
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "lmstd"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "LM Studio v1 REST API Client Library"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.7"
|
|
11
|
-
license = { text = "MIT" }
|
|
11
|
+
license = "MIT"
|
|
12
12
|
authors = [
|
|
13
13
|
{ name = "LM Studio User" }
|
|
14
14
|
]
|
lmstd-0.1.0/lmstd.py
DELETED
|
@@ -1,316 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
LM Studio v1 REST API Client Library
|
|
3
|
-
|
|
4
|
-
This single-file library provides a clean, fully documented Python interface
|
|
5
|
-
to interact with an LM Studio local server based on the v1 REST API endpoints.
|
|
6
|
-
|
|
7
|
-
Features supported natively via the v1 API:
|
|
8
|
-
- Stateful chats
|
|
9
|
-
- Model Context Protocol (MCP) integrations via API
|
|
10
|
-
- Authentication configuration with API tokens
|
|
11
|
-
- Advanced model lifecycle management (download, load, unload)
|
|
12
|
-
|
|
13
|
-
Dependencies:
|
|
14
|
-
requests
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import json
|
|
18
|
-
import os
|
|
19
|
-
from typing import Any, Dict, Iterator, List, Optional, Union
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class LMStdError(Exception):
|
|
23
|
-
"""Exception raised for errors returned by the LM Studio API."""
|
|
24
|
-
def __init__(self, status_code: int, response_text: str):
|
|
25
|
-
self.status_code = status_code
|
|
26
|
-
self.response_text = response_text
|
|
27
|
-
super().__init__(f"API Error {status_code}: {response_text}")
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class LMStd:
|
|
31
|
-
"""
|
|
32
|
-
A client library for interacting with LM Studio's native v1 REST API.
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
def __init__(self, base_url: str = "http://localhost:1234", api_token: Optional[str] = None):
|
|
36
|
-
"""
|
|
37
|
-
Initializes the LM Studio API Client.
|
|
38
|
-
|
|
39
|
-
Args:
|
|
40
|
-
base_url (str): The base URL where your LM Studio local server is running.
|
|
41
|
-
By default, the server is available at http://localhost:1234.
|
|
42
|
-
api_token (str, optional): The LM_API_TOKEN authorization bearer token if required.
|
|
43
|
-
Passed as an Authorization header[cite: 57, 58].
|
|
44
|
-
"""
|
|
45
|
-
import requests
|
|
46
|
-
self.base_url = base_url.rstrip('/')
|
|
47
|
-
self.session = requests.Session()
|
|
48
|
-
|
|
49
|
-
self.session.headers.update({
|
|
50
|
-
"Content-Type": "application/json"
|
|
51
|
-
})
|
|
52
|
-
if api_token:
|
|
53
|
-
self.session.headers.update({
|
|
54
|
-
"Authorization": f"Bearer {api_token}"
|
|
55
|
-
})
|
|
56
|
-
|
|
57
|
-
def _request(self, method: str, endpoint: str, json_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
|
58
|
-
"""Internal helper to process HTTP requests cleanly."""
|
|
59
|
-
url = f"{self.base_url}{endpoint}"
|
|
60
|
-
try:
|
|
61
|
-
response = self.session.request(method=method, url=url, json=json_data)
|
|
62
|
-
if response.status_code not in (200, 201):
|
|
63
|
-
raise LMStdError(response.status_code, response.text)
|
|
64
|
-
return response.json()
|
|
65
|
-
except Exception as e:
|
|
66
|
-
if isinstance(e, LMStdError):
|
|
67
|
-
raise e
|
|
68
|
-
raise RuntimeError(f"Failed to connect or process request to {url}: {e}")
|
|
69
|
-
|
|
70
|
-
def chat(
|
|
71
|
-
self,
|
|
72
|
-
model: Optional[str] = None,
|
|
73
|
-
input_data: Optional[Union[str, List[Dict[str, Any]]]] = None,
|
|
74
|
-
system_prompt: Optional[str] = None,
|
|
75
|
-
integrations: Optional[List[Union[str, Dict[str, Any]]]] = None,
|
|
76
|
-
headers: Optional[Dict[str, str]] = None,
|
|
77
|
-
temperature: Optional[float] = None,
|
|
78
|
-
top_p: Optional[float] = None,
|
|
79
|
-
top_k: Optional[int] = None,
|
|
80
|
-
min_p: Optional[float] = None,
|
|
81
|
-
repeat_penalty: Optional[float] = None,
|
|
82
|
-
max_output_tokens: Optional[int] = None,
|
|
83
|
-
reasoning: Optional[str] = None,
|
|
84
|
-
context_length: Optional[int] = None,
|
|
85
|
-
store: bool = True,
|
|
86
|
-
previous_response_id: Optional[str] = None
|
|
87
|
-
) -> Dict[str, Any]:
|
|
88
|
-
"""
|
|
89
|
-
POST /api/v1/chat
|
|
90
|
-
Send a message to a model and receive a full response.
|
|
91
|
-
The /api/v1/chat endpoint is stateful by default, storing and managing context automatically.
|
|
92
|
-
|
|
93
|
-
Args:
|
|
94
|
-
model (str): Unique identifier for the model to use.
|
|
95
|
-
input_data (str or list): Text message string or an array of input items (messages/images).
|
|
96
|
-
Images can be passed using 'type': 'image' and 'data_url'[cite: 642, 656, 660].
|
|
97
|
-
system_prompt (str, optional): System message that sets model behavior or instructions.
|
|
98
|
-
integrations (list, optional): List of integrations (plugins, ephemeral MCP servers) to enable for this request.
|
|
99
|
-
headers (dict, optional): Custom HTTP headers to send with requests to the server.
|
|
100
|
-
temperature (float, optional): Randomness in token selection (0 is deterministic, [0,1]).
|
|
101
|
-
top_p (float, optional): Minimum cumulative probability for the possible next tokens [0,1].
|
|
102
|
-
top_k (int, optional): Limits next token selection to top-k most probable tokens.
|
|
103
|
-
min_p (float, optional): Minimum base probability for a token to be selected for output [0,1].
|
|
104
|
-
repeat_penalty (float, optional): Penalty for repeating token sequences. 1 is no penalty.
|
|
105
|
-
max_output_tokens (int, optional): Maximum number of tokens to generate.
|
|
106
|
-
reasoning (str, optional): Reasoning setting ('off', 'low', 'medium', 'high', 'on').
|
|
107
|
-
context_length (int, optional): Number of tokens to consider as context. Higher values recommended for MCP usage.
|
|
108
|
-
store (bool, optional): Whether to store the chat. If set to true, response will return a 'response_id' field.
|
|
109
|
-
previous_response_id (str, optional): Identifier of existing response to append to. Must start with "resp_".
|
|
110
|
-
|
|
111
|
-
Returns:
|
|
112
|
-
Dict[str, Any]: Response fields containing 'model_instance_id', an 'output' array (messages, tool_calls, reasoning),
|
|
113
|
-
'stats' (token usage/metrics), and an optional 'response_id'[cite: 753, 756, 804, 837].
|
|
114
|
-
"""
|
|
115
|
-
model = model or os.environ.get("LMSTD_MODEL")
|
|
116
|
-
if not model:
|
|
117
|
-
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
118
|
-
if input_data is None:
|
|
119
|
-
raise ValueError("input_data must be provided.")
|
|
120
|
-
|
|
121
|
-
payload = {
|
|
122
|
-
"model": model,
|
|
123
|
-
"input": input_data,
|
|
124
|
-
"stream": False,
|
|
125
|
-
"store": store
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
if system_prompt is not None: payload["system_prompt"] = system_prompt
|
|
129
|
-
if integrations is not None: payload["integrations"] = integrations
|
|
130
|
-
if headers is not None: payload["headers"] = headers
|
|
131
|
-
if temperature is not None: payload["temperature"] = temperature
|
|
132
|
-
if top_p is not None: payload["top_p"] = top_p
|
|
133
|
-
if top_k is not None: payload["top_k"] = top_k
|
|
134
|
-
if min_p is not None: payload["min_p"] = min_p
|
|
135
|
-
if repeat_penalty is not None: payload["repeat_penalty"] = repeat_penalty
|
|
136
|
-
if max_output_tokens is not None: payload["max_output_tokens"] = max_output_tokens
|
|
137
|
-
if reasoning is not None: payload["reasoning"] = reasoning
|
|
138
|
-
if context_length is not None: payload["context_length"] = context_length
|
|
139
|
-
if previous_response_id is not None: payload["previous_response_id"] = previous_response_id
|
|
140
|
-
|
|
141
|
-
return self._request("POST", "/api/v1/chat", json_data=payload)
|
|
142
|
-
|
|
143
|
-
def chat_stream(
|
|
144
|
-
self,
|
|
145
|
-
model: Optional[str] = None,
|
|
146
|
-
input_data: Optional[Union[str, List[Dict[str, Any]]]] = None,
|
|
147
|
-
**kwargs
|
|
148
|
-
) -> Iterator[Dict[str, Any]]:
|
|
149
|
-
"""
|
|
150
|
-
POST /api/v1/chat (Streaming)
|
|
151
|
-
Send a message to a model with `stream` set to true. The response is sent as a stream of events using Server-Sent Events (SSE).
|
|
152
|
-
|
|
153
|
-
Args:
|
|
154
|
-
model (str): Unique identifier for the model to use.
|
|
155
|
-
input_data (str or list): Text message string or an array of input items.
|
|
156
|
-
**kwargs: Additional parameters matching the `chat` function (e.g., system_prompt, integrations, store, temperature, etc.).
|
|
157
|
-
|
|
158
|
-
Yields:
|
|
159
|
-
Dict[str, Any]: Parsed JSON objects corresponding to streaming events. Events arrive in order and include:
|
|
160
|
-
'chat.start', 'model_load.*', 'prompt_processing.*', 'reasoning.*', 'tool_call.*', 'message.*',
|
|
161
|
-
'error', and finally 'chat.end'[cite: 211, 216, 217, 220, 227, 238].
|
|
162
|
-
"""
|
|
163
|
-
model = model or os.environ.get("LMSTD_MODEL")
|
|
164
|
-
if not model:
|
|
165
|
-
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
166
|
-
if input_data is None:
|
|
167
|
-
raise ValueError("input_data must be provided.")
|
|
168
|
-
|
|
169
|
-
url = f"{self.base_url}/api/v1/chat"
|
|
170
|
-
payload = {
|
|
171
|
-
"model": model,
|
|
172
|
-
"input": input_data,
|
|
173
|
-
"stream": True,
|
|
174
|
-
"store": kwargs.get("store", True)
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
for key in ["system_prompt", "integrations", "headers", "temperature", "top_p", "top_k",
|
|
178
|
-
"min_p", "repeat_penalty", "max_output_tokens", "reasoning",
|
|
179
|
-
"context_length", "previous_response_id"]:
|
|
180
|
-
if key in kwargs and kwargs[key] is not None:
|
|
181
|
-
payload[key] = kwargs[key]
|
|
182
|
-
|
|
183
|
-
try:
|
|
184
|
-
response = self.session.post(url, json=payload, stream=True)
|
|
185
|
-
if response.status_code not in (200, 201):
|
|
186
|
-
raise LMStdError(response.status_code, response.text)
|
|
187
|
-
|
|
188
|
-
for line in response.iter_lines():
|
|
189
|
-
if line:
|
|
190
|
-
decoded_line = line.decode('utf-8')
|
|
191
|
-
if decoded_line.startswith('data: '):
|
|
192
|
-
data_str = decoded_line[6:].strip()
|
|
193
|
-
if data_str:
|
|
194
|
-
yield json.loads(data_str)
|
|
195
|
-
|
|
196
|
-
except Exception as e:
|
|
197
|
-
if isinstance(e, LMStdError):
|
|
198
|
-
raise e
|
|
199
|
-
raise RuntimeError(f"Failed to connect or stream request to {url}: {e}")
|
|
200
|
-
|
|
201
|
-
def list_models(self) -> Dict[str, Any]:
|
|
202
|
-
"""
|
|
203
|
-
GET /api/v1/models
|
|
204
|
-
Get a list of available models on your system, including both LLMs and embedding models.
|
|
205
|
-
|
|
206
|
-
Returns:
|
|
207
|
-
Dict[str, Any]: JSON object containing a list of available models, their configs (context_length,
|
|
208
|
-
architecture, format), and currently loaded instances[cite: 1102, 1118, 1130, 1134].
|
|
209
|
-
"""
|
|
210
|
-
return self._request("GET", "/api/v1/models")
|
|
211
|
-
|
|
212
|
-
def load_model(
|
|
213
|
-
self,
|
|
214
|
-
model: Optional[str] = None,
|
|
215
|
-
context_length: Optional[int] = None,
|
|
216
|
-
eval_batch_size: Optional[int] = None,
|
|
217
|
-
flash_attention: Optional[bool] = None,
|
|
218
|
-
num_experts: Optional[int] = None,
|
|
219
|
-
offload_kv_cache_to_gpu: Optional[bool] = None,
|
|
220
|
-
echo_load_config: bool = False
|
|
221
|
-
) -> Dict[str, Any]:
|
|
222
|
-
"""
|
|
223
|
-
POST /api/v1/models/load
|
|
224
|
-
Load an LLM or Embedding model into memory with custom configuration for inference.
|
|
225
|
-
|
|
226
|
-
Args:
|
|
227
|
-
model (str): Unique identifier for the model to load.
|
|
228
|
-
context_length (int, optional): Maximum number of tokens that the model will consider.
|
|
229
|
-
eval_batch_size (int, optional): Number of input tokens to process together in a single batch during evaluation.
|
|
230
|
-
flash_attention (bool, optional): Whether to optimize attention computation. Can decrease memory usage and improve speed.
|
|
231
|
-
num_experts (int, optional): Number of experts to use during inference for MoE (Mixture of Experts) models.
|
|
232
|
-
offload_kv_cache_to_gpu (bool, optional): Whether KV cache is offloaded to GPU memory.
|
|
233
|
-
echo_load_config (bool, optional): If true, echoes the final load configuration in the response.
|
|
234
|
-
|
|
235
|
-
Returns:
|
|
236
|
-
Dict[str, Any]: Response featuring 'type', 'instance_id', 'load_time_seconds', 'status', and optionally 'load_config'[cite: 1251, 1261].
|
|
237
|
-
"""
|
|
238
|
-
model = model or os.environ.get("LMSTD_MODEL")
|
|
239
|
-
if not model:
|
|
240
|
-
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
241
|
-
|
|
242
|
-
payload = {
|
|
243
|
-
"model": model,
|
|
244
|
-
"echo_load_config": echo_load_config
|
|
245
|
-
}
|
|
246
|
-
if context_length is not None: payload["context_length"] = context_length
|
|
247
|
-
if eval_batch_size is not None: payload["eval_batch_size"] = eval_batch_size
|
|
248
|
-
if flash_attention is not None: payload["flash_attention"] = flash_attention
|
|
249
|
-
if num_experts is not None: payload["num_experts"] = num_experts
|
|
250
|
-
if offload_kv_cache_to_gpu is not None: payload["offload_kv_cache_to_gpu"] = offload_kv_cache_to_gpu
|
|
251
|
-
|
|
252
|
-
return self._request("POST", "/api/v1/models/load", json_data=payload)
|
|
253
|
-
|
|
254
|
-
def unload_model(self, instance_id: str) -> Dict[str, Any]:
|
|
255
|
-
"""
|
|
256
|
-
POST /api/v1/models/unload
|
|
257
|
-
Unload a loaded model from memory.
|
|
258
|
-
|
|
259
|
-
Args:
|
|
260
|
-
instance_id (str): Unique identifier of the model instance to unload.
|
|
261
|
-
|
|
262
|
-
Returns:
|
|
263
|
-
Dict[str, Any]: Confirmation of the unloaded model 'instance_id'.
|
|
264
|
-
"""
|
|
265
|
-
payload = {"instance_id": instance_id}
|
|
266
|
-
return self._request("POST", "/api/v1/models/unload", json_data=payload)
|
|
267
|
-
|
|
268
|
-
def download_model(self, model: Optional[str] = None, quantization: Optional[str] = None) -> Dict[str, Any]:
|
|
269
|
-
"""
|
|
270
|
-
POST /api/v1/models/download
|
|
271
|
-
Download LLMs and embedding models.
|
|
272
|
-
|
|
273
|
-
Args:
|
|
274
|
-
model (str): The model to download. Accepts model catalog identifiers and exact Hugging Face links.
|
|
275
|
-
quantization (str, optional): Quantization level of the model to download (e.g. 'Q4_K_M'). Only supported for Hugging Face links.
|
|
276
|
-
|
|
277
|
-
Returns:
|
|
278
|
-
Dict[str, Any]: Returns a download job status object (e.g., 'job_id', 'status', 'total_size_bytes', 'started_at')[cite: 1321, 1333, 1335].
|
|
279
|
-
"""
|
|
280
|
-
model = model or os.environ.get("LMSTD_MODEL")
|
|
281
|
-
if not model:
|
|
282
|
-
raise ValueError("Model must be provided or set via the LMSTD_MODEL environment variable.")
|
|
283
|
-
|
|
284
|
-
payload = {"model": model}
|
|
285
|
-
if quantization is not None:
|
|
286
|
-
payload["quantization"] = quantization
|
|
287
|
-
return self._request("POST", "/api/v1/models/download", json_data=payload)
|
|
288
|
-
|
|
289
|
-
def get_download_status(self, job_id: str) -> Dict[str, Any]:
|
|
290
|
-
"""
|
|
291
|
-
GET /api/v1/models/download/status/:job_id
|
|
292
|
-
Get the status of model downloads.
|
|
293
|
-
|
|
294
|
-
Args:
|
|
295
|
-
job_id (str): The unique identifier of the download job.
|
|
296
|
-
|
|
297
|
-
Returns:
|
|
298
|
-
Dict[str, Any]: Download job status object including 'status', 'bytes_per_second', 'total_size_bytes',
|
|
299
|
-
'downloaded_bytes', 'estimated_completion', etc[cite: 1391, 1394, 1396, 1400, 1402].
|
|
300
|
-
"""
|
|
301
|
-
return self._request("GET", f"/api/v1/models/download/status/{job_id}")
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
# --- Basic Usage Verification Example ---
|
|
305
|
-
if __name__ == "__main__":
|
|
306
|
-
client = LMStd(api_token=os.environ.get("LMSTD_APIKEY"))
|
|
307
|
-
|
|
308
|
-
print("1. Listing system models...")
|
|
309
|
-
try:
|
|
310
|
-
models = client.list_models()
|
|
311
|
-
print(json.dumps(models, indent=2))
|
|
312
|
-
|
|
313
|
-
except Exception as error:
|
|
314
|
-
print(f"Server communication failed: {error}")
|
|
315
|
-
|
|
316
|
-
input()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|