lyceum-cli 1.0.28__py3-none-any.whl → 1.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyceum/external/compute/execution/gpu_selection.py +1023 -0
- lyceum/external/compute/inference/batch.py +94 -304
- lyceum/external/compute/inference/chat.py +104 -189
- lyceum/external/compute/inference/infer.py +101 -0
- lyceum/external/compute/inference/models.py +26 -199
- lyceum/main.py +6 -1
- lyceum/shared/config.py +5 -9
- lyceum/shared/streaming.py +45 -17
- {lyceum_cli-1.0.28.dist-info → lyceum_cli-1.0.29.dist-info}/METADATA +1 -1
- {lyceum_cli-1.0.28.dist-info → lyceum_cli-1.0.29.dist-info}/RECORD +13 -11
- {lyceum_cli-1.0.28.dist-info → lyceum_cli-1.0.29.dist-info}/WHEEL +1 -1
- {lyceum_cli-1.0.28.dist-info → lyceum_cli-1.0.29.dist-info}/entry_points.txt +0 -0
- {lyceum_cli-1.0.28.dist-info → lyceum_cli-1.0.29.dist-info}/top_level.txt +0 -0

lyceum/external/compute/inference/chat.py
@@ -1,221 +1,136 @@
-"""
+"""Inference chat command"""

 import json
+import os
+from pathlib import Path

 import typer
 from rich.console import Console
-from rich.table import Table

 from ....shared.config import config

 console = Console()

-
-
-
-
-
-
-
-
-
-
-
-    temperature: float = typer.Option(0.7, "--temperature", "-t", help="Temperature (0.0-2.0)"),
-    system_prompt: str | None = typer.Option(None, "--system", "-s", help="System prompt"),
+def chat_cmd(
+    prompt: str = typer.Option(None, "--prompt", "-p", help="The message or path to file (.txt/.yaml/.xml)"),
+    no_stream: bool = typer.Option(False, "--no-stream", "-n", help="Disable streaming response"),
+    image: str = typer.Option(None, "--image", "-i", help="Image path or base64"),
+    image_url: str = typer.Option(None, "--url", help="Image URL"),
+    image_dir: str = typer.Option(None, "--dir", help="Directory of images"),
+    base64: bool = typer.Option(False, "--base64", help="Treat image input as base64"),
+    model: str = typer.Option("gpt-4", "--model", "-m", help="Model to use"),
+    max_tokens: int = typer.Option(1000, "--tokens", "-t", help="Max output tokens"),
+    output_type: str = typer.Option("text", "--type", help="Output type (e.g. json, markdown)"),
+    batch_file: str = typer.Option(None, "--batch", "-b", help="JSONL file for batch processing"),
 ):
-    """
+    """
+    Perform inference (Chat, Image, or Batch).
+    """
     try:
         config.get_client()
-
-        # Create the sync request payload directly
-        sync_request = {
-            "model_id": model,
-            "input": {
-                "text": message,
-                "parameters": {"system_prompt": system_prompt} if system_prompt else {}
-            },
-            "max_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": 1.0,
-            "stream": False
-        }
-
-        console.print(f"[dim]🤖 Sending message to {model}...[/dim]")
-
-        # Make the API call using httpx directly (since we don't have generated client for sync inference yet)
         import httpx

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        usage = result['usage']
-        console.print(f"[dim]📊 Tokens: {usage.get('total_tokens', 0)} | "
-                      f"Latency: {result.get('latency_ms', 0)}ms | "
-                      f"Cost: ${result.get('cost', 0):.4f}[/dim]")
-
-            elif response.status_code == 503:
-                console.print(f"[red]❌ Model {model} is not running. Please contact support to start the model.[/red]")
+        # 1. Batch Processing
+        if batch_file:
+            console.print(f"[dim]Initiating batch processing from {batch_file}...[/dim]")
+            if not os.path.exists(batch_file):
+                console.print(f"[red]File not found: {batch_file}[/red]")
+                raise typer.Exit(1)
+
+            # Upload
+            with open(batch_file, 'rb') as f:
+                files = {'file': (os.path.basename(batch_file), f, 'application/jsonl')}
+                response = httpx.post(
+                    f"{config.base_url}/api/v2/external/files",
+                    headers={"Authorization": f"Bearer {config.api_key}"},
+                    files=files,
+                    data={'purpose': 'batch'},
+                    timeout=60.0
+                )
+            if response.status_code != 200:
+                console.print(f"[red]Upload failed: {response.text}[/red]")
                 raise typer.Exit(1)
-            else:
-                console.print(f"[red]❌ Error: HTTP {response.status_code}[/red]")
-                console.print(f"[red]{response.text}[/red]")
-                raise typer.Exit(1)
-
-    except Exception as e:
-        console.print(f"[red]❌ Error: {e}[/red]")
-        raise typer.Exit(1)

+            file_id = response.json()['id']

-
-
-
-
-
-
-
-
-
-        url = f"{config.base_url}/api/v2/external/models/"
-        headers = {"Authorization": f"Bearer {config.api_key}"}
-
-        with httpx.Client() as http_client:
-            response = http_client.get(url, headers=headers, timeout=10.0)
-
-            if response.status_code == 200:
-                models = response.json()
-
-                if models:
-                    # Create a table
-                    table = Table(title="Available AI Models")
-                    table.add_column("Model", style="cyan", no_wrap=True)
-                    table.add_column("Type", style="magenta")
-                    table.add_column("Status", justify="center")
-                    table.add_column("Price/1K tokens", justify="right", style="green")
-
-                    # Sort models: running first, then by type, then by name
-                    sorted_models = sorted(models, key=lambda m: (
-                        not m.get('available', False),  # Running models first
-                        m.get('type', 'text'),  # Then by type
-                        m.get('model_id', '')  # Then by name
-                    ))
-
-                    for model in sorted_models:
-                        # Status with emoji
-                        status = "🟢 Running" if model.get('available') else "🔴 Stopped"
-
-                        # Model type with emoji
-                        model_type = model.get('type', 'text')
-                        type_emoji = {
-                            'text': 'Text',
-                            'image': 'Image',
-                            'audio': 'Audio',
-                            'multimodal': 'Multi',
-                            'embedding': 'Embed'
-                        }.get(model_type, f'❓ {model_type.title()}')
-
-                        # Price
-                        price = model.get('price_per_1k_tokens', 0)
-                        price_str = f"${price:.4f}" if price > 0 else "Free"
-
-                        table.add_row(
-                            model.get('model_id', 'Unknown'),
-                            type_emoji,
-                            status,
-                            price_str
-                        )
-
-                    console.print(table)
-
-                    # Show summary
-                    running_count = sum(1 for m in models if m.get('available'))
-                    total_count = len(models)
-                    console.print(f"\n[dim]📊 {running_count}/{total_count} models running[/dim]")
-
-                else:
-                    console.print("[yellow]No models are currently available[/yellow]")
-            else:
-                console.print(f"[red]❌ Error: HTTP {response.status_code}[/red]")
-                console.print(f"[red]{response.text}[/red]")
+            # Create Batch
+            response = httpx.post(
+                f"{config.base_url}/api/v2/external/batches",
+                headers={"Authorization": f"Bearer {config.api_key}"},
+                json={"input_file_id": file_id, "model": model},
+                timeout=30.0
+            )
+            if response.status_code != 200:
+                console.print(f"[red]Batch creation failed: {response.text}[/red]")
                 raise typer.Exit(1)

-
-
-
+            data = response.json()
+            console.print(f"[green]Batch Job Created: {data['id']}[/green]")
+            return
+
+        # 2. Image Analysis
+        if image or image_url or image_dir:
+            if image_dir:
+                console.print("[yellow]Directory processing not yet implemented[/yellow]")
+                return
+
+            console.print(f"[dim]Analyzing image with {model}...[/dim]")
+
+            img_input = image_url if image_url else image
+            payload = {
+                "model_id": model,
+                "input": {
+                    "text": prompt or "Describe this image",
+                    # Simple heuristic: if it looks like a URL, treat as URL, else file/base64 logic
+                    "image_url": img_input
+                },
+                "max_tokens": max_tokens,
+                "stream": not no_stream
+            }
+
+            url = f"{config.base_url}/api/v2/external/sync/"
+            headers = {"Authorization": f"Bearer {config.api_key}"}
+
+            with httpx.Client() as client:
+                response = client.post(url, json=payload, headers=headers, timeout=60.0)
+                if response.status_code != 200:
+                    console.print(f"[red]Error: {response.text}[/red]")
+                    raise typer.Exit(1)

+                result = response.json()
+                console.print(f"[cyan]{result.get('output', '')}[/cyan]")
+                return

-
-
-
-
-
-    raw_output: bool = typer.Option(False, "--raw", help="Return full model response instead of just content"),
-):
-    """Analyze an image with AI vision models"""
-    try:
-        config.get_client()
-
-        # Create request payload for image analysis
-        sync_request = {
-            "model_id": model,
-            "input": {
-                "text": prompt,
-                "image_url": image_url
-            },
-            "max_tokens": 1000,
-            "temperature": 0.7,
-            "raw_output": raw_output
-        }
+        # 3. Text Chat (Prompt)
+        if prompt:
+            # Check if prompt is a file
+            if os.path.exists(prompt):
+                prompt = Path(prompt).read_text()

-
+            console.print(f"[dim]Sending message to {model}...[/dim]")

-
+            payload = {
+                "model_id": model,
+                "input": {"text": prompt},
+                "max_tokens": max_tokens,
+                "stream": not no_stream
+            }

-
-
+            url = f"{config.base_url}/api/v2/external/sync/"
+            headers = {"Authorization": f"Bearer {config.api_key}"}

-
-
-
-
-
-                timeout=60.0
-            )
+            with httpx.Client() as client:
+                response = client.post(url, json=payload, headers=headers, timeout=60.0)
+                if response.status_code != 200:
+                    console.print(f"[red]Error: {response.text}[/red]")
+                    raise typer.Exit(1)

-            if response.status_code == 200:
                 result = response.json()
+                console.print(f"[cyan]{result.get('output', '')}[/cyan]")
+                return

-
-                console.print("[green]✅ Raw Response:[/green]")
-                console.print(json.dumps(result.get('raw_response', result['output']), indent=2))
-            else:
-                console.print("[green]✅ Image Analysis:[/green]")
-                console.print(f"[cyan]{result['output']}[/cyan]")
-
-            elif response.status_code == 503:
-                console.print(f"[red]❌ Vision model {model} is not running.[/red]")
-                raise typer.Exit(1)
-            else:
-                console.print(f"[red]❌ Error: HTTP {response.status_code}[/red]")
-                console.print(f"[red]{response.text}[/red]")
-                raise typer.Exit(1)
+        console.print("[yellow]Please provide input via --prompt, --image, or --batch[/yellow]")

     except Exception as e:
-        console.print(f"[red]❌ Error: {e}[/red]")
+        console.print(f"[red]Error: {e}[/red]")
         raise typer.Exit(1)
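
The rewritten chat command routes all three input modes (prompt, image, batch) through the same `/api/v2/external/sync/` endpoint with a small JSON payload. As a rough illustration of that request shape outside the CLI, here is a minimal sketch; the base URL, API key, and the `output` field of the response are assumptions taken from the diff, not documented guarantees:

# Minimal sketch of the sync inference call that chat_cmd builds above.
# BASE_URL, API_KEY, and the 'output' response field are assumptions from the diff.
import httpx

BASE_URL = "https://example.lyceum.api"  # stand-in for config.base_url
API_KEY = "YOUR_API_KEY"                 # stand-in for config.api_key

payload = {
    "model_id": "gpt-4",
    "input": {"text": "Summarize the release notes in one sentence."},
    "max_tokens": 1000,
    "stream": False,  # chat_cmd derives this from --no-stream
}

with httpx.Client() as client:
    response = client.post(
        f"{BASE_URL}/api/v2/external/sync/",
        json=payload,
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=60.0,
    )
    response.raise_for_status()
    # The CLI prints result.get('output', ''); mirror that here.
    print(response.json().get("output", ""))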

lyceum/external/compute/inference/infer.py (new file)
@@ -0,0 +1,101 @@
+"""Inference command group"""
+import typer
+import os
+from rich.console import Console
+import httpx
+from ....shared.config import config
+from .chat import chat_cmd
+from .models import models_cmd
+from .batch import jobs_cmd
+
+console = Console()
+
+infer_app = typer.Typer(name="infer", help="Inference commands")
+
+# Mount sub-apps / commands
+infer_app.command("chat")(chat_cmd)
+infer_app.command("models")(models_cmd)
+infer_app.command("jobs")(jobs_cmd)
+
+@infer_app.command("deploy")
+def deploy_model(
+    hf_model_id: str = typer.Argument(..., help="HuggingFace model ID to deploy"),
+    vllm_config: str = typer.Option(None, "--config", "-c", help="Name of server-side vLLM config to use"),
+    hf_token: str = typer.Option(None, "--hf-token", "-t", help="HuggingFace token to use for deployment"),
+):
+    """Deploy a model for inference"""
+    # Interactive prompt for token if not provided and not in env
+    if hf_token is None and not (os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")):
+        if typer.confirm("No HF token detected. Do you want to provide one (required for private models)?", default=False):
+            hf_token = typer.prompt("HuggingFace Token", hide_input=True)
+
+    try:
+        config.get_client()
+
+        url = f"{config.base_url}/api/v2/external/inference/deploy"
+        headers = {"Authorization": f"Bearer {config.api_key}"}
+
+        payload = {
+            "hf_model_id": hf_model_id
+        }
+        if vllm_config:
+            payload["vllm_config_name"] = vllm_config
+        if hf_token:
+            payload["hf_token"] = hf_token
+
+        console.print(f"[dim]Deploying model {hf_model_id}...[/dim]")
+
+        with httpx.Client() as client:
+            response = client.post(url, json=payload, headers=headers, timeout=30.0)
+
+        if response.status_code != 200:
+            console.print(f"[red]Error: HTTP {response.status_code}[/red]")
+            console.print(f"[red]{response.text}[/red]")
+            raise typer.Exit(1)
+
+        data = response.json()
+
+        console.print(f"[green]Deployment initiated![/green]")
+        console.print(f"Model ID: [cyan]{data.get('model_id')}[/cyan]")
+        console.print(f"Deployment ID: [cyan]{data.get('deployment_id')}[/cyan]")
+        console.print(f"Status: [yellow]{data.get('status')}[/yellow]")
+        console.print(f"Instance URL: [blue]{data.get('instance_url')}[/blue]")
+        console.print("\n[dim]Use 'lyceum infer models' to check status[/dim]")
+
+    except Exception as e:
+        console.print(f"[red]Error: {e}[/red]")
+        raise typer.Exit(1)
+
+@infer_app.command("spindown")
+def spindown_model(
+    model_id: str = typer.Argument(..., help="Model ID to spin down"),
+):
+    """Spin down a deployed model"""
+    try:
+        config.get_client()
+
+        url = f"{config.base_url}/api/v2/external/inference/spindown"
+        headers = {"Authorization": f"Bearer {config.api_key}"}
+
+        payload = {"model_id": model_id}
+
+        console.print(f"[dim]Spinning down model {model_id}...[/dim]")
+
+        with httpx.Client() as client:
+            response = client.post(url, json=payload, headers=headers, timeout=30.0)
+
+        if response.status_code != 200:
+            console.print(f"[red]Error: HTTP {response.status_code}[/red]")
+            console.print(f"[red]{response.text}[/red]")
+            raise typer.Exit(1)
+
+        data = response.json()
+
+        console.print(f"[green]Spindown initiated![/green]")
+        console.print(f"Model ID: [cyan]{data.get('model_id')}[/cyan]")
+        console.print(f"Status: [yellow]{data.get('status')}[/yellow]")
+        console.print(f"Message: {data.get('message')}")
+
+    except Exception as e:
+        console.print(f"[red]Error: {e}[/red]")
+        raise typer.Exit(1)
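
The new infer.py mostly mounts the existing chat, models, and jobs commands on an `infer` Typer group, plus the deploy and spindown commands shown above. The wiring can be confirmed locally without touching the API using Typer's test runner; the import path below follows the wheel's RECORD and is an assumption about the installed layout:

# Sketch: inspect the commands exposed by the new `infer` group without
# hitting the API. Import path assumes the package layout shown in this diff.
from typer.testing import CliRunner

from lyceum.external.compute.inference.infer import infer_app

runner = CliRunner()
result = runner.invoke(infer_app, ["--help"])
print(result.output)  # expected to list: chat, models, jobs, deploy, spindown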