truss 0.11.1rc14__py3-none-any.whl → 0.11.2rc500__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of truss might be problematic. Click here for more details.
- truss/base/constants.py +0 -1
- truss/cli/train/core.py +0 -156
- truss/cli/train/deploy_checkpoints/deploy_checkpoints.py +1 -13
- truss/cli/train_commands.py +0 -72
- truss/templates/base.Dockerfile.jinja +3 -1
- truss/templates/control/control/endpoints.py +33 -82
- truss/templates/control/requirements.txt +1 -1
- truss/templates/server/common/errors.py +0 -1
- truss/templates/server/entrypoint.sh +16 -0
- truss/templates/server/truss_server.py +3 -5
- truss/templates/server.Dockerfile.jinja +4 -2
- truss/tests/cli/train/test_deploy_checkpoints.py +3 -3
- truss/tests/templates/control/control/test_endpoints.py +14 -20
- {truss-0.11.1rc14.dist-info → truss-0.11.2rc500.dist-info}/METADATA +1 -1
- {truss-0.11.1rc14.dist-info → truss-0.11.2rc500.dist-info}/RECORD +20 -22
- truss_chains/public_types.py +0 -1
- truss_chains/remote_chainlet/utils.py +0 -18
- truss/templates/train/config.py +0 -46
- truss/templates/train/run.sh +0 -11
- truss/tests/cli/train/test_train_init.py +0 -499
- {truss-0.11.1rc14.dist-info → truss-0.11.2rc500.dist-info}/WHEEL +0 -0
- {truss-0.11.1rc14.dist-info → truss-0.11.2rc500.dist-info}/entry_points.txt +0 -0
- {truss-0.11.1rc14.dist-info → truss-0.11.2rc500.dist-info}/licenses/LICENSE +0 -0
truss/base/constants.py
CHANGED
|
@@ -18,7 +18,6 @@ SHARED_SERVING_AND_TRAINING_CODE_DIR: pathlib.Path = (
|
|
|
18
18
|
CONTROL_SERVER_CODE_DIR: pathlib.Path = TEMPLATES_DIR / "control"
|
|
19
19
|
CHAINS_CODE_DIR: pathlib.Path = _TRUSS_ROOT.parent / "truss-chains" / "truss_chains"
|
|
20
20
|
TRUSS_CODE_DIR: pathlib.Path = _TRUSS_ROOT.parent / "truss"
|
|
21
|
-
TRAINING_TEMPLATE_DIR = TEMPLATES_DIR / "train"
|
|
22
21
|
# Must be sorted ascendingly.
|
|
23
22
|
SUPPORTED_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
24
23
|
|
truss/cli/train/core.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import base64
|
|
2
1
|
import json
|
|
3
2
|
import os
|
|
4
3
|
import tarfile
|
|
@@ -9,7 +8,6 @@ from pathlib import Path
|
|
|
9
8
|
from typing import Any, Callable, Dict, Optional, Tuple
|
|
10
9
|
|
|
11
10
|
import click
|
|
12
|
-
import requests
|
|
13
11
|
import rich
|
|
14
12
|
from InquirerPy import inquirer
|
|
15
13
|
from rich.text import Text
|
|
@@ -357,7 +355,6 @@ def download_training_job_data(
|
|
|
357
355
|
temp_path.write_bytes(content)
|
|
358
356
|
|
|
359
357
|
unzip_dir = output_dir / artifact_base_name
|
|
360
|
-
unzip_dir = Path(str(unzip_dir).replace(" ", "-"))
|
|
361
358
|
if unzip_dir.exists():
|
|
362
359
|
raise click.ClickException(
|
|
363
360
|
f"Directory '{unzip_dir}' already exists. "
|
|
@@ -370,7 +367,6 @@ def download_training_job_data(
|
|
|
370
367
|
|
|
371
368
|
return unzip_dir
|
|
372
369
|
else:
|
|
373
|
-
target_path = Path(str(target_path).replace(" ", "-"))
|
|
374
370
|
target_path.write_bytes(content)
|
|
375
371
|
return target_path
|
|
376
372
|
|
|
@@ -421,158 +417,6 @@ def status_page_url(remote_url: str, training_job_id: str) -> str:
|
|
|
421
417
|
return f"{remote_url}/training/jobs/{training_job_id}"
|
|
422
418
|
|
|
423
419
|
|
|
424
|
-
def _get_all_train_init_example_options(
|
|
425
|
-
repo_id: str = "ml-cookbook",
|
|
426
|
-
examples_subdir: str = "examples",
|
|
427
|
-
token: Optional[str] = None,
|
|
428
|
-
) -> list[str]:
|
|
429
|
-
"""
|
|
430
|
-
Retrieve a list of all example options from the ml-cookbook repository to
|
|
431
|
-
copy locally for training initialization. This method generates a list
|
|
432
|
-
of examples and URL paths to show the user for selection.
|
|
433
|
-
"""
|
|
434
|
-
headers = {}
|
|
435
|
-
if token:
|
|
436
|
-
headers["Authorization"] = f"token {token}"
|
|
437
|
-
|
|
438
|
-
url = (
|
|
439
|
-
f"https://api.github.com/repos/basetenlabs/{repo_id}/contents/{examples_subdir}"
|
|
440
|
-
)
|
|
441
|
-
try:
|
|
442
|
-
response = requests.get(url, headers=headers)
|
|
443
|
-
response.raise_for_status()
|
|
444
|
-
|
|
445
|
-
items = response.json()
|
|
446
|
-
if not isinstance(items, list):
|
|
447
|
-
items = [items]
|
|
448
|
-
items = [item["name"] for item in items if item["type"] == "dir"]
|
|
449
|
-
return items
|
|
450
|
-
|
|
451
|
-
except requests.exceptions.RequestException as e:
|
|
452
|
-
click.echo(
|
|
453
|
-
f"Error exploring directory: {e}. Please file an issue at https://github.com/basetenlabs/truss/issues"
|
|
454
|
-
)
|
|
455
|
-
return []
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
def _get_train_init_example_info(
|
|
459
|
-
repo_id: str = "ml-cookbook",
|
|
460
|
-
examples_subdir: str = "examples",
|
|
461
|
-
example_name: Optional[str] = None,
|
|
462
|
-
token: Optional[str] = None,
|
|
463
|
-
) -> list[Dict[str, str]]:
|
|
464
|
-
"""
|
|
465
|
-
Retrieve directory download links for the example from the ml-cookbook repository to
|
|
466
|
-
copy locally for training initialization.
|
|
467
|
-
"""
|
|
468
|
-
headers = {}
|
|
469
|
-
if token:
|
|
470
|
-
headers["Authorization"] = f"token {token}"
|
|
471
|
-
|
|
472
|
-
url = f"https://api.github.com/repos/basetenlabs/{repo_id}/contents/{examples_subdir}/{example_name}"
|
|
473
|
-
|
|
474
|
-
try:
|
|
475
|
-
response = requests.get(url, headers=headers)
|
|
476
|
-
response.raise_for_status()
|
|
477
|
-
|
|
478
|
-
items = response.json()
|
|
479
|
-
if not isinstance(items, list):
|
|
480
|
-
items = [items]
|
|
481
|
-
return items
|
|
482
|
-
|
|
483
|
-
except requests.exceptions.HTTPError as e:
|
|
484
|
-
if response.status_code == 404:
|
|
485
|
-
# example_name does not exist, return empty list
|
|
486
|
-
return []
|
|
487
|
-
else:
|
|
488
|
-
# Other HTTP errors
|
|
489
|
-
click.echo(
|
|
490
|
-
f"Error exploring directory: {e}. Please file an issue at https://github.com/basetenlabs/truss/issues"
|
|
491
|
-
)
|
|
492
|
-
return []
|
|
493
|
-
except requests.exceptions.RequestException as e:
|
|
494
|
-
# Network or other request errors
|
|
495
|
-
click.echo(
|
|
496
|
-
f"Error exploring directory: {e}. Please file an issue at https://github.com/basetenlabs/truss/issues"
|
|
497
|
-
)
|
|
498
|
-
return []
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
def download_git_directory(
|
|
502
|
-
git_api_url: str, local_dir: str, token: Optional[str] = None
|
|
503
|
-
):
|
|
504
|
-
"""
|
|
505
|
-
Recursively download directory contents from git api url.
|
|
506
|
-
Special handling for 'training' directory: downloads its contents directly
|
|
507
|
-
to local_dir without creating a 'training' subdirectory.
|
|
508
|
-
Args:
|
|
509
|
-
git_api_url (str): Example format "https://api.github.com/repos/basetenlabs/ml-cookbook/contents/examples/llama-finetune-8b-lora?ref=main"
|
|
510
|
-
local_dir(str): Local directory to download this directory to
|
|
511
|
-
"""
|
|
512
|
-
headers = {}
|
|
513
|
-
if token:
|
|
514
|
-
headers["Authorization"] = f"token {token}"
|
|
515
|
-
try:
|
|
516
|
-
response = requests.get(git_api_url, headers=headers)
|
|
517
|
-
response.raise_for_status()
|
|
518
|
-
items = response.json()
|
|
519
|
-
|
|
520
|
-
# Handle single file case
|
|
521
|
-
if not isinstance(items, list):
|
|
522
|
-
items = [items]
|
|
523
|
-
|
|
524
|
-
# Create local directory
|
|
525
|
-
print(f"Creating directory {local_dir}")
|
|
526
|
-
os.makedirs(local_dir, exist_ok=True)
|
|
527
|
-
|
|
528
|
-
# Check if there's a 'training' directory in the items
|
|
529
|
-
training_dir = None
|
|
530
|
-
other_items = []
|
|
531
|
-
|
|
532
|
-
for item in items:
|
|
533
|
-
if item["name"] == "training" and item["type"] == "dir":
|
|
534
|
-
training_dir = item
|
|
535
|
-
else:
|
|
536
|
-
other_items.append(item)
|
|
537
|
-
|
|
538
|
-
# If training directory exists, download its contents directly to local_dir
|
|
539
|
-
if training_dir:
|
|
540
|
-
print(
|
|
541
|
-
f"📁 Found training directory, downloading its contents to {local_dir}"
|
|
542
|
-
)
|
|
543
|
-
return download_git_directory(training_dir["url"], local_dir)
|
|
544
|
-
|
|
545
|
-
# If no training directory, download all files normally
|
|
546
|
-
for item in other_items:
|
|
547
|
-
item_name = item["name"]
|
|
548
|
-
local_item_path = os.path.join(local_dir, item_name)
|
|
549
|
-
|
|
550
|
-
if item["type"] == "file":
|
|
551
|
-
print(f"📄 Downloading {item_name}")
|
|
552
|
-
if item.get("download_url"):
|
|
553
|
-
# Download file directly
|
|
554
|
-
file_response = requests.get(item["download_url"])
|
|
555
|
-
file_response.raise_for_status()
|
|
556
|
-
with open(local_item_path, "wb") as f:
|
|
557
|
-
f.write(file_response.content)
|
|
558
|
-
elif item.get("content"):
|
|
559
|
-
# Decode base64 content (for small files)
|
|
560
|
-
try:
|
|
561
|
-
content = base64.b64decode(item["content"])
|
|
562
|
-
with open(local_item_path, "wb") as f:
|
|
563
|
-
f.write(content)
|
|
564
|
-
except Exception as e:
|
|
565
|
-
print(f"⚠️ Could not decode {item_name}: {e}")
|
|
566
|
-
elif item["type"] == "dir":
|
|
567
|
-
print(f"📁 Entering directory {item_name}")
|
|
568
|
-
# Use the API URL from the response for subdirectories
|
|
569
|
-
download_git_directory(item["url"], local_item_path)
|
|
570
|
-
return True
|
|
571
|
-
except Exception as e:
|
|
572
|
-
print(f"Error processing response: {e}")
|
|
573
|
-
return False
|
|
574
|
-
|
|
575
|
-
|
|
576
420
|
def fetch_project_by_name_or_id(
|
|
577
421
|
remote_provider: BasetenRemote, project_identifier: str
|
|
578
422
|
) -> dict:
|
|
truss/cli/train/deploy_checkpoints/deploy_checkpoints.py
CHANGED
|
@@ -296,22 +296,10 @@ def _get_checkpoint_ids_to_deploy(
|
|
|
296
296
|
return checkpoint_ids
|
|
297
297
|
|
|
298
298
|
|
|
299
|
-
def _select_single_checkpoint(checkpoint_id_options: List[str]) -> List[str]:
|
|
300
|
-
"""Select a single checkpoint using interactive prompt."""
|
|
301
|
-
checkpoint_id = inquirer.select(
|
|
302
|
-
message="Select the checkpoints to deploy:", choices=checkpoint_id_options
|
|
303
|
-
).execute()
|
|
304
|
-
|
|
305
|
-
if not checkpoint_id:
|
|
306
|
-
raise click.UsageError("A checkpoint must be selected.")
|
|
307
|
-
|
|
308
|
-
return [checkpoint_id]
|
|
309
|
-
|
|
310
|
-
|
|
311
299
|
def _select_multiple_checkpoints(checkpoint_id_options: List[str]) -> List[str]:
|
|
312
300
|
"""Select multiple checkpoints using interactive checkbox."""
|
|
313
301
|
checkpoint_ids = inquirer.checkbox(
|
|
314
|
-
message="
|
|
302
|
+
message="Select the checkpoint to deploy. Use spacebar to select/deselect.",
|
|
315
303
|
choices=checkpoint_id_options,
|
|
316
304
|
).execute()
|
|
317
305
|
|
truss/cli/train_commands.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
1
|
import sys
|
|
3
2
|
from pathlib import Path
|
|
4
3
|
from typing import Optional, cast
|
|
@@ -6,7 +5,6 @@ from typing import Optional, cast
|
|
|
6
5
|
import rich_click as click
|
|
7
6
|
|
|
8
7
|
import truss.cli.train.core as train_cli
|
|
9
|
-
from truss.base.constants import TRAINING_TEMPLATE_DIR
|
|
10
8
|
from truss.cli import remote_cli
|
|
11
9
|
from truss.cli.cli import push, truss_cli
|
|
12
10
|
from truss.cli.logs import utils as cli_log_utils
|
|
@@ -27,7 +25,6 @@ from truss.cli.utils.output import console, error_console
|
|
|
27
25
|
from truss.remote.baseten.core import get_training_job_logs_with_pagination
|
|
28
26
|
from truss.remote.baseten.remote import BasetenRemote
|
|
29
27
|
from truss.remote.remote_factory import RemoteFactory
|
|
30
|
-
from truss.util.path import copy_tree_path
|
|
31
28
|
from truss_train import TrainingJob
|
|
32
29
|
|
|
33
30
|
|
|
@@ -384,75 +381,6 @@ def download_checkpoint_artifacts(job_id: Optional[str], remote: Optional[str])
|
|
|
384
381
|
sys.exit(1)
|
|
385
382
|
|
|
386
383
|
|
|
387
|
-
@train.command(name="init")
|
|
388
|
-
@click.option("--list-examples", is_flag=True, help="List all available examples.")
|
|
389
|
-
@click.option("--target-directory", type=str, required=False)
|
|
390
|
-
@click.option("--examples", type=str, required=False)
|
|
391
|
-
@common.common_options()
|
|
392
|
-
def init_training_job(
|
|
393
|
-
list_examples: Optional[bool],
|
|
394
|
-
target_directory: Optional[str],
|
|
395
|
-
examples: Optional[str],
|
|
396
|
-
) -> None:
|
|
397
|
-
try:
|
|
398
|
-
if list_examples:
|
|
399
|
-
all_examples = train_cli._get_all_train_init_example_options()
|
|
400
|
-
console.print("Available training examples:", style="bold")
|
|
401
|
-
for example in all_examples:
|
|
402
|
-
console.print(f"- {example}")
|
|
403
|
-
console.print(
|
|
404
|
-
"To launch, run `truss train init --examples <example1,example2>`",
|
|
405
|
-
style="bold",
|
|
406
|
-
)
|
|
407
|
-
return
|
|
408
|
-
|
|
409
|
-
selected_options = examples.split(",") if examples else []
|
|
410
|
-
|
|
411
|
-
# No examples selected, initialize empty training project structure
|
|
412
|
-
if not selected_options:
|
|
413
|
-
if target_directory is None:
|
|
414
|
-
target_directory = "truss-train-init"
|
|
415
|
-
console.print(f"Initializing empty training project at {target_directory}")
|
|
416
|
-
os.makedirs(target_directory)
|
|
417
|
-
copy_tree_path(Path(TRAINING_TEMPLATE_DIR), Path(target_directory))
|
|
418
|
-
console.print(
|
|
419
|
-
f"✨ Empty training project initialized at {target_directory}",
|
|
420
|
-
style="bold green",
|
|
421
|
-
)
|
|
422
|
-
return
|
|
423
|
-
|
|
424
|
-
if target_directory is None:
|
|
425
|
-
target_directory = os.getcwd()
|
|
426
|
-
for example_to_download in selected_options:
|
|
427
|
-
download_info = train_cli._get_train_init_example_info(
|
|
428
|
-
example_name=example_to_download
|
|
429
|
-
)
|
|
430
|
-
local_dir = os.path.join(target_directory, example_to_download)
|
|
431
|
-
|
|
432
|
-
if not download_info:
|
|
433
|
-
all_examples = train_cli._get_all_train_init_example_options()
|
|
434
|
-
error_console.print(
|
|
435
|
-
f"Example {example_to_download} not found in the ml-cookbook repository. Examples have to be one or more comma separated values from: {', '.join(all_examples)}"
|
|
436
|
-
)
|
|
437
|
-
continue
|
|
438
|
-
success = train_cli.download_git_directory(
|
|
439
|
-
git_api_url=download_info[0]["url"], local_dir=local_dir
|
|
440
|
-
)
|
|
441
|
-
if success:
|
|
442
|
-
console.print(
|
|
443
|
-
f"✨ Training directory for {example_to_download} initialized at {local_dir}",
|
|
444
|
-
style="bold green",
|
|
445
|
-
)
|
|
446
|
-
else:
|
|
447
|
-
error_console.print(
|
|
448
|
-
f"Failed to initialize training artifacts to {local_dir}"
|
|
449
|
-
)
|
|
450
|
-
|
|
451
|
-
except Exception as e:
|
|
452
|
-
error_console.print(f"Failed to initialize training artifacts: {str(e)}")
|
|
453
|
-
sys.exit(1)
|
|
454
|
-
|
|
455
|
-
|
|
456
384
|
@train.group(name="cache")
|
|
457
385
|
def cache():
|
|
458
386
|
"""Cache-related subcommands for truss train"""
|
|
truss/templates/base.Dockerfile.jinja
CHANGED
|
@@ -18,6 +18,8 @@ ENV PYTHON_EXECUTABLE="{{ python_executable }}"
|
|
|
18
18
|
ENV HOME=/home/{{ app_username }}
|
|
19
19
|
{# Directory containing inference server code. #}
|
|
20
20
|
ENV APP_HOME=/{{ app_username }}
|
|
21
|
+
{# Directory for truss-transfer cache #}
|
|
22
|
+
ENV TRUSS_TRANSFER_CACHE_DIR=${APP_HOME}/.cache/truss_transfer
|
|
21
23
|
RUN mkdir -p ${APP_HOME} {{ control_server_dir }}
|
|
22
24
|
{# Create a non-root user to run model containers. #}
|
|
23
25
|
RUN useradd -u {{ app_user_uid }} -ms /bin/bash {{ app_username }}
|
|
@@ -30,7 +32,7 @@ ENV DEBIAN_FRONTEND=noninteractive
|
|
|
30
32
|
{# to allow the non-root user to install packages. #}
|
|
31
33
|
{%- if non_root_user and enable_model_container_admin_commands %}
|
|
32
34
|
RUN apt update && apt install -y sudo
|
|
33
|
-
{%- set allowed_admin_commands = ["/usr/bin/apt install *", "/usr/bin/apt update"] %}
|
|
35
|
+
{%- set allowed_admin_commands = ["/usr/bin/apt install *", "/usr/bin/apt update", "sudo chown -R app:app /cache/model", "sudo chown -R app:app /cache/org"] %}
|
|
34
36
|
RUN echo "Defaults:{{ app_username }} passwd_tries=0\n{{ app_username }} ALL=(root) NOPASSWD: {{ allowed_admin_commands | join(", ") }}" > /etc/sudoers.d/app-packages
|
|
35
37
|
RUN chmod 0440 /etc/sudoers.d/app-packages
|
|
36
38
|
{#- optional but good practice: check if the sudoers file is valid #}
|
|
truss/templates/control/control/endpoints.py
CHANGED
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Any, Callable, Dict
|
|
3
|
+
from typing import Any, Callable, Dict
|
|
4
4
|
|
|
5
5
|
import httpx
|
|
6
6
|
from fastapi import APIRouter, WebSocket
|
|
7
7
|
from fastapi.responses import JSONResponse, StreamingResponse
|
|
8
|
-
from httpx_ws import AsyncWebSocketSession, WebSocketDisconnect, aconnect_ws
|
|
9
8
|
from httpx_ws import _exceptions as httpx_ws_exceptions
|
|
9
|
+
from httpx_ws import aconnect_ws
|
|
10
10
|
from starlette.requests import ClientDisconnect, Request
|
|
11
11
|
from starlette.responses import Response
|
|
12
|
-
from starlette.websockets import WebSocketDisconnect as StartletteWebSocketDisconnect
|
|
13
12
|
from tenacity import RetryCallState, Retrying, retry_if_exception_type, wait_fixed
|
|
14
13
|
from wsproto.events import BytesMessage, TextMessage
|
|
15
14
|
|
|
@@ -30,15 +29,6 @@ BASE_RETRY_EXCEPTIONS = (
|
|
|
30
29
|
|
|
31
30
|
control_app = APIRouter()
|
|
32
31
|
|
|
33
|
-
WEBSOCKET_NORMAL_CLOSURE_CODE = 1000
|
|
34
|
-
WEBSOCKET_SERVER_ERROR_CODE = 1011
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class CloseableWebsocket(Protocol):
|
|
38
|
-
async def close(
|
|
39
|
-
self, code: int = WEBSOCKET_NORMAL_CLOSURE_CODE, reason: Optional[str] = None
|
|
40
|
-
) -> None: ...
|
|
41
|
-
|
|
42
32
|
|
|
43
33
|
@control_app.get("/")
|
|
44
34
|
def index():
|
|
@@ -128,79 +118,13 @@ def inference_retries(
|
|
|
128
118
|
yield attempt
|
|
129
119
|
|
|
130
120
|
|
|
131
|
-
async def _safe_close_ws(
|
|
132
|
-
ws: CloseableWebsocket,
|
|
133
|
-
logger: logging.Logger,
|
|
134
|
-
code: int,
|
|
135
|
-
reason: Optional[str] = None,
|
|
136
|
-
):
|
|
121
|
+
async def _safe_close_ws(ws: WebSocket, logger: logging.Logger):
|
|
137
122
|
try:
|
|
138
|
-
await ws.close(
|
|
123
|
+
await ws.close()
|
|
139
124
|
except RuntimeError as close_error:
|
|
140
125
|
logger.debug(f"Duplicate close of websocket: `{close_error}`.")
|
|
141
126
|
|
|
142
127
|
|
|
143
|
-
async def forward_to_server(
|
|
144
|
-
client_ws: WebSocket, server_ws: AsyncWebSocketSession
|
|
145
|
-
) -> None:
|
|
146
|
-
while True:
|
|
147
|
-
message = await client_ws.receive()
|
|
148
|
-
if message.get("type") == "websocket.disconnect":
|
|
149
|
-
raise StartletteWebSocketDisconnect(
|
|
150
|
-
message.get("code", 1000), message.get("reason")
|
|
151
|
-
)
|
|
152
|
-
if "text" in message:
|
|
153
|
-
await server_ws.send_text(message["text"])
|
|
154
|
-
elif "bytes" in message:
|
|
155
|
-
await server_ws.send_bytes(message["bytes"])
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
async def forward_to_client(client_ws: WebSocket, server_ws: AsyncWebSocketSession):
|
|
159
|
-
while True:
|
|
160
|
-
message = await server_ws.receive()
|
|
161
|
-
if isinstance(message, TextMessage):
|
|
162
|
-
await client_ws.send_text(message.data)
|
|
163
|
-
elif isinstance(message, BytesMessage):
|
|
164
|
-
await client_ws.send_bytes(message.data)
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
# NB(nikhil): _handle_websocket_forwarding uses some py311 specific syntax, but in newer
|
|
168
|
-
# versions of truss we're guaranteed to be running the control server with at least that version.
|
|
169
|
-
async def _handle_websocket_forwarding(
|
|
170
|
-
client_ws: WebSocket, server_ws: AsyncWebSocketSession
|
|
171
|
-
):
|
|
172
|
-
logger = client_ws.app.state.logger
|
|
173
|
-
try:
|
|
174
|
-
async with asyncio.TaskGroup() as tg: # type: ignore[attr-defined]
|
|
175
|
-
tg.create_task(forward_to_client(client_ws, server_ws))
|
|
176
|
-
tg.create_task(forward_to_server(client_ws, server_ws))
|
|
177
|
-
except ExceptionGroup as eg: # type: ignore[name-defined] # noqa: F821
|
|
178
|
-
# NB(nikhil): The first websocket proxy method to raise an error will
|
|
179
|
-
# be surfaced here, and that contains the information we want to forward to the
|
|
180
|
-
# other websocket. Further errors might raise as a result of cancellation, but we
|
|
181
|
-
# can safely ignore those.
|
|
182
|
-
exc = eg.exceptions[0]
|
|
183
|
-
if isinstance(exc, WebSocketDisconnect):
|
|
184
|
-
await _safe_close_ws(client_ws, logger, exc.code, exc.reason)
|
|
185
|
-
elif isinstance(exc, StartletteWebSocketDisconnect):
|
|
186
|
-
await _safe_close_ws(server_ws, logger, exc.code, exc.reason)
|
|
187
|
-
else:
|
|
188
|
-
logger.warning(f"Ungraceful websocket close: {exc}")
|
|
189
|
-
finally:
|
|
190
|
-
# NB(nikhil): In most common cases, both websockets would have been successfully
|
|
191
|
-
# closed with applicable codes above, these lines are just a failsafe.
|
|
192
|
-
await _safe_close_ws(client_ws, logger, code=WEBSOCKET_SERVER_ERROR_CODE)
|
|
193
|
-
await _safe_close_ws(server_ws, logger, code=WEBSOCKET_SERVER_ERROR_CODE)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
async def _attempt_websocket_proxy(
|
|
197
|
-
client_ws: WebSocket, proxy_client: httpx.AsyncClient, logger
|
|
198
|
-
):
|
|
199
|
-
async with aconnect_ws("/v1/websocket", proxy_client) as server_ws: # type: ignore
|
|
200
|
-
await client_ws.accept()
|
|
201
|
-
await _handle_websocket_forwarding(client_ws, server_ws)
|
|
202
|
-
|
|
203
|
-
|
|
204
128
|
async def proxy_ws(client_ws: WebSocket):
|
|
205
129
|
proxy_client: httpx.AsyncClient = client_ws.app.state.proxy_client
|
|
206
130
|
logger = client_ws.app.state.logger
|
|
@@ -208,10 +132,37 @@ async def proxy_ws(client_ws: WebSocket):
|
|
|
208
132
|
for attempt in inference_retries():
|
|
209
133
|
with attempt:
|
|
210
134
|
try:
|
|
211
|
-
|
|
135
|
+
async with aconnect_ws("/v1/websocket", proxy_client) as server_ws: # type: ignore
|
|
136
|
+
# Unfortunate, but FastAPI and httpx-ws have slightly different abstractions
|
|
137
|
+
# for sending data, so it's not easy to create a unified wrapper.
|
|
138
|
+
async def forward_to_server():
|
|
139
|
+
while True:
|
|
140
|
+
message = await client_ws.receive()
|
|
141
|
+
if message.get("type") == "websocket.disconnect":
|
|
142
|
+
break
|
|
143
|
+
if "text" in message:
|
|
144
|
+
await server_ws.send_text(message["text"])
|
|
145
|
+
elif "bytes" in message:
|
|
146
|
+
await server_ws.send_bytes(message["bytes"])
|
|
147
|
+
|
|
148
|
+
async def forward_to_client():
|
|
149
|
+
while True:
|
|
150
|
+
message = await server_ws.receive()
|
|
151
|
+
if message is None:
|
|
152
|
+
break
|
|
153
|
+
if isinstance(message, TextMessage):
|
|
154
|
+
await client_ws.send_text(message.data)
|
|
155
|
+
elif isinstance(message, BytesMessage):
|
|
156
|
+
await client_ws.send_bytes(message.data)
|
|
157
|
+
|
|
158
|
+
await client_ws.accept()
|
|
159
|
+
try:
|
|
160
|
+
await asyncio.gather(forward_to_client(), forward_to_server())
|
|
161
|
+
finally:
|
|
162
|
+
await _safe_close_ws(client_ws, logger)
|
|
212
163
|
except httpx_ws_exceptions.HTTPXWSException as e:
|
|
213
164
|
logger.warning(f"WebSocket connection rejected: {e}")
|
|
214
|
-
await _safe_close_ws(client_ws, logger
|
|
165
|
+
await _safe_close_ws(client_ws, logger)
|
|
215
166
|
break
|
|
216
167
|
|
|
217
168
|
|
|
truss/templates/server/entrypoint.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
set -e
|
|
3
|
+
|
|
4
|
+
# Fix ownership of cache directories if they exist
|
|
5
|
+
if [ -d "/cache/model" ]; then
|
|
6
|
+
echo "Setting ownership for /cache/model"
|
|
7
|
+
sudo chown -R app:app /cache/model
|
|
8
|
+
fi
|
|
9
|
+
|
|
10
|
+
if [ -d "/cache/org" ]; then
|
|
11
|
+
echo "Setting ownership for /cache/org"
|
|
12
|
+
sudo chown -R app:app /cache/org
|
|
13
|
+
fi
|
|
14
|
+
|
|
15
|
+
# Execute the original command
|
|
16
|
+
exec "$@"
|
|
truss/templates/server/truss_server.py
CHANGED
|
@@ -76,7 +76,7 @@ async def parse_body(request: Request) -> bytes:
|
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
async def _safe_close_websocket(
|
|
79
|
-
ws: WebSocket,
|
|
79
|
+
ws: WebSocket, reason: Optional[str], status_code: int = 1000
|
|
80
80
|
) -> None:
|
|
81
81
|
try:
|
|
82
82
|
await ws.close(code=status_code, reason=reason)
|
|
@@ -257,16 +257,14 @@ class BasetenEndpoints:
|
|
|
257
257
|
try:
|
|
258
258
|
await ws.accept()
|
|
259
259
|
await self._model.websocket(ws)
|
|
260
|
-
await _safe_close_websocket(ws, status_code=1000
|
|
260
|
+
await _safe_close_websocket(ws, None, status_code=1000)
|
|
261
261
|
except WebSocketDisconnect as ws_error:
|
|
262
262
|
logging.info(
|
|
263
263
|
f"Client terminated websocket connection: `{ws_error}`."
|
|
264
264
|
)
|
|
265
265
|
except Exception:
|
|
266
266
|
await _safe_close_websocket(
|
|
267
|
-
ws,
|
|
268
|
-
status_code=errors.WEBSOCKET_SERVER_ERROR_CODE,
|
|
269
|
-
reason=errors.MODEL_ERROR_MESSAGE,
|
|
267
|
+
ws, errors.MODEL_ERROR_MESSAGE, status_code=1011
|
|
270
268
|
)
|
|
271
269
|
raise # Re raise to let `intercept_exceptions` deal with it.
|
|
272
270
|
|
|
truss/templates/server.Dockerfile.jinja
CHANGED
|
@@ -141,9 +141,11 @@ ENTRYPOINT ["/control/.env/bin/python", "/control/control/server.py"]
|
|
|
141
141
|
|
|
142
142
|
{%- else %} {#- else (default inference server) #}
|
|
143
143
|
ENV INFERENCE_SERVER_PORT="8080"
|
|
144
|
-
ENV SERVER_START_CMD="{{ python_executable }} /app/main.py"
|
|
144
|
+
ENV SERVER_START_CMD="/app/entrypoint.sh {{ python_executable }} /app/main.py"
|
|
145
|
+
COPY --chown={{ default_owner }} ./server/entrypoint.sh /app/entrypoint.sh
|
|
146
|
+
RUN chmod +x /app/entrypoint.sh
|
|
145
147
|
{{ chown_and_switch_to_regular_user_if_enabled() }}
|
|
146
|
-
ENTRYPOINT ["{{ python_executable }}", "/app/main.py"]
|
|
148
|
+
ENTRYPOINT ["/app/entrypoint.sh", "{{ python_executable }}", "/app/main.py"]
|
|
147
149
|
{%- endif %} {#- endif config.docker_server / live_reload #}
|
|
148
150
|
|
|
149
151
|
{% endblock %} {#- endblock run #}
|
|
truss/tests/cli/train/test_deploy_checkpoints.py
CHANGED
|
@@ -584,7 +584,7 @@ def test_get_checkpoint_ids_to_deploy_full_checkpoints():
|
|
|
584
584
|
mock_checkbox.assert_called_once()
|
|
585
585
|
assert (
|
|
586
586
|
mock_checkbox.call_args[1]["message"]
|
|
587
|
-
== "
|
|
587
|
+
== "Select the checkpoint to deploy. Use spacebar to select/deselect."
|
|
588
588
|
)
|
|
589
589
|
assert mock_checkbox.call_args[1]["choices"] == checkpoint_options
|
|
590
590
|
|
|
@@ -621,7 +621,7 @@ def test_get_checkpoint_ids_to_deploy_lora_checkpoints():
|
|
|
621
621
|
mock_checkbox.assert_called_once()
|
|
622
622
|
assert (
|
|
623
623
|
mock_checkbox.call_args[1]["message"]
|
|
624
|
-
== "
|
|
624
|
+
== "Select the checkpoint to deploy. Use spacebar to select/deselect."
|
|
625
625
|
)
|
|
626
626
|
assert mock_checkbox.call_args[1]["choices"] == checkpoint_options
|
|
627
627
|
|
|
@@ -656,7 +656,7 @@ def test_get_checkpoint_ids_to_deploy_mixed_checkpoints():
|
|
|
656
656
|
mock_checkbox.assert_called_once()
|
|
657
657
|
assert (
|
|
658
658
|
mock_checkbox.call_args[1]["message"]
|
|
659
|
-
== "
|
|
659
|
+
== "Select the checkpoint to deploy. Use spacebar to select/deselect."
|
|
660
660
|
)
|
|
661
661
|
assert mock_checkbox.call_args[1]["choices"] == checkpoint_options
|
|
662
662
|
|
|
truss/tests/templates/control/control/test_endpoints.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import
|
|
2
|
-
from unittest.mock import AsyncMock, MagicMock, call, patch
|
|
1
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
3
2
|
|
|
4
3
|
import pytest
|
|
5
4
|
from fastapi import FastAPI, WebSocket
|
|
@@ -32,38 +31,33 @@ def client_ws(app):
|
|
|
32
31
|
|
|
33
32
|
@pytest.mark.asyncio
|
|
34
33
|
async def test_proxy_ws_bidirectional_messaging(client_ws):
|
|
35
|
-
|
|
36
|
-
client_ws.receive =
|
|
34
|
+
"""Test that both directions of communication work and clean up properly"""
|
|
35
|
+
client_ws.receive.side_effect = [
|
|
36
|
+
{"type": "websocket.receive", "text": "msg1"},
|
|
37
|
+
{"type": "websocket.receive", "text": "msg2"},
|
|
38
|
+
{"type": "websocket.disconnect"},
|
|
39
|
+
]
|
|
37
40
|
|
|
38
|
-
server_queue = asyncio.Queue()
|
|
39
41
|
mock_server_ws = AsyncMock(spec=AsyncWebSocketSession)
|
|
40
|
-
mock_server_ws.receive =
|
|
42
|
+
mock_server_ws.receive.side_effect = [
|
|
43
|
+
TextMessage(data="response1"),
|
|
44
|
+
TextMessage(data="response2"),
|
|
45
|
+
None, # server closing connection
|
|
46
|
+
]
|
|
41
47
|
mock_server_ws.__aenter__.return_value = mock_server_ws
|
|
42
48
|
mock_server_ws.__aexit__.return_value = None
|
|
43
49
|
|
|
44
|
-
client_queue.put_nowait({"type": "websocket.receive", "text": "msg1"})
|
|
45
|
-
client_queue.put_nowait({"type": "websocket.receive", "text": "msg2"})
|
|
46
|
-
server_queue.put_nowait(TextMessage(data="response1"))
|
|
47
|
-
server_queue.put_nowait(TextMessage(data="response2"))
|
|
48
|
-
|
|
49
50
|
with patch(
|
|
50
51
|
"truss.templates.control.control.endpoints.aconnect_ws",
|
|
51
52
|
return_value=mock_server_ws,
|
|
52
53
|
):
|
|
53
|
-
|
|
54
|
-
client_queue.put_nowait(
|
|
55
|
-
{"type": "websocket.disconnect", "code": 1002, "reason": "test-closure"}
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
await proxy_task
|
|
54
|
+
await proxy_ws(client_ws)
|
|
59
55
|
|
|
60
56
|
assert mock_server_ws.send_text.call_count == 2
|
|
61
57
|
assert mock_server_ws.send_text.call_args_list == [(("msg1",),), (("msg2",),)]
|
|
62
58
|
assert client_ws.send_text.call_count == 2
|
|
63
59
|
assert client_ws.send_text.call_args_list == [(("response1",),), (("response2",),)]
|
|
64
|
-
|
|
65
|
-
assert mock_server_ws.close.call_args_list[0] == call(1002, "test-closure")
|
|
66
|
-
client_ws.close.assert_called()
|
|
60
|
+
client_ws.close.assert_called_once()
|
|
67
61
|
|
|
68
62
|
|
|
69
63
|
@pytest.mark.asyncio
|