truss 0.11.2rc503__py3-none-any.whl → 0.11.2rc505__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of truss might be problematic. Click here for more details.

Files changed (33) hide show
  1. truss/base/constants.py +3 -0
  2. truss/cli/chains_commands.py +20 -7
  3. truss/cli/train/core.py +156 -0
  4. truss/cli/train/deploy_checkpoints/deploy_checkpoints.py +1 -1
  5. truss/cli/train_commands.py +72 -0
  6. truss/templates/base.Dockerfile.jinja +1 -3
  7. truss/templates/control/control/endpoints.py +82 -33
  8. truss/templates/control/control/helpers/truss_patch/model_container_patch_applier.py +3 -20
  9. truss/templates/control/requirements.txt +1 -1
  10. truss/templates/server/common/errors.py +1 -0
  11. truss/templates/server/truss_server.py +5 -3
  12. truss/templates/server.Dockerfile.jinja +2 -4
  13. truss/templates/train/config.py +46 -0
  14. truss/templates/train/run.sh +11 -0
  15. truss/tests/cli/train/test_deploy_checkpoints.py +3 -3
  16. truss/tests/cli/train/test_train_init.py +499 -0
  17. truss/tests/patch/test_calc_patch.py +14 -26
  18. truss/tests/templates/control/control/test_endpoints.py +20 -14
  19. truss/tests/test_control_truss_patching.py +0 -17
  20. truss/truss_handle/patch/calc_patch.py +5 -20
  21. {truss-0.11.2rc503.dist-info → truss-0.11.2rc505.dist-info}/METADATA +1 -1
  22. {truss-0.11.2rc503.dist-info → truss-0.11.2rc505.dist-info}/RECORD +32 -29
  23. truss_chains/deployment/code_gen.py +5 -1
  24. truss_chains/deployment/deployment_client.py +45 -7
  25. truss_chains/public_types.py +6 -3
  26. truss_chains/remote_chainlet/utils.py +46 -7
  27. truss_train/__init__.py +4 -0
  28. truss_train/definitions.py +47 -2
  29. truss_train/restore_from_checkpoint.py +42 -0
  30. truss/templates/server/entrypoint.sh +0 -32
  31. {truss-0.11.2rc503.dist-info → truss-0.11.2rc505.dist-info}/WHEEL +0 -0
  32. {truss-0.11.2rc503.dist-info → truss-0.11.2rc505.dist-info}/entry_points.txt +0 -0
  33. {truss-0.11.2rc503.dist-info → truss-0.11.2rc505.dist-info}/licenses/LICENSE +0 -0
truss/base/constants.py CHANGED
@@ -18,6 +18,7 @@ SHARED_SERVING_AND_TRAINING_CODE_DIR: pathlib.Path = (
18
18
  CONTROL_SERVER_CODE_DIR: pathlib.Path = TEMPLATES_DIR / "control"
19
19
  CHAINS_CODE_DIR: pathlib.Path = _TRUSS_ROOT.parent / "truss-chains" / "truss_chains"
20
20
  TRUSS_CODE_DIR: pathlib.Path = _TRUSS_ROOT.parent / "truss"
21
+ TRAINING_TEMPLATE_DIR = TEMPLATES_DIR / "train"
21
22
  # Must be sorted ascendingly.
22
23
  SUPPORTED_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
23
24
 
@@ -84,3 +85,5 @@ OPENAI_NON_COMPATIBLE_TAG = "force-legacy-api-non-openai-compatible" # deprecat
84
85
  PRODUCTION_ENVIRONMENT_NAME = "production"
85
86
 
86
87
  TRUSS_BASE_IMAGE_NAME = "baseten/truss-server-base"
88
+
89
+ DEFAULT_TRAINING_CHECKPOINT_FOLDER = "/tmp/loaded_checkpoints"
@@ -43,18 +43,31 @@ def _load_example_chainlet_code() -> str:
43
43
  return source
44
44
 
45
45
 
46
- def _make_chains_curl_snippet(run_remote_url: str, environment: Optional[str]) -> str:
46
+ def _make_chains_curl_snippet(
47
+ run_remote_url: str, environment: Optional[str], is_websocket: bool = False
48
+ ) -> str:
47
49
  if environment:
48
50
  idx = run_remote_url.find("deployment")
49
51
  if idx != -1:
50
52
  run_remote_url = (
51
53
  run_remote_url[:idx] + f"environments/{environment}/run_remote"
52
54
  )
53
- return (
54
- f"curl -X POST '{run_remote_url}' \\\n"
55
- ' -H "Authorization: Api-Key $BASETEN_API_KEY" \\\n'
56
- " -d '<JSON_INPUT>'"
57
- )
55
+
56
+ if is_websocket:
57
+ # Replace 'run_remote' with 'websocket' for websocket endpoints
58
+ websocket_url = run_remote_url.replace("run_remote", "websocket").replace(
59
+ "https", "wss"
60
+ )
61
+ return (
62
+ f'websocat -H="Authorization: Api-Key $BASETEN_API_KEY" \\\n'
63
+ f" {websocket_url}"
64
+ )
65
+ else:
66
+ return (
67
+ f"curl -X POST '{run_remote_url}' \\\n"
68
+ ' -H "Authorization: Api-Key $BASETEN_API_KEY" \\\n'
69
+ " -d '<JSON_INPUT>'"
70
+ )
58
71
 
59
72
 
60
73
  def _create_chains_table(service) -> Tuple[rich.table.Table, List[str]]:
@@ -281,7 +294,7 @@ def push_chain(
281
294
 
282
295
  assert isinstance(service, deployment_client.BasetenChainService)
283
296
  curl_snippet = _make_chains_curl_snippet(
284
- service.run_remote_url, options.environment
297
+ service.run_remote_url, options.environment, service.is_websocket
285
298
  )
286
299
 
287
300
  table, statuses = _create_chains_table(service)
truss/cli/train/core.py CHANGED
@@ -1,3 +1,4 @@
1
+ import base64
1
2
  import json
2
3
  import os
3
4
  import tarfile
@@ -8,6 +9,7 @@ from pathlib import Path
8
9
  from typing import Any, Callable, Dict, Optional, Tuple
9
10
 
10
11
  import click
12
+ import requests
11
13
  import rich
12
14
  from InquirerPy import inquirer
13
15
  from rich.text import Text
@@ -355,6 +357,7 @@ def download_training_job_data(
355
357
  temp_path.write_bytes(content)
356
358
 
357
359
  unzip_dir = output_dir / artifact_base_name
360
+ unzip_dir = Path(str(unzip_dir).replace(" ", "-"))
358
361
  if unzip_dir.exists():
359
362
  raise click.ClickException(
360
363
  f"Directory '{unzip_dir}' already exists. "
@@ -367,6 +370,7 @@ def download_training_job_data(
367
370
 
368
371
  return unzip_dir
369
372
  else:
373
+ target_path = Path(str(target_path).replace(" ", "-"))
370
374
  target_path.write_bytes(content)
371
375
  return target_path
372
376
 
@@ -417,6 +421,158 @@ def status_page_url(remote_url: str, training_job_id: str) -> str:
417
421
  return f"{remote_url}/training/jobs/{training_job_id}"
418
422
 
419
423
 
424
def _get_all_train_init_example_options(
    repo_id: str = "ml-cookbook",
    examples_subdir: str = "examples",
    token: Optional[str] = None,
) -> list[str]:
    """
    List the example directories that can be copied locally for training
    initialization.

    Queries the GitHub contents API for `examples_subdir` inside the
    `basetenlabs/<repo_id>` repository and keeps the entries whose type is
    "dir".

    Args:
        repo_id: Repository name under the `basetenlabs` organization.
        examples_subdir: Directory inside the repository holding the examples.
        token: Optional token, sent as an `Authorization: token <token>` header.

    Returns:
        The names of the example directories. An empty list is returned when
        the request fails; the error is echoed to the user, not raised.
    """
    headers = {"Authorization": f"token {token}"} if token else {}
    url = (
        f"https://api.github.com/repos/basetenlabs/{repo_id}/contents/{examples_subdir}"
    )
    try:
        # `requests` applies no timeout unless asked to; bound the wait so an
        # unresponsive network cannot hang the CLI forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        items = response.json()
    except requests.exceptions.RequestException as e:
        click.echo(
            f"Error exploring directory: {e}. Please file an issue at https://github.com/basetenlabs/truss/issues"
        )
        return []

    # A non-list payload is wrapped so the filtering below handles both shapes.
    if not isinstance(items, list):
        items = [items]
    # `.get` tolerates entries that carry no "type" key instead of raising.
    return [item["name"] for item in items if item.get("type") == "dir"]
456
+
457
+
458
+ def _get_train_init_example_info(
459
+ repo_id: str = "ml-cookbook",
460
+ examples_subdir: str = "examples",
461
+ example_name: Optional[str] = None,
462
+ token: Optional[str] = None,
463
+ ) -> list[Dict[str, str]]:
464
+ """
465
+ Retrieve directory download links for the example from the ml-cookbook repository to
466
+ copy locally for training initialization.
467
+ """
468
+ headers = {}
469
+ if token:
470
+ headers["Authorization"] = f"token {token}"
471
+
472
+ url = f"https://api.github.com/repos/basetenlabs/{repo_id}/contents/{examples_subdir}/{example_name}"
473
+
474
+ try:
475
+ response = requests.get(url, headers=headers)
476
+ response.raise_for_status()
477
+
478
+ items = response.json()
479
+ if not isinstance(items, list):
480
+ items = [items]
481
+ return items
482
+
483
+ except requests.exceptions.HTTPError as e:
484
+ if response.status_code == 404:
485
+ # example_name does not exist, return empty list
486
+ return []
487
+ else:
488
+ # Other HTTP errors
489
+ click.echo(
490
+ f"Error exploring directory: {e}. Please file an issue at https://github.com/basetenlabs/truss/issues"
491
+ )
492
+ return []
493
+ except requests.exceptions.RequestException as e:
494
+ # Network or other request errors
495
+ click.echo(
496
+ f"Error exploring directory: {e}. Please file an issue at https://github.com/basetenlabs/truss/issues"
497
+ )
498
+ return []
499
+
500
+
501
def download_git_directory(
    git_api_url: str, local_dir: str, token: Optional[str] = None
) -> bool:
    """
    Recursively download directory contents from a git API URL.

    Special handling for a 'training' directory: when the listing contains a
    directory named 'training', only its contents are downloaded, directly
    into `local_dir` (no 'training' subdirectory is created and the sibling
    entries are not downloaded).

    Args:
        git_api_url (str): Example format "https://api.github.com/repos/basetenlabs/ml-cookbook/contents/examples/llama-finetune-8b-lora?ref=main"
        local_dir (str): Local directory to download this directory to.
        token (Optional[str]): Token sent as `Authorization: token <token>` on
            the API requests, including the ones made for nested directories.

    Returns:
        True when the listing and every nested directory were processed
        without an error, False otherwise. Errors are printed, not raised.
    """
    request_timeout_s = 30  # `requests` never times out unless told to.
    headers = {"Authorization": f"token {token}"} if token else {}
    try:
        response = requests.get(git_api_url, headers=headers, timeout=request_timeout_s)
        response.raise_for_status()
        items = response.json()

        # Handle single file case
        if not isinstance(items, list):
            items = [items]

        print(f"Creating directory {local_dir}")
        os.makedirs(local_dir, exist_ok=True)

        training_dir = next(
            (
                item
                for item in items
                if item["name"] == "training" and item["type"] == "dir"
            ),
            None,
        )
        if training_dir:
            print(
                f"📁 Found training directory, downloading its contents to {local_dir}"
            )
            # Forward the token: dropping it would make nested requests
            # unauthenticated even though the caller supplied credentials.
            return download_git_directory(training_dir["url"], local_dir, token)

        all_succeeded = True
        for item in items:
            item_name = item["name"]
            local_item_path = os.path.join(local_dir, item_name)

            if item["type"] == "file":
                print(f"📄 Downloading {item_name}")
                if item.get("download_url"):
                    # Download file directly
                    file_response = requests.get(
                        item["download_url"], timeout=request_timeout_s
                    )
                    file_response.raise_for_status()
                    with open(local_item_path, "wb") as f:
                        f.write(file_response.content)
                elif item.get("content"):
                    # Decode base64 content (for small files). A decode
                    # failure is reported and skipped: best effort by design.
                    try:
                        content = base64.b64decode(item["content"])
                        with open(local_item_path, "wb") as f:
                            f.write(content)
                    except Exception as e:
                        print(f"⚠️ Could not decode {item_name}: {e}")
            elif item["type"] == "dir":
                print(f"📁 Entering directory {item_name}")
                # Use the API URL from the response for subdirectories, and
                # remember a failure so the overall result is not reported
                # as a success when part of the tree is missing.
                if not download_git_directory(item["url"], local_item_path, token):
                    all_succeeded = False
        return all_succeeded
    except Exception as e:
        print(f"Error processing response: {e}")
        return False
574
+
575
+
420
576
  def fetch_project_by_name_or_id(
421
577
  remote_provider: BasetenRemote, project_identifier: str
422
578
  ) -> dict:
@@ -299,7 +299,7 @@ def _get_checkpoint_ids_to_deploy(
299
299
  def _select_multiple_checkpoints(checkpoint_id_options: List[str]) -> List[str]:
300
300
  """Select multiple checkpoints using interactive checkbox."""
301
301
  checkpoint_ids = inquirer.checkbox(
302
- message="Select the checkpoint to deploy. Use spacebar to select/deselect.",
302
+ message="Use spacebar to select/deselect checkpoints to deploy. Press enter when done.",
303
303
  choices=checkpoint_id_options,
304
304
  ).execute()
305
305
 
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import sys
2
3
  from pathlib import Path
3
4
  from typing import Optional, cast
@@ -5,6 +6,7 @@ from typing import Optional, cast
5
6
  import rich_click as click
6
7
 
7
8
  import truss.cli.train.core as train_cli
9
+ from truss.base.constants import TRAINING_TEMPLATE_DIR
8
10
  from truss.cli import remote_cli
9
11
  from truss.cli.cli import push, truss_cli
10
12
  from truss.cli.logs import utils as cli_log_utils
@@ -25,6 +27,7 @@ from truss.cli.utils.output import console, error_console
25
27
  from truss.remote.baseten.core import get_training_job_logs_with_pagination
26
28
  from truss.remote.baseten.remote import BasetenRemote
27
29
  from truss.remote.remote_factory import RemoteFactory
30
+ from truss.util.path import copy_tree_path
28
31
  from truss_train import TrainingJob
29
32
 
30
33
 
@@ -381,6 +384,75 @@ def download_checkpoint_artifacts(job_id: Optional[str], remote: Optional[str])
381
384
  sys.exit(1)
382
385
 
383
386
 
387
@train.command(name="init")
@click.option("--list-examples", is_flag=True, help="List all available examples.")
@click.option("--target-directory", type=str, required=False)
@click.option("--examples", type=str, required=False)
@common.common_options()
def init_training_job(
    list_examples: Optional[bool],
    target_directory: Optional[str],
    examples: Optional[str],
) -> None:
    """Initialize a training project from the template or from named examples."""
    try:
        if list_examples:
            all_examples = train_cli._get_all_train_init_example_options()
            console.print("Available training examples:", style="bold")
            for example in all_examples:
                console.print(f"- {example}")
            console.print(
                "To launch, run `truss train init --examples <example1,example2>`",
                style="bold",
            )
            return

        # Tolerate "a, b" and trailing commas: surrounding whitespace and empty
        # entries would otherwise be looked up as (non-existent or wrong)
        # example names.
        selected_options = (
            [name.strip() for name in examples.split(",") if name.strip()]
            if examples
            else []
        )

        # No examples selected, initialize empty training project structure
        if not selected_options:
            if target_directory is None:
                target_directory = "truss-train-init"
            console.print(f"Initializing empty training project at {target_directory}")
            # No `exist_ok`: an existing directory raises and is reported by
            # the handler below.
            os.makedirs(target_directory)
            copy_tree_path(Path(TRAINING_TEMPLATE_DIR), Path(target_directory))
            console.print(
                f"✨ Empty training project initialized at {target_directory}",
                style="bold green",
            )
            return

        if target_directory is None:
            target_directory = os.getcwd()
        for example_to_download in selected_options:
            download_info = train_cli._get_train_init_example_info(
                example_name=example_to_download
            )
            local_dir = os.path.join(target_directory, example_to_download)

            if not download_info:
                all_examples = train_cli._get_all_train_init_example_options()
                error_console.print(
                    f"Example {example_to_download} not found in the ml-cookbook repository. Examples have to be one or more comma separated values from: {', '.join(all_examples)}"
                )
                continue
            success = train_cli.download_git_directory(
                git_api_url=download_info[0]["url"], local_dir=local_dir
            )
            if success:
                console.print(
                    f"✨ Training directory for {example_to_download} initialized at {local_dir}",
                    style="bold green",
                )
            else:
                error_console.print(
                    f"Failed to initialize training artifacts to {local_dir}"
                )

    except Exception as e:
        error_console.print(f"Failed to initialize training artifacts: {str(e)}")
        sys.exit(1)
454
+
455
+
384
456
  @train.group(name="cache")
385
457
  def cache():
386
458
  """Cache-related subcommands for truss train"""
@@ -18,8 +18,6 @@ ENV PYTHON_EXECUTABLE="{{ python_executable }}"
18
18
  ENV HOME=/home/{{ app_username }}
19
19
  {# Directory containing inference server code. #}
20
20
  ENV APP_HOME=/{{ app_username }}
21
- {# Directory for truss-transfer cache #}
22
- ENV TRUSS_TRANSFER_CACHE_DIR=${APP_HOME}/.cache/truss_transfer
23
21
  RUN mkdir -p ${APP_HOME} {{ control_server_dir }}
24
22
  {# Create a non-root user to run model containers. #}
25
23
  RUN useradd -u {{ app_user_uid }} -ms /bin/bash {{ app_username }}
@@ -32,7 +30,7 @@ ENV DEBIAN_FRONTEND=noninteractive
32
30
  {# to allow the non-root user to install packages. #}
33
31
  {%- if non_root_user and enable_model_container_admin_commands %}
34
32
  RUN apt update && apt install -y sudo
35
- {%- set allowed_admin_commands = ["/usr/bin/apt install *", "/usr/bin/apt update", "/usr/bin/chown *"] %}
33
+ {%- set allowed_admin_commands = ["/usr/bin/apt install *", "/usr/bin/apt update"] %}
36
34
  RUN echo "Defaults:{{ app_username }} passwd_tries=0\n{{ app_username }} ALL=(root) NOPASSWD: {{ allowed_admin_commands | join(", ") }}" > /etc/sudoers.d/app-packages
37
35
  RUN chmod 0440 /etc/sudoers.d/app-packages
38
36
  {#- optional but good practice: check if the sudoers file is valid #}
@@ -1,14 +1,15 @@
1
1
  import asyncio
2
2
  import logging
3
- from typing import Any, Callable, Dict
3
+ from typing import Any, Callable, Dict, Optional, Protocol
4
4
 
5
5
  import httpx
6
6
  from fastapi import APIRouter, WebSocket
7
7
  from fastapi.responses import JSONResponse, StreamingResponse
8
+ from httpx_ws import AsyncWebSocketSession, WebSocketDisconnect, aconnect_ws
8
9
  from httpx_ws import _exceptions as httpx_ws_exceptions
9
- from httpx_ws import aconnect_ws
10
10
  from starlette.requests import ClientDisconnect, Request
11
11
  from starlette.responses import Response
12
+ from starlette.websockets import WebSocketDisconnect as StartletteWebSocketDisconnect
12
13
  from tenacity import RetryCallState, Retrying, retry_if_exception_type, wait_fixed
13
14
  from wsproto.events import BytesMessage, TextMessage
14
15
 
@@ -29,6 +30,15 @@ BASE_RETRY_EXCEPTIONS = (
29
30
 
30
31
  control_app = APIRouter()
31
32
 
33
# Close codes passed to websocket `close()` calls in this module.
WEBSOCKET_NORMAL_CLOSURE_CODE = 1000
WEBSOCKET_SERVER_ERROR_CODE = 1011


class CloseableWebsocket(Protocol):
    """Structural type: any object offering an awaitable `close(code, reason)`."""

    async def close(
        self,
        code: int = WEBSOCKET_NORMAL_CLOSURE_CODE,
        reason: Optional[str] = None,
    ) -> None:
        ...
41
+
32
42
 
33
43
  @control_app.get("/")
34
44
  def index():
@@ -118,13 +128,79 @@ def inference_retries(
118
128
  yield attempt
119
129
 
120
130
 
121
async def _safe_close_ws(
    ws: CloseableWebsocket,
    logger: logging.Logger,
    code: int,
    reason: Optional[str] = None,
):
    """Close `ws` with `code` and `reason`; a `RuntimeError` raised by the close
    call is logged at debug level instead of propagating."""
    try:
        await ws.close(code, reason)
    except RuntimeError as err:
        logger.debug(f"Duplicate close of websocket: `{err}`.")
126
141
 
127
142
 
143
async def forward_to_server(
    client_ws: WebSocket, server_ws: AsyncWebSocketSession
) -> None:
    """Relay every message received from `client_ws` to `server_ws`.

    Loops until the client reports a disconnect.

    Raises:
        StartletteWebSocketDisconnect: When a "websocket.disconnect" message is
            received, carrying the message's code (normal closure when absent)
            and reason.
    """
    while True:
        message = await client_ws.receive()
        if message.get("type") == "websocket.disconnect":
            raise StartletteWebSocketDisconnect(
                message.get("code", WEBSOCKET_NORMAL_CLOSURE_CODE),
                message.get("reason"),
            )
        # Check the values, not key presence: an ASGI receive event may carry
        # both keys with one of them set to None. A membership test would then
        # forward `None` as text and drop the binary payload.
        text = message.get("text")
        if text is not None:
            await server_ws.send_text(text)
            continue
        data = message.get("bytes")
        if data is not None:
            await server_ws.send_bytes(data)
156
+
157
+
158
async def forward_to_client(client_ws: WebSocket, server_ws: AsyncWebSocketSession):
    """Relay every text or bytes message received from `server_ws` to `client_ws`.

    Loops until `receive` or a send raises; other event types are ignored.
    """
    while True:
        event = await server_ws.receive()
        if isinstance(event, TextMessage):
            await client_ws.send_text(event.data)
            continue
        if isinstance(event, BytesMessage):
            await client_ws.send_bytes(event.data)
165
+
166
+
167
# NB(nikhil): _handle_websocket_forwarding relies on py311-only constructs, but newer
# versions of truss guarantee the control server runs with at least that version.
async def _handle_websocket_forwarding(
    client_ws: WebSocket, server_ws: AsyncWebSocketSession
):
    """Run both forwarding loops concurrently and close both websockets afterwards.

    When the task group fails, the first collected exception decides which
    socket is closed and with which code/reason; anything else is logged.
    """
    logger = client_ws.app.state.logger
    try:
        async with asyncio.TaskGroup() as tasks:  # type: ignore[attr-defined]
            tasks.create_task(forward_to_client(client_ws, server_ws))
            tasks.create_task(forward_to_server(client_ws, server_ws))
    except ExceptionGroup as group:  # type: ignore[name-defined] # noqa: F821
        # NB(nikhil): The first forwarding loop to fail is the one surfaced
        # here; it holds the details to pass on to the other websocket. Later
        # errors may stem from cancellation and can safely be ignored.
        first_error = group.exceptions[0]
        if isinstance(first_error, WebSocketDisconnect):
            await _safe_close_ws(
                client_ws, logger, first_error.code, first_error.reason
            )
        elif isinstance(first_error, StartletteWebSocketDisconnect):
            await _safe_close_ws(
                server_ws, logger, first_error.code, first_error.reason
            )
        else:
            logger.warning(f"Ungraceful websocket close: {first_error}")
    finally:
        # NB(nikhil): Usually both websockets were already closed above with
        # the applicable codes; this is only a failsafe.
        for ws in (client_ws, server_ws):
            await _safe_close_ws(ws, logger, code=WEBSOCKET_SERVER_ERROR_CODE)
194
+
195
+
196
async def _attempt_websocket_proxy(
    client_ws: WebSocket, proxy_client: httpx.AsyncClient, logger
):
    """Connect to `/v1/websocket` through `proxy_client`, then accept the client
    websocket and forward traffic between the two.

    `logger` is accepted but not used here.
    """
    upstream = aconnect_ws("/v1/websocket", proxy_client)  # type: ignore
    async with upstream as server_ws:
        # The client is accepted only once the upstream connection is open.
        await client_ws.accept()
        await _handle_websocket_forwarding(client_ws, server_ws)
202
+
203
+
128
204
  async def proxy_ws(client_ws: WebSocket):
129
205
  proxy_client: httpx.AsyncClient = client_ws.app.state.proxy_client
130
206
  logger = client_ws.app.state.logger
@@ -132,37 +208,10 @@ async def proxy_ws(client_ws: WebSocket):
132
208
  for attempt in inference_retries():
133
209
  with attempt:
134
210
  try:
135
- async with aconnect_ws("/v1/websocket", proxy_client) as server_ws: # type: ignore
136
- # Unfortunate, but FastAPI and httpx-ws have slightly different abstractions
137
- # for sending data, so it's not easy to create a unified wrapper.
138
- async def forward_to_server():
139
- while True:
140
- message = await client_ws.receive()
141
- if message.get("type") == "websocket.disconnect":
142
- break
143
- if "text" in message:
144
- await server_ws.send_text(message["text"])
145
- elif "bytes" in message:
146
- await server_ws.send_bytes(message["bytes"])
147
-
148
- async def forward_to_client():
149
- while True:
150
- message = await server_ws.receive()
151
- if message is None:
152
- break
153
- if isinstance(message, TextMessage):
154
- await client_ws.send_text(message.data)
155
- elif isinstance(message, BytesMessage):
156
- await client_ws.send_bytes(message.data)
157
-
158
- await client_ws.accept()
159
- try:
160
- await asyncio.gather(forward_to_client(), forward_to_server())
161
- finally:
162
- await _safe_close_ws(client_ws, logger)
211
+ await _attempt_websocket_proxy(client_ws, proxy_client, logger)
163
212
  except httpx_ws_exceptions.HTTPXWSException as e:
164
213
  logger.warning(f"WebSocket connection rejected: {e}")
165
- await _safe_close_ws(client_ws, logger)
214
+ await _safe_close_ws(client_ws, logger, WEBSOCKET_SERVER_ERROR_CODE)
166
215
  break
167
216
 
168
217
 
@@ -54,8 +54,9 @@ class ModelContainerPatchApplier:
54
54
  py_req_patch: PythonRequirementPatch = patch.body
55
55
  self._apply_python_requirement_patch(py_req_patch)
56
56
  elif isinstance(patch.body, SystemPackagePatch):
57
- sys_pkg_patch: SystemPackagePatch = patch.body
58
- self._apply_system_package_patch(sys_pkg_patch)
57
+ raise UnsupportedPatch(
58
+ "System package patches are not supported for model container, please run truss push again"
59
+ )
59
60
  elif isinstance(patch.body, ConfigPatch):
60
61
  config_patch: ConfigPatch = patch.body
61
62
  self._apply_config_patch(config_patch)
@@ -114,24 +115,6 @@ class ModelContainerPatchApplier:
114
115
  else:
115
116
  raise ValueError(f"Unknown python requirement patch action {action}")
116
117
 
117
- def _apply_system_package_patch(self, system_package_patch: SystemPackagePatch):
118
- self._app_logger.debug(
119
- f"Applying system package patch {system_package_patch.to_dict()}"
120
- )
121
- action = system_package_patch.action
122
-
123
- if action == Action.REMOVE:
124
- subprocess.run(
125
- ["apt", "remove", "-y", system_package_patch.package], check=True
126
- )
127
- elif action in [Action.ADD, Action.UPDATE]:
128
- subprocess.run(["apt", "update"], check=True)
129
- subprocess.run(
130
- ["apt", "install", "-y", system_package_patch.package], check=True
131
- )
132
- else:
133
- raise ValueError(f"Unknown python requirement patch action {action}")
134
-
135
118
  def _apply_config_patch(self, config_patch: ConfigPatch):
136
119
  self._app_logger.debug(f"Applying config patch {config_patch.to_dict()}")
137
120
  TrussConfig.from_dict(config_patch.config).write_to_yaml_file(
@@ -6,7 +6,7 @@ loguru>=0.7.2
6
6
  python-json-logger>=2.0.2
7
7
  tenacity>=8.1.0
8
8
  # To avoid divergence, this should follow the latest release.
9
- truss==0.9.100
9
+ truss==0.11.1
10
10
  uvicorn>=0.24.0
11
11
  uvloop>=0.19.0
12
12
  websockets>=10.0
@@ -18,6 +18,7 @@ _BASETEN_DOWNSTREAM_ERROR_CODE = 600
18
18
  _BASETEN_CLIENT_ERROR_CODE = 700
19
19
 
20
20
  MODEL_ERROR_MESSAGE = "Internal Server Error (in model/chainlet)."
21
+ WEBSOCKET_SERVER_ERROR_CODE = 1011
21
22
 
22
23
 
23
24
  class ModelMissingError(Exception):
@@ -76,7 +76,7 @@ async def parse_body(request: Request) -> bytes:
76
76
 
77
77
 
78
78
  async def _safe_close_websocket(
79
- ws: WebSocket, reason: Optional[str], status_code: int = 1000
79
+ ws: WebSocket, status_code: int = 1000, reason: Optional[str] = None
80
80
  ) -> None:
81
81
  try:
82
82
  await ws.close(code=status_code, reason=reason)
@@ -257,14 +257,16 @@ class BasetenEndpoints:
257
257
  try:
258
258
  await ws.accept()
259
259
  await self._model.websocket(ws)
260
- await _safe_close_websocket(ws, None, status_code=1000)
260
+ await _safe_close_websocket(ws, status_code=1000, reason=None)
261
261
  except WebSocketDisconnect as ws_error:
262
262
  logging.info(
263
263
  f"Client terminated websocket connection: `{ws_error}`."
264
264
  )
265
265
  except Exception:
266
266
  await _safe_close_websocket(
267
- ws, errors.MODEL_ERROR_MESSAGE, status_code=1011
267
+ ws,
268
+ status_code=errors.WEBSOCKET_SERVER_ERROR_CODE,
269
+ reason=errors.MODEL_ERROR_MESSAGE,
268
270
  )
269
271
  raise # Re raise to let `intercept_exceptions` deal with it.
270
272
 
@@ -141,11 +141,9 @@ ENTRYPOINT ["/control/.env/bin/python", "/control/control/server.py"]
141
141
 
142
142
  {%- else %} {#- else (default inference server) #}
143
143
  ENV INFERENCE_SERVER_PORT="8080"
144
- ENV SERVER_START_CMD="/app/entrypoint.sh {{ python_executable }} /app/main.py"
145
- COPY --chown={{ default_owner }} ./server/entrypoint.sh /app/entrypoint.sh
146
- RUN chmod +x /app/entrypoint.sh
144
+ ENV SERVER_START_CMD="{{ python_executable }} /app/main.py"
147
145
  {{ chown_and_switch_to_regular_user_if_enabled() }}
148
- ENTRYPOINT ["/app/entrypoint.sh", "{{ python_executable }}", "/app/main.py"]
146
+ ENTRYPOINT ["{{ python_executable }}", "/app/main.py"]
149
147
  {%- endif %} {#- endif config.docker_server / live_reload #}
150
148
 
151
149
  {% endblock %} {#- endblock run #}
@@ -0,0 +1,46 @@
1
# Building blocks from the Baseten Training SDK used by this template.
from truss_train import definitions
from truss.base import truss_config

PROJECT_NAME = "My-Baseten-Training-Project"
NUM_NODES = 1
NUM_GPUS_PER_NODE = 1

# Step 1 - base image for the training job. Private images can also be used
# via AWS IAM or GCP Service Account authentication.
BASE_IMAGE = "pytorch/pytorch:2.7.0-cuda12.8-cudnn9-runtime"

# Step 2 - runtime environment for the training job: start commands and
# environment variables. Secrets stored in the Baseten workspace (API keys,
# for instance) are referenced with `SecretReference`.
training_runtime = definitions.Runtime(
    # Example: the commands that launch your training script.
    start_commands=["/bin/sh -c 'chmod +x ./run.sh && ./run.sh'"],
    environment_variables={
        # "HF_TOKEN": definitions.SecretReference(name="hf_access_token"),
        "HELLO": "WORLD"
    },
    # Flip to True to enable caching between runs.
    cache_config=definitions.CacheConfig(enabled=False),
    # Flip to True to enable saving checkpoints on Baseten.
    checkpointing_config=definitions.CheckpointingConfig(enabled=False),
)

# Compute: node count and the accelerators attached to each node.
training_compute = definitions.Compute(
    node_count=NUM_NODES,
    accelerator=truss_config.AcceleratorSpec(
        accelerator=truss_config.Accelerator.H100,
        count=NUM_GPUS_PER_NODE,
    ),
)

# The job ties image, compute and runtime together.
training_job = definitions.TrainingJob(
    image=definitions.Image(base_image=BASE_IMAGE),
    compute=training_compute,
    runtime=training_runtime,
)

training_project = definitions.TrainingProject(name=PROJECT_NAME, job=training_job)