truss 0.10.0rc1__py3-none-any.whl → 0.60.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of truss might be problematic.
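A diff like the one below can be reproduced locally by downloading both wheels and comparing their extracted trees. The following is an illustrative sketch only, not part of the registry tooling; it assumes pip is on PATH, and note that filecmp.dircmp reports top-level differences only:

import filecmp
import pathlib
import subprocess
import tempfile
import zipfile

work = pathlib.Path(tempfile.mkdtemp())
for version in ("0.10.0rc1", "0.60.0"):
    # Fetch just the wheel (no dependencies) into its own directory.
    subprocess.run(
        ["pip", "download", f"truss=={version}", "--no-deps", "-d", str(work / version)],
        check=True,
    )
    # A wheel is a zip archive; extract it for comparison.
    wheel = next((work / version).glob("*.whl"))
    with zipfile.ZipFile(wheel) as zf:
        zf.extractall(work / f"src-{version}")

# Report top-level additions, removals, and changed files between the two trees.
cmp = filecmp.dircmp(str(work / "src-0.10.0rc1"), str(work / "src-0.60.0"))
print("removed:", cmp.left_only)
print("added:", cmp.right_only)
print("changed:", cmp.diff_files)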
- truss/__init__.py +10 -3
- truss/api/__init__.py +123 -0
- truss/api/definitions.py +51 -0
- truss/base/constants.py +116 -0
- truss/base/custom_types.py +29 -0
- truss/{errors.py → base/errors.py} +4 -0
- truss/base/trt_llm_config.py +310 -0
- truss/{truss_config.py → base/truss_config.py} +344 -31
- truss/{truss_spec.py → base/truss_spec.py} +20 -6
- truss/{validation.py → base/validation.py} +60 -11
- truss/cli/cli.py +841 -88
- truss/{remote → cli}/remote_cli.py +2 -7
- truss/contexts/docker_build_setup.py +67 -0
- truss/contexts/image_builder/cache_warmer.py +2 -8
- truss/contexts/image_builder/image_builder.py +1 -1
- truss/contexts/image_builder/serving_image_builder.py +292 -46
- truss/contexts/image_builder/util.py +1 -3
- truss/contexts/local_loader/docker_build_emulator.py +58 -0
- truss/contexts/local_loader/load_model_local.py +2 -2
- truss/contexts/local_loader/truss_module_loader.py +1 -1
- truss/contexts/local_loader/utils.py +1 -1
- truss/local/local_config.py +2 -6
- truss/local/local_config_handler.py +20 -5
- truss/patch/__init__.py +1 -0
- truss/patch/hash.py +4 -70
- truss/patch/signature.py +4 -16
- truss/patch/truss_dir_patch_applier.py +3 -78
- truss/remote/baseten/api.py +308 -23
- truss/remote/baseten/auth.py +3 -3
- truss/remote/baseten/core.py +257 -50
- truss/remote/baseten/custom_types.py +44 -0
- truss/remote/baseten/error.py +4 -0
- truss/remote/baseten/remote.py +369 -118
- truss/remote/baseten/service.py +118 -11
- truss/remote/baseten/utils/status.py +29 -0
- truss/remote/baseten/utils/tar.py +34 -22
- truss/remote/baseten/utils/transfer.py +36 -23
- truss/remote/remote_factory.py +14 -5
- truss/remote/truss_remote.py +72 -45
- truss/templates/base.Dockerfile.jinja +18 -16
- truss/templates/cache.Dockerfile.jinja +3 -3
- truss/{server → templates/control}/control/application.py +14 -35
- truss/{server → templates/control}/control/endpoints.py +39 -9
- truss/{server/control/patch/types.py → templates/control/control/helpers/custom_types.py} +13 -52
- truss/{server → templates/control}/control/helpers/inference_server_controller.py +4 -8
- truss/{server → templates/control}/control/helpers/inference_server_process_controller.py +2 -4
- truss/{server → templates/control}/control/helpers/inference_server_starter.py +5 -10
- truss/{server/control → templates/control/control/helpers}/truss_patch/model_code_patch_applier.py +8 -6
- truss/{server/control/patch → templates/control/control/helpers/truss_patch}/model_container_patch_applier.py +18 -26
- truss/templates/control/control/helpers/truss_patch/requirement_name_identifier.py +66 -0
- truss/{server → templates/control}/control/server.py +11 -6
- truss/templates/control/requirements.txt +9 -0
- truss/templates/custom_python_dx/my_model.py +28 -0
- truss/templates/docker_server/proxy.conf.jinja +42 -0
- truss/templates/docker_server/supervisord.conf.jinja +27 -0
- truss/templates/docker_server_requirements.txt +1 -0
- truss/templates/server/common/errors.py +231 -0
- truss/{server → templates/server}/common/patches/whisper/patch.py +1 -0
- truss/{server/common/patches/__init__.py → templates/server/common/patches.py} +1 -3
- truss/{server → templates/server}/common/retry.py +1 -0
- truss/{server → templates/server}/common/schema.py +11 -9
- truss/templates/server/common/tracing.py +157 -0
- truss/templates/server/main.py +9 -0
- truss/templates/server/model_wrapper.py +961 -0
- truss/templates/server/requirements.txt +21 -0
- truss/templates/server/truss_server.py +447 -0
- truss/templates/server.Dockerfile.jinja +62 -14
- truss/templates/shared/dynamic_config_resolver.py +28 -0
- truss/templates/shared/lazy_data_resolver.py +164 -0
- truss/templates/shared/log_config.py +125 -0
- truss/{server → templates}/shared/secrets_resolver.py +1 -2
- truss/{server → templates}/shared/serialization.py +31 -9
- truss/{server → templates}/shared/util.py +3 -13
- truss/templates/trtllm-audio/model/model.py +49 -0
- truss/templates/trtllm-audio/packages/sigint_patch.py +14 -0
- truss/templates/trtllm-audio/packages/whisper_trt/__init__.py +215 -0
- truss/templates/trtllm-audio/packages/whisper_trt/assets.py +25 -0
- truss/templates/trtllm-audio/packages/whisper_trt/batching.py +52 -0
- truss/templates/trtllm-audio/packages/whisper_trt/custom_types.py +26 -0
- truss/templates/trtllm-audio/packages/whisper_trt/modeling.py +184 -0
- truss/templates/trtllm-audio/packages/whisper_trt/tokenizer.py +185 -0
- truss/templates/trtllm-audio/packages/whisper_trt/utils.py +245 -0
- truss/templates/trtllm-briton/src/extension.py +64 -0
- truss/tests/conftest.py +302 -94
- truss/tests/contexts/image_builder/test_serving_image_builder.py +74 -31
- truss/tests/contexts/local_loader/test_load_local.py +2 -2
- truss/tests/contexts/local_loader/test_truss_module_finder.py +1 -1
- truss/tests/patch/test_calc_patch.py +439 -127
- truss/tests/patch/test_dir_signature.py +3 -12
- truss/tests/patch/test_hash.py +1 -1
- truss/tests/patch/test_signature.py +1 -1
- truss/tests/patch/test_truss_dir_patch_applier.py +23 -11
- truss/tests/patch/test_types.py +2 -2
- truss/tests/remote/baseten/test_api.py +153 -58
- truss/tests/remote/baseten/test_auth.py +2 -1
- truss/tests/remote/baseten/test_core.py +160 -12
- truss/tests/remote/baseten/test_remote.py +489 -77
- truss/tests/remote/baseten/test_service.py +55 -0
- truss/tests/remote/test_remote_factory.py +16 -18
- truss/tests/remote/test_truss_remote.py +26 -17
- truss/tests/templates/control/control/helpers/test_context_managers.py +11 -0
- truss/tests/templates/control/control/helpers/test_model_container_patch_applier.py +184 -0
- truss/tests/templates/control/control/helpers/test_requirement_name_identifier.py +89 -0
- truss/tests/{server → templates/control}/control/test_server.py +79 -24
- truss/tests/{server → templates/control}/control/test_server_integration.py +24 -16
- truss/tests/templates/core/server/test_dynamic_config_resolver.py +108 -0
- truss/tests/templates/core/server/test_lazy_data_resolver.py +329 -0
- truss/tests/templates/core/server/test_lazy_data_resolver_v2.py +79 -0
- truss/tests/{server → templates}/core/server/test_secrets_resolver.py +1 -1
- truss/tests/{server → templates/server}/common/test_retry.py +3 -3
- truss/tests/templates/server/test_model_wrapper.py +248 -0
- truss/tests/{server → templates/server}/test_schema.py +3 -5
- truss/tests/{server/core/server/common → templates/server}/test_truss_server.py +8 -5
- truss/tests/test_build.py +9 -52
- truss/tests/test_config.py +336 -77
- truss/tests/test_context_builder_image.py +3 -11
- truss/tests/test_control_truss_patching.py +7 -12
- truss/tests/test_custom_server.py +38 -0
- truss/tests/test_data/context_builder_image_test/test.py +3 -0
- truss/tests/test_data/happy.ipynb +56 -0
- truss/tests/test_data/model_load_failure_test/config.yaml +2 -0
- truss/tests/test_data/model_load_failure_test/model/__init__.py +0 -0
- truss/tests/test_data/patch_ping_test_server/__init__.py +0 -0
- truss/{test_data → tests/test_data}/patch_ping_test_server/app.py +3 -9
- truss/{test_data → tests/test_data}/server.Dockerfile +20 -21
- truss/tests/test_data/server_conformance_test_truss/__init__.py +0 -0
- truss/tests/test_data/server_conformance_test_truss/model/__init__.py +0 -0
- truss/{test_data → tests/test_data}/server_conformance_test_truss/model/model.py +1 -3
- truss/tests/test_data/test_async_truss/__init__.py +0 -0
- truss/tests/test_data/test_async_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_basic_truss/__init__.py +0 -0
- truss/tests/test_data/test_basic_truss/config.yaml +16 -0
- truss/tests/test_data/test_basic_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_build_commands/__init__.py +0 -0
- truss/tests/test_data/test_build_commands/config.yaml +13 -0
- truss/tests/test_data/test_build_commands/model/__init__.py +0 -0
- truss/{test_data/test_streaming_async_generator_truss → tests/test_data/test_build_commands}/model/model.py +2 -3
- truss/tests/test_data/test_build_commands_failure/__init__.py +0 -0
- truss/tests/test_data/test_build_commands_failure/config.yaml +14 -0
- truss/tests/test_data/test_build_commands_failure/model/__init__.py +0 -0
- truss/tests/test_data/test_build_commands_failure/model/model.py +17 -0
- truss/tests/test_data/test_concurrency_truss/__init__.py +0 -0
- truss/tests/test_data/test_concurrency_truss/config.yaml +4 -0
- truss/tests/test_data/test_concurrency_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_custom_server_truss/__init__.py +0 -0
- truss/tests/test_data/test_custom_server_truss/config.yaml +20 -0
- truss/tests/test_data/test_custom_server_truss/test_docker_image/Dockerfile +17 -0
- truss/tests/test_data/test_custom_server_truss/test_docker_image/README.md +10 -0
- truss/tests/test_data/test_custom_server_truss/test_docker_image/VERSION +1 -0
- truss/tests/test_data/test_custom_server_truss/test_docker_image/__init__.py +0 -0
- truss/tests/test_data/test_custom_server_truss/test_docker_image/app.py +19 -0
- truss/tests/test_data/test_custom_server_truss/test_docker_image/build_upload_new_image.sh +6 -0
- truss/tests/test_data/test_openai/__init__.py +0 -0
- truss/{test_data/test_basic_truss → tests/test_data/test_openai}/config.yaml +1 -2
- truss/tests/test_data/test_openai/model/__init__.py +0 -0
- truss/tests/test_data/test_openai/model/model.py +15 -0
- truss/tests/test_data/test_pyantic_v1/__init__.py +0 -0
- truss/tests/test_data/test_pyantic_v1/model/__init__.py +0 -0
- truss/tests/test_data/test_pyantic_v1/model/model.py +28 -0
- truss/tests/test_data/test_pyantic_v1/requirements.txt +1 -0
- truss/tests/test_data/test_pyantic_v2/__init__.py +0 -0
- truss/tests/test_data/test_pyantic_v2/config.yaml +13 -0
- truss/tests/test_data/test_pyantic_v2/model/__init__.py +0 -0
- truss/tests/test_data/test_pyantic_v2/model/model.py +30 -0
- truss/tests/test_data/test_pyantic_v2/requirements.txt +1 -0
- truss/tests/test_data/test_requirements_file_truss/__init__.py +0 -0
- truss/tests/test_data/test_requirements_file_truss/config.yaml +13 -0
- truss/tests/test_data/test_requirements_file_truss/model/__init__.py +0 -0
- truss/{test_data → tests/test_data}/test_requirements_file_truss/model/model.py +1 -0
- truss/tests/test_data/test_streaming_async_generator_truss/__init__.py +0 -0
- truss/tests/test_data/test_streaming_async_generator_truss/config.yaml +4 -0
- truss/tests/test_data/test_streaming_async_generator_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_streaming_async_generator_truss/model/model.py +7 -0
- truss/tests/test_data/test_streaming_read_timeout/__init__.py +0 -0
- truss/tests/test_data/test_streaming_read_timeout/model/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss/config.yaml +4 -0
- truss/tests/test_data/test_streaming_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss_with_error/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss_with_error/model/__init__.py +0 -0
- truss/{test_data → tests/test_data}/test_streaming_truss_with_error/model/model.py +3 -11
- truss/tests/test_data/test_streaming_truss_with_error/packages/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss_with_error/packages/helpers_1.py +5 -0
- truss/tests/test_data/test_streaming_truss_with_error/packages/helpers_2.py +2 -0
- truss/tests/test_data/test_streaming_truss_with_tracing/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss_with_tracing/config.yaml +43 -0
- truss/tests/test_data/test_streaming_truss_with_tracing/model/__init__.py +0 -0
- truss/tests/test_data/test_streaming_truss_with_tracing/model/model.py +65 -0
- truss/tests/test_data/test_trt_llm_truss/__init__.py +0 -0
- truss/tests/test_data/test_trt_llm_truss/config.yaml +15 -0
- truss/tests/test_data/test_trt_llm_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_trt_llm_truss/model/model.py +15 -0
- truss/tests/test_data/test_truss/__init__.py +0 -0
- truss/tests/test_data/test_truss/config.yaml +4 -0
- truss/tests/test_data/test_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_truss/model/dummy +0 -0
- truss/tests/test_data/test_truss/packages/__init__.py +0 -0
- truss/tests/test_data/test_truss/packages/test_package/__init__.py +0 -0
- truss/tests/test_data/test_truss_server_caching_truss/__init__.py +0 -0
- truss/tests/test_data/test_truss_server_caching_truss/model/__init__.py +0 -0
- truss/tests/test_data/test_truss_with_error/__init__.py +0 -0
- truss/tests/test_data/test_truss_with_error/config.yaml +4 -0
- truss/tests/test_data/test_truss_with_error/model/__init__.py +0 -0
- truss/tests/test_data/test_truss_with_error/model/model.py +8 -0
- truss/tests/test_data/test_truss_with_error/packages/__init__.py +0 -0
- truss/tests/test_data/test_truss_with_error/packages/helpers_1.py +5 -0
- truss/tests/test_data/test_truss_with_error/packages/helpers_2.py +2 -0
- truss/tests/test_docker.py +2 -1
- truss/tests/test_model_inference.py +1340 -292
- truss/tests/test_model_schema.py +33 -26
- truss/tests/test_testing_utilities_for_other_tests.py +50 -5
- truss/tests/test_truss_gatherer.py +3 -5
- truss/tests/test_truss_handle.py +62 -59
- truss/tests/test_util.py +2 -1
- truss/tests/test_validation.py +15 -13
- truss/tests/trt_llm/test_trt_llm_config.py +41 -0
- truss/tests/trt_llm/test_validation.py +91 -0
- truss/tests/util/test_config_checks.py +40 -0
- truss/tests/util/test_env_vars.py +14 -0
- truss/tests/util/test_path.py +10 -23
- truss/trt_llm/config_checks.py +43 -0
- truss/trt_llm/validation.py +42 -0
- truss/truss_handle/__init__.py +0 -0
- truss/truss_handle/build.py +122 -0
- truss/{decorators.py → truss_handle/decorators.py} +1 -1
- truss/truss_handle/patch/__init__.py +0 -0
- truss/{patch → truss_handle/patch}/calc_patch.py +146 -92
- truss/{types.py → truss_handle/patch/custom_types.py} +35 -27
- truss/{patch → truss_handle/patch}/dir_signature.py +1 -1
- truss/truss_handle/patch/hash.py +71 -0
- truss/{patch → truss_handle/patch}/local_truss_patch_applier.py +6 -4
- truss/truss_handle/patch/signature.py +22 -0
- truss/truss_handle/patch/truss_dir_patch_applier.py +87 -0
- truss/{readme_generator.py → truss_handle/readme_generator.py} +3 -2
- truss/{truss_gatherer.py → truss_handle/truss_gatherer.py} +3 -2
- truss/{truss_handle.py → truss_handle/truss_handle.py} +174 -78
- truss/util/.truss_ignore +3 -0
- truss/{docker.py → util/docker.py} +6 -2
- truss/util/download.py +6 -15
- truss/util/env_vars.py +41 -0
- truss/util/log_utils.py +52 -0
- truss/util/path.py +20 -20
- truss/util/requirements.py +11 -0
- {truss-0.10.0rc1.dist-info → truss-0.60.0.dist-info}/METADATA +18 -16
- truss-0.60.0.dist-info/RECORD +324 -0
- {truss-0.10.0rc1.dist-info → truss-0.60.0.dist-info}/WHEEL +1 -1
- truss-0.60.0.dist-info/entry_points.txt +4 -0
- truss_chains/__init__.py +71 -0
- truss_chains/definitions.py +756 -0
- truss_chains/deployment/__init__.py +0 -0
- truss_chains/deployment/code_gen.py +816 -0
- truss_chains/deployment/deployment_client.py +871 -0
- truss_chains/framework.py +1480 -0
- truss_chains/public_api.py +231 -0
- truss_chains/py.typed +0 -0
- truss_chains/pydantic_numpy.py +131 -0
- truss_chains/reference_code/reference_chainlet.py +34 -0
- truss_chains/reference_code/reference_model.py +10 -0
- truss_chains/remote_chainlet/__init__.py +0 -0
- truss_chains/remote_chainlet/model_skeleton.py +60 -0
- truss_chains/remote_chainlet/stub.py +380 -0
- truss_chains/remote_chainlet/utils.py +332 -0
- truss_chains/streaming.py +378 -0
- truss_chains/utils.py +178 -0
- CODE_OF_CONDUCT.md +0 -131
- CONTRIBUTING.md +0 -48
- README.md +0 -137
- context_builder.Dockerfile +0 -24
- truss/blob/blob_backend.py +0 -10
- truss/blob/blob_backend_registry.py +0 -23
- truss/blob/http_public_blob_backend.py +0 -23
- truss/build/__init__.py +0 -2
- truss/build/build.py +0 -143
- truss/build/configure.py +0 -63
- truss/cli/__init__.py +0 -2
- truss/cli/console.py +0 -5
- truss/cli/create.py +0 -5
- truss/config/trt_llm.py +0 -81
- truss/constants.py +0 -61
- truss/model_inference.py +0 -123
- truss/patch/types.py +0 -30
- truss/pytest.ini +0 -7
- truss/server/common/errors.py +0 -100
- truss/server/common/termination_handler_middleware.py +0 -64
- truss/server/common/truss_server.py +0 -389
- truss/server/control/patch/model_code_patch_applier.py +0 -46
- truss/server/control/patch/requirement_name_identifier.py +0 -17
- truss/server/inference_server.py +0 -29
- truss/server/model_wrapper.py +0 -434
- truss/server/shared/logging.py +0 -81
- truss/templates/trtllm/model/model.py +0 -97
- truss/templates/trtllm/packages/build_engine_utils.py +0 -34
- truss/templates/trtllm/packages/constants.py +0 -11
- truss/templates/trtllm/packages/schema.py +0 -216
- truss/templates/trtllm/packages/tensorrt_llm_model_repository/ensemble/config.pbtxt +0 -246
- truss/templates/trtllm/packages/tensorrt_llm_model_repository/postprocessing/1/model.py +0 -181
- truss/templates/trtllm/packages/tensorrt_llm_model_repository/postprocessing/config.pbtxt +0 -64
- truss/templates/trtllm/packages/tensorrt_llm_model_repository/preprocessing/1/model.py +0 -260
- truss/templates/trtllm/packages/tensorrt_llm_model_repository/preprocessing/config.pbtxt +0 -99
- truss/templates/trtllm/packages/tensorrt_llm_model_repository/tensorrt_llm/config.pbtxt +0 -208
- truss/templates/trtllm/packages/triton_client.py +0 -150
- truss/templates/trtllm/packages/utils.py +0 -43
- truss/test_data/context_builder_image_test/test.py +0 -4
- truss/test_data/happy.ipynb +0 -54
- truss/test_data/model_load_failure_test/config.yaml +0 -2
- truss/test_data/test_concurrency_truss/config.yaml +0 -2
- truss/test_data/test_streaming_async_generator_truss/config.yaml +0 -2
- truss/test_data/test_streaming_truss/config.yaml +0 -3
- truss/test_data/test_truss/config.yaml +0 -2
- truss/tests/server/common/test_termination_handler_middleware.py +0 -93
- truss/tests/server/control/test_model_container_patch_applier.py +0 -203
- truss/tests/server/core/server/common/test_util.py +0 -19
- truss/tests/server/test_model_wrapper.py +0 -87
- truss/util/data_structures.py +0 -16
- truss-0.10.0rc1.dist-info/RECORD +0 -216
- truss-0.10.0rc1.dist-info/entry_points.txt +0 -3
- truss/{server/shared → base}/__init__.py +0 -0
- truss/{server → templates/control}/control/helpers/context_managers.py +0 -0
- truss/{server/control → templates/control/control/helpers}/errors.py +0 -0
- truss/{server/control/patch → templates/control/control/helpers/truss_patch}/__init__.py +0 -0
- truss/{server/control/patch → templates/control/control/helpers/truss_patch}/system_packages.py +0 -0
- truss/{test_data/annotated_types_truss/model → templates/server}/__init__.py +0 -0
- truss/{server → templates/server}/common/__init__.py +0 -0
- truss/{test_data/gcs_fix/model → templates/shared}/__init__.py +0 -0
- truss/templates/{trtllm → trtllm-briton}/README.md +0 -0
- truss/{test_data/server_conformance_test_truss/model → tests/test_data}/__init__.py +0 -0
- truss/{test_data/test_basic_truss/model → tests/test_data/annotated_types_truss}/__init__.py +0 -0
- truss/{test_data → tests/test_data}/annotated_types_truss/config.yaml +0 -0
- truss/{test_data/test_requirements_file_truss → tests/test_data/annotated_types_truss}/model/__init__.py +0 -0
- truss/{test_data → tests/test_data}/annotated_types_truss/model/model.py +0 -0
- truss/{test_data → tests/test_data}/auto-mpg.data +0 -0
- truss/{test_data → tests/test_data}/context_builder_image_test/Dockerfile +0 -0
- truss/{test_data/test_truss/model → tests/test_data/context_builder_image_test}/__init__.py +0 -0
- truss/{test_data/test_truss_server_caching_truss/model → tests/test_data/gcs_fix}/__init__.py +0 -0
- truss/{test_data → tests/test_data}/gcs_fix/config.yaml +0 -0
- truss/tests/{local → test_data/gcs_fix/model}/__init__.py +0 -0
- truss/{test_data → tests/test_data}/gcs_fix/model/model.py +0 -0
- truss/{test_data/test_truss/model/dummy → tests/test_data/model_load_failure_test/__init__.py} +0 -0
- truss/{test_data → tests/test_data}/model_load_failure_test/model/model.py +0 -0
- truss/{test_data → tests/test_data}/pima-indians-diabetes.csv +0 -0
- truss/{test_data → tests/test_data}/readme_int_example.md +0 -0
- truss/{test_data → tests/test_data}/readme_no_example.md +0 -0
- truss/{test_data → tests/test_data}/readme_str_example.md +0 -0
- truss/{test_data → tests/test_data}/server_conformance_test_truss/config.yaml +0 -0
- truss/{test_data → tests/test_data}/test_async_truss/config.yaml +0 -0
- truss/{test_data → tests/test_data}/test_async_truss/model/model.py +3 -3
- truss/{test_data → tests/test_data}/test_basic_truss/model/model.py +0 -0
- truss/{test_data → tests/test_data}/test_concurrency_truss/model/model.py +0 -0
- truss/{test_data/test_requirements_file_truss → tests/test_data/test_pyantic_v1}/config.yaml +0 -0
- truss/{test_data → tests/test_data}/test_requirements_file_truss/requirements.txt +0 -0
- truss/{test_data → tests/test_data}/test_streaming_read_timeout/config.yaml +0 -0
- truss/{test_data → tests/test_data}/test_streaming_read_timeout/model/model.py +0 -0
- truss/{test_data → tests/test_data}/test_streaming_truss/model/model.py +0 -0
- truss/{test_data → tests/test_data}/test_streaming_truss_with_error/config.yaml +0 -0
- truss/{test_data → tests/test_data}/test_truss/examples.yaml +0 -0
- truss/{test_data → tests/test_data}/test_truss/model/model.py +0 -0
- truss/{test_data → tests/test_data}/test_truss/packages/test_package/test.py +0 -0
- truss/{test_data → tests/test_data}/test_truss_server_caching_truss/config.yaml +0 -0
- truss/{test_data → tests/test_data}/test_truss_server_caching_truss/model/model.py +0 -0
- truss/{patch → truss_handle/patch}/constants.py +0 -0
- truss/{notebook.py → util/notebook.py} +0 -0
- {truss-0.10.0rc1.dist-info → truss-0.60.0.dist-info}/LICENSE +0 -0
--- a/truss/tests/test_model_inference.py
+++ b/truss/tests/test_model_inference.py
@@ -1,46 +1,78 @@
+import asyncio
 import concurrent
+import contextlib
+import dataclasses
 import inspect
 import json
 import logging
+import pathlib
+import sys
 import tempfile
 import textwrap
 import time
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from threading import Thread
+from typing import Iterator, Mapping, Optional
 
+import httpx
+import opentelemetry.trace.propagation.tracecontext as tracecontext
 import pytest
 import requests
+from opentelemetry import context, trace
+from python_on_whales import Container
 from requests.exceptions import RequestException
+
+from truss.base.truss_config import map_to_supported_python_version
 from truss.local.local_config_handler import LocalConfigHandler
-from truss.model_inference import map_to_supported_python_version
 from truss.tests.helpers import create_truss
 from truss.tests.test_testing_utilities_for_other_tests import ensure_kill_all
-from truss.truss_handle import TrussHandle
+from truss.truss_handle.truss_handle import TrussHandle, wait_for_truss
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_LOG_ERROR = "Internal Server Error"
+PREDICT_URL = "http://localhost:8090/v1/models/model:predict"
+COMPLETIONS_URL = "http://localhost:8090/v1/completions"
+CHAT_COMPLETIONS_URL = "http://localhost:8090/v1/chat/completions"
+
 
+@pytest.fixture
+def anyio_backend():
+    return "asyncio"
 
-def _log_contains_error(line: dict, error: str, message: str):
+
+def _log_contains_line(
+    line: dict, message: str, level: str, error: Optional[str] = None
+):
     return (
-        line["levelname"] == "ERROR"
-        and line["message"] == message
-        and error in line["exc_info"]
+        line["levelname"] == level
+        and message in line["message"]
+        and (error is None or error in line["exc_info"])
     )
 
 
-def assert_logs_contain_error(logs: str, error: str, message=DEFAULT_LOG_ERROR):
-    loglines = logs.splitlines()
+def _assert_logs_contain_error(logs: str, error: str, message=DEFAULT_LOG_ERROR):
+    loglines = [json.loads(line) for line in logs.splitlines()]
     assert any(
-        _log_contains_error(json.loads(line), error, message) for line in loglines
+        _log_contains_line(line, message, "ERROR", error) for line in loglines
+    ), (
+        f"Did not find expected error in logs.\nExpected error: {error}\n"
+        f"Expected message: {message}\nActual logs:\n{loglines}"
+    )
+
+
+def _assert_logs_contain(logs: str, message: str, level: str = "INFO"):
+    loglines = [json.loads(line) for line in logs.splitlines()]
+    assert any(_log_contains_line(line, message, level) for line in loglines), (
+        f"Did not find expected logs.\n"
+        f"Expected message: {message}\nActual logs:\n{loglines}"
     )
 
 
-class PropagatingThread(Thread):
+class _PropagatingThread(Thread):
     """
-    PropagatingThread allows us to run threads and keep track of exceptions
+    _PropagatingThread allows us to run threads and keep track of exceptions
     thrown.
     """
@@ -52,22 +84,31 @@ class PropagatingThread(Thread):
             self.exc = e
 
     def join(self, timeout=None):
-        super(PropagatingThread, self).join(timeout)
+        super(_PropagatingThread, self).join(timeout)
         if self.exc:
             raise self.exc
         return self.ret
 
 
+@contextlib.contextmanager
+def _temp_truss(model_src: str, config_src: str = "") -> Iterator[TrussHandle]:
+    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
+        truss_dir = Path(tmp_work_dir, "truss")
+        create_truss(truss_dir, config_src, textwrap.dedent(model_src))
+        yield TrussHandle(truss_dir)
+
+
+# Test Cases ###########################################################################
+
+
 @pytest.mark.parametrize(
     "python_version, expected_python_version",
     [
-        ("py37", "py38"),
         ("py38", "py38"),
         ("py39", "py39"),
         ("py310", "py310"),
         ("py311", "py311"),
         ("py312", "py311"),
-        ("py36", "py38"),
     ],
 )
 def test_map_to_supported_python_version(python_version, expected_python_version):
@@ -75,11 +116,54 @@ def test_map_to_supported_python_version(python_version, expected_python_version
     assert out_python_version == expected_python_version
 
 
+def test_not_supported_python_minor_versions():
+    with pytest.raises(
+        ValueError,
+        match="Mapping python version 3.6 to 3.8, "
+        "the lowest version that Truss currently supports.",
+    ):
+        map_to_supported_python_version("py36")
+    with pytest.raises(
+        ValueError,
+        match="Mapping python version 3.7 to 3.8, "
+        "the lowest version that Truss currently supports.",
+    ):
+        map_to_supported_python_version("py37")
+
+
+def test_not_supported_python_major_versions():
+    with pytest.raises(NotImplementedError, match="Only python version 3 is supported"):
+        map_to_supported_python_version("py211")
+
+
 @pytest.mark.integration
-def test_model_load_failure_truss():
+def test_model_load_logs(test_data_path):
+    model = """
+    from typing import Optional
+    import logging
+    class Model:
+        def load(self):
+            logging.info(f"User Load Message")
+
+        def predict(self, model_input):
+            return self.environment_name
+    """
+    config = "model_name: init-environment-truss"
+    with ensure_kill_all(), _temp_truss(model, config) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        logs = container.logs()
+        _assert_logs_contain(logs, message="Executing model.load()")
+        _assert_logs_contain(logs, message="Loading truss model from file")
+        _assert_logs_contain(logs, message="Completed model.load()")
+        _assert_logs_contain(logs, message="User Load Message")
+
+
+@pytest.mark.integration
+def test_model_load_failure_truss(test_data_path):
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-        truss_dir = truss_root / "test_data" / "model_load_failure_test"
+        truss_dir = test_data_path / "model_load_failure_test"
         tr = TrussHandle(truss_dir)
 
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=False)
@@ -110,6 +194,12 @@ def test_model_load_failure_truss():
             assert ready.status_code == expected_code
             return True
 
+        @handle_request_exception
+        def _test_is_loaded(expected_code):
+            ready = requests.get(f"{truss_server_addr}/v1/models/model/loaded")
+            assert ready.status_code == expected_code
+            return True
+
         @handle_request_exception
         def _test_ping(expected_code):
             ping = requests.get(f"{truss_server_addr}/ping")
@@ -118,43 +208,39 @@ def test_model_load_failure_truss():
 
         @handle_request_exception
         def _test_invocations(expected_code):
-            invocations = requests.post(f"{truss_server_addr}/invocations", json={})
+            invocations = requests.post(
+                f"{truss_server_addr}/v1/models/model:predict", json={}
+            )
             assert invocations.status_code == expected_code
             return True
 
        # The server should be completely down so all requests should result in a RequestException.
        # The decorator handle_request_exception catches the RequestException and returns False.
-        assert not _test_readiness_probe(expected_code=200)
        assert not _test_liveness_probe(expected_code=200)
+        assert not _test_readiness_probe(expected_code=200)
+        assert not _test_is_loaded(expected_code=200)
        assert not _test_ping(expected_code=200)
        assert not _test_invocations(expected_code=200)
 
 
 @pytest.mark.integration
-def test_concurrency_truss():
+def test_concurrency_truss(test_data_path):
     # Tests that concurrency limits work correctly
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-
-        truss_dir = truss_root / "test_data" / "test_concurrency_truss"
-
+        truss_dir = test_data_path / "test_concurrency_truss"
         tr = TrussHandle(truss_dir)
-
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
 
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
-
         # Each request takes 2 seconds, for this thread, we allow
         # a concurrency of 2. This means the first two requests will
         # succeed within the 2 seconds, and the third will fail, since
         # it cannot start until the first two have completed.
         def make_request():
-            requests.post(full_url, json={}, timeout=3)
+            requests.post(PREDICT_URL, json={}, timeout=3)
 
-        successful_thread_1 = PropagatingThread(target=make_request)
-        successful_thread_2 = PropagatingThread(target=make_request)
-        failed_thread = PropagatingThread(target=make_request)
+        successful_thread_1 = _PropagatingThread(target=make_request)
+        successful_thread_2 = _PropagatingThread(target=make_request)
+        failed_thread = _PropagatingThread(target=make_request)
 
         successful_thread_1.start()
         successful_thread_2.start()
@@ -169,38 +255,40 @@ def test_concurrency_truss():
 
 
 @pytest.mark.integration
-def test_requirements_file_truss():
+def test_requirements_file_truss(test_data_path):
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-
-        truss_dir = truss_root / "test_data" / "test_requirements_file_truss"
-
+        truss_dir = test_data_path / "test_requirements_file_truss"
         tr = TrussHandle(truss_dir)
-
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
+        time.sleep(3)  # Sleeping to allow the load to finish
 
         # The prediction imports torch which is specified in a requirements.txt and returns if GPU is available.
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
         assert response.status_code == 200
         assert response.json() is False
 
 
 @pytest.mark.integration
-def test_async_truss():
+@pytest.mark.parametrize("pydantic_major_version", ["1", "2"])
+def test_requirements_pydantic(test_data_path, pydantic_major_version):
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
+        truss_dir = test_data_path / f"test_pyantic_v{pydantic_major_version}"
+        tr = TrussHandle(truss_dir)
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
 
-        truss_dir = truss_root / "test_data" / "test_async_truss"
+        response = requests.post(PREDICT_URL, json={})
+        assert response.status_code == 200
+        assert response.json() == '{\n  "foo": "bla",\n  "bar": 123\n}'
 
-        tr = TrussHandle(truss_dir)
 
+@pytest.mark.integration
+def test_async_truss(test_data_path):
+    with ensure_kill_all():
+        truss_dir = test_data_path / "test_async_truss"
+        tr = TrussHandle(truss_dir)
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
         assert response.json() == {
             "preprocess_value": "value",
             "postprocess_value": "value",
@@ -208,58 +296,44 @@ def test_async_truss():
 
 
 @pytest.mark.integration
-def test_async_streaming():
+def test_async_streaming(test_data_path):
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-
-        truss_dir = truss_root / "test_data" / "test_streaming_async_generator_truss"
-
+        truss_dir = test_data_path / "test_streaming_async_generator_truss"
         tr = TrussHandle(truss_dir)
-
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={}, stream=True)
+        response = requests.post(PREDICT_URL, json={}, stream=True)
         assert response.headers.get("transfer-encoding") == "chunked"
         assert [
             byte_string.decode() for byte_string in list(response.iter_content())
         ] == ["0", "1", "2", "3", "4"]
 
         predict_non_stream_response = requests.post(
-            full_url,
-            json={},
-            stream=True,
-            headers={"accept": "application/json"},
+            PREDICT_URL, json={}, stream=True, headers={"accept": "application/json"}
         )
         assert "transfer-encoding" not in predict_non_stream_response.headers
         assert predict_non_stream_response.json() == "01234"
 
 
 @pytest.mark.integration
-def test_async_streaming_timeout():
+def test_async_streaming_timeout(test_data_path):
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-
-        truss_dir = truss_root / "test_data" / "test_streaming_read_timeout"
-
+        truss_dir = test_data_path / "test_streaming_read_timeout"
         tr = TrussHandle(truss_dir)
-
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
         )
-        truss_server_addr = "http://localhost:8090"
-        predict_url = f"{truss_server_addr}/v1/models/model:predict"
 
         # ChunkedEncodingError is raised when the chunk does not get processed due to streaming read timeout
         with pytest.raises(requests.exceptions.ChunkedEncodingError):
-            response = requests.post(predict_url, json={}, stream=True)
+            response = requests.post(PREDICT_URL, json={}, stream=True)
 
             for chunk in response.iter_content():
                 pass
 
         # Check to ensure the Timeout error is in the container logs
-        assert_logs_contain_error(
+        # TODO: maybe intercept this error better?
+        _assert_logs_contain_error(
             container.logs(),
             error="raise exceptions.TimeoutError()",
             message="Exception in ASGI application\n",
@@ -267,20 +341,16 @@ def test_async_streaming_timeout():
 
 
 @pytest.mark.integration
-def test_streaming_with_error():
+def test_streaming_with_error_and_stacktrace(test_data_path):
     with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-
-        truss_dir = truss_root / "test_data" / "test_streaming_truss_with_error"
-
+        truss_dir = test_data_path / "test_streaming_truss_with_error"
         tr = TrussHandle(truss_dir)
-
-        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        predict_url = f"{truss_server_addr}/v1/models/model:predict"
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
 
         predict_error_response = requests.post(
-            predict_url, json={"throw_error": True}, stream=True, timeout=2
+            PREDICT_URL, json={"throw_error": True}, stream=True, timeout=2
         )
 
         # In error cases, the response will return whatever the stream returned,
@@ -293,73 +363,28 @@ def test_streaming_with_error():
 
         # Test that we are able to continue to make requests successfully
         predict_non_error_response = requests.post(
-            predict_url, json={"throw_error": False}, stream=True, timeout=2
+            PREDICT_URL, json={"throw_error": False}, stream=True, timeout=2
         )
 
         assert [
             byte_string.decode()
             for byte_string in predict_non_error_response.iter_content()
         ] == ["0", "1", "2", "3", "4"]
-
-
-@pytest.mark.integration
-def test_streaming_truss():
-    with ensure_kill_all():
-        truss_root = Path(__file__).parent.parent.parent.resolve() / "truss"
-        truss_dir = truss_root / "test_data" / "test_streaming_truss"
-
-        tr = TrussHandle(truss_dir)
-
-        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        predict_url = f"{truss_server_addr}/v1/models/model:predict"
-
-        # A request for which response is not completely read
-        predict_response = requests.post(predict_url, json={}, stream=True)
-        # We just read the first part and leave it hanging here
-        next(predict_response.iter_content())
-
-        predict_response = requests.post(predict_url, json={}, stream=True)
-
-        assert predict_response.headers.get("transfer-encoding") == "chunked"
-        assert [
-            byte_string.decode()
-            for byte_string in list(predict_response.iter_content())
-        ] == ["0", "1", "2", "3", "4"]
-
-        # When accept is set to application/json, the response is not streamed.
-        predict_non_stream_response = requests.post(
-            predict_url,
-            json={},
-            stream=True,
-            headers={"accept": "application/json"},
+        expected_stack_trace = (
+            "Traceback (most recent call last):\n"
+            '  File "/app/model/model.py", line 12, in inner\n'
+            "    helpers_1.foo(123)\n"
+            '  File "/packages/helpers_1.py", line 5, in foo\n'
+            "    return helpers_2.bar(x)\n"
+            '  File "/packages/helpers_2.py", line 2, in bar\n'
+            '    raise Exception("Crashed in `bar`.")\n'
+            "Exception: Crashed in `bar`."
+        )
+        _assert_logs_contain_error(
+            container.logs(),
+            error=expected_stack_trace,
+            message="Exception while generating streamed response: Crashed in `bar`.",
         )
-        assert "transfer-encoding" not in predict_non_stream_response.headers
-        assert predict_non_stream_response.json() == "01234"
-
-        # Test that concurrency work correctly. The streaming Truss has a configured
-        # concurrency of 1, so only one request can be in flight at a time. Each request
-        # takes 2 seconds, so with a timeout of 3 seconds, we expect the first request to
-        # succeed and for the second to timeout.
-        #
-        # Note that with streamed requests, requests.post raises a ReadTimeout exception if
-        # `timeout` seconds has passed since receiving any data from the server.
-        def make_request(delay: int):
-            # For streamed responses, requests does not start receiving content from server until
-            # `iter_content` is called, so we must call this in order to get an actual timeout.
-            time.sleep(delay)
-            list(requests.post(predict_url, json={}, stream=True).iter_content())
-
-        with ThreadPoolExecutor() as e:
-            # We use concurrent.futures.wait instead of the timeout property
-            # on requests, since requests timeout property has a complex interaction
-            # with streaming.
-            first_request = e.submit(make_request, 0)
-            second_request = e.submit(make_request, 0.2)
-            futures = [first_request, second_request]
-            done, not_done = concurrent.futures.wait(futures, timeout=3)
-            assert first_request in done
-            assert second_request in not_done
 
 
 @pytest.mark.integration
@@ -378,106 +403,50 @@ secrets:
 
     config_with_no_secret = "model_name: secrets-truss"
     missing_secret_error_message = """Secret 'secret' not found. Please ensure that:
-  * Secret 'secret' is defined in the 'secrets' section of the Truss config file
-  * The model was pushed with the --trusted flag"""
+  * Secret 'secret' is defined in the 'secrets' section of the Truss config file"""
 
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(inspect.getsource(Model)))
-
-        tr = TrussHandle(truss_dir)
+    with ensure_kill_all(), _temp_truss(inspect.getsource(Model), config) as tr:
         LocalConfigHandler.set_secret("secret", "secret_value")
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
 
         assert response.json() == "secret_value"
 
-
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        # Case where the secret is not specified in the config
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(
-            truss_dir, config_with_no_secret, textwrap.dedent(inspect.getsource(Model))
-        )
-        tr = TrussHandle(truss_dir)
+    # Case where the secret is not specified in the config
+    with ensure_kill_all(), _temp_truss(
+        inspect.getsource(Model), config_with_no_secret
+    ) as tr:
         LocalConfigHandler.set_secret("secret", "secret_value")
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
         )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
-
-        response = requests.post(full_url, json={})
 
+        response = requests.post(PREDICT_URL, json={})
         assert "error" in response.json()
-
-        assert_logs_contain_error(container.logs(), missing_secret_error_message)
+        _assert_logs_contain_error(container.logs(), missing_secret_error_message)
         assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
 
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        # Case where the secret is not mounted
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(inspect.getsource(Model)))
-        tr = TrussHandle(truss_dir)
+    # Case where the secret is not mounted
+    with ensure_kill_all(), _temp_truss(inspect.getsource(Model), config) as tr:
         LocalConfigHandler.remove_secret("secret")
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
        )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
         assert response.status_code == 500
-
-        assert_logs_contain_error(container.logs(), missing_secret_error_message)
+        _assert_logs_contain_error(container.logs(), missing_secret_error_message)
         assert "Internal Server Error" in response.json()["error"]
-
-
-@pytest.mark.integration
-def test_prints_captured_in_log():
-    class Model:
-        def predict(self, request):
-            print("This is a message from the Truss: Hello World!")
-            return {}
-
-    config = """model_name: printing-truss"""
-
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        # Case where the secret is not specified in the config
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(inspect.getsource(Model)))
-        tr = TrussHandle(truss_dir)
-        container = tr.docker_run(
-            local_port=8090, detach=True, wait_for_server_ready=True
-        )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
-
-        _ = requests.post(full_url, json={})
-
-        loglines = container.logs().splitlines()
-
-        relevant_line = None
-        for line in loglines:
-            logline = json.loads(line)
-            if logline["message"] == "This is a message from the Truss: Hello World!":
-                relevant_line = logline
-                break
-
-        # check that log line has other attributes and could be found
-        assert relevant_line is not None, "Relevant log line not found."
-        assert "asctime" in relevant_line
-        assert "levelname" in relevant_line
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
 
 
 @pytest.mark.integration
 def test_postprocess_with_streaming_predict():
+    # TODO: revisit the decision to forbid this. If so remove below comment.
     """
     Test a Truss that has streaming response from both predict and postprocess.
     In this case, the postprocess step continues to happen within the predict lock,
@@ -498,26 +467,26 @@ def test_postprocess_with_streaming_predict():
             yield str(i)
     """
 
-
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(model))
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
 
-        tr = TrussHandle(truss_dir)
-        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
-
-        response = requests.post(full_url, json={}, stream=True)
-
-        assert response.content == b"0 modified1 modified"
+        response = requests.post(PREDICT_URL, json={}, stream=True)
+        logging.info(response.content)
+        _assert_logs_contain_error(
+            container.logs(),
+            "ModelDefinitionError: If the predict function returns a generator (streaming), you cannot use postprocessing.",
+        )
+        assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
 
 
 @pytest.mark.integration
 def test_streaming_postprocess():
     """
-    Tests a Truss where predict returns non-streaming, but postprocess is
+    Tests a Truss where predict returns non-streaming, but postprocess is streamed, and
     ensures that the postprocess step does not happen within the predict lock. To do this,
     we sleep for two seconds during the postprocess streaming process, and fire off two
     requests with a total timeout of 3 seconds, ensuring that if they were serialized
@@ -536,22 +505,14 @@ def test_streaming_postprocess():
             return ["0", "1"]
     """
 
-
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(model))
-
-        tr = TrussHandle(truss_dir)
+    with ensure_kill_all(), _temp_truss(model) as tr:
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
         def make_request(delay: int):
             # For streamed responses, requests does not start receiving content from server until
             # `iter_content` is called, so we must call this in order to get an actual timeout.
             time.sleep(delay)
-            response = requests.post(full_url, json={}, stream=True)
+            response = requests.post(PREDICT_URL, json={}, stream=True)
 
             assert response.status_code == 200
             assert response.content == b"0 modified1 modified"
@@ -599,20 +560,12 @@ def test_postprocess():
 
     """
 
-
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(model))
-
-        tr = TrussHandle(truss_dir)
+    with ensure_kill_all(), _temp_truss(model) as tr:
         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
         def make_request(delay: int):
             time.sleep(delay)
-            response = requests.post(full_url, json={})
+            response = requests.post(PREDICT_URL, json={})
             assert response.status_code == 200
             assert response.json() == ["0 modified", "1 modified"]
 
@@ -642,27 +595,20 @@ def test_truss_with_errors():
             raise ValueError("error")
     """
 
-
-
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(model))
-
-        tr = TrussHandle(truss_dir)
+    with ensure_kill_all(), _temp_truss(model) as tr:
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
         )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
         assert response.status_code == 500
         assert "error" in response.json()
 
-        assert_logs_contain_error(container.logs(), "ValueError: error")
+        _assert_logs_contain_error(container.logs(), "ValueError: error")
 
         assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
 
     model_preprocess_error = """
     class Model:
@@ -673,24 +619,19 @@ def test_truss_with_errors():
             return {"a": "b"}
     """
 
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(model_preprocess_error))
-
-        tr = TrussHandle(truss_dir)
+    with ensure_kill_all(), _temp_truss(model_preprocess_error) as tr:
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
         )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
         assert response.status_code == 500
         assert "error" in response.json()
 
-        assert_logs_contain_error(container.logs(), "ValueError: error")
+        _assert_logs_contain_error(container.logs(), "ValueError: error")
         assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
 
     model_postprocess_error = """
     class Model:
@@ -701,23 +642,18 @@ def test_truss_with_errors():
             raise ValueError("error")
     """
 
-    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
-        truss_dir = Path(tmp_work_dir, "truss")
-
-        create_truss(truss_dir, config, textwrap.dedent(model_postprocess_error))
-
-        tr = TrussHandle(truss_dir)
+    with ensure_kill_all(), _temp_truss(model_postprocess_error) as tr:
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
         )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"
 
-        response = requests.post(full_url, json={})
+        response = requests.post(PREDICT_URL, json={})
         assert response.status_code == 500
         assert "error" in response.json()
-        assert_logs_contain_error(container.logs(), "ValueError: error")
+        _assert_logs_contain_error(container.logs(), "ValueError: error")
         assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
 
     model_async = """
     class Model:
@@ -725,32 +661,93 @@ def test_truss_with_errors():
         raise ValueError("error")
     """

-    with ensure_kill_all():
-
+    with ensure_kill_all(), _temp_truss(model_async) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )

-
+        response = requests.post(PREDICT_URL, json={})
+        assert response.status_code == 500
+        assert "error" in response.json()

-
+        _assert_logs_contain_error(container.logs(), "ValueError: error")
+
+        assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
+
+
+@pytest.mark.integration
+def test_truss_with_user_errors():
+    """Test that user-code raised `fastapi.HTTPExceptions` are passed through as is."""
+    model = """
+import fastapi
+
+class Model:
+    def predict(self, request):
+        raise fastapi.HTTPException(status_code=500, detail="My custom message.")
+"""
+
+    with ensure_kill_all(), _temp_truss(model) as tr:
         container = tr.docker_run(
             local_port=8090, detach=True, wait_for_server_ready=True
         )
-        truss_server_addr = "http://localhost:8090"
-        full_url = f"{truss_server_addr}/v1/models/model:predict"

-        response = requests.post(
+        response = requests.post(PREDICT_URL, json={})
         assert response.status_code == 500
         assert "error" in response.json()
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
+
+        _assert_logs_contain_error(
+            container.logs(),
+            "HTTPException: 500: My custom message.",
+            "Model raised HTTPException",
+        )
+
+        assert "My custom message." in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
+
+
+@pytest.mark.integration
+def test_truss_with_error_stacktrace(test_data_path):
+    with ensure_kill_all():
+        truss_dir = test_data_path / "test_truss_with_error"
+        tr = TrussHandle(truss_dir)
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )

-
+        response = requests.post(PREDICT_URL, json={})
+        assert response.status_code == 500
+        assert "error" in response.json()

         assert "Internal Server Error" in response.json()["error"]
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
+
+        expected_stack_trace = (
+            "Traceback (most recent call last):\n"
+            '  File "/app/model/model.py", line 8, in predict\n'
+            "    return helpers_1.foo(123)\n"
+            '  File "/packages/helpers_1.py", line 5, in foo\n'
+            "    return helpers_2.bar(x)\n"
+            '  File "/packages/helpers_2.py", line 2, in bar\n'
+            '    raise Exception("Crashed in `bar`.")\n'
+            "Exception: Crashed in `bar`."
+        )
+        _assert_logs_contain_error(
+            container.logs(),
+            error=expected_stack_trace,
+            message="Internal Server Error",
+        )


 @pytest.mark.integration
-def test_slow_truss():
+def test_slow_truss(test_data_path):
     with ensure_kill_all():
-
-        truss_dir = truss_root / "test_data" / "server_conformance_test_truss"
+        truss_dir = test_data_path / "server_conformance_test_truss"
         tr = TrussHandle(truss_dir)

         _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=False)
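Note: each error path now also pins the x-baseten-error-source/x-baseten-error-code response headers; judging purely by these tests, "04" marks the model container as the error source and "600" a user-code exception. The `_assert_logs_contain_error` helper is likewise not shown in this hunk; a plausible shape inferred from its call sites (signature and default are assumptions):

    def _assert_logs_contain_error(logs: str, error, message="Internal Server Error"):
        # The server is expected to log both the high-level message and the
        # underlying error/stack trace for each failed request.
        assert message in logs
        assert error in logs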
@@ -765,6 +762,10 @@ def test_slow_truss():
             ready = requests.get(f"{truss_server_addr}/v1/models/model")
             assert ready.status_code == expected_code

+        def _test_is_loaded(expected_code):
+            ready = requests.get(f"{truss_server_addr}/v1/models/model/loaded")
+            assert ready.status_code == expected_code
+
         def _test_ping(expected_code):
             ping = requests.get(f"{truss_server_addr}/ping")
             assert ping.status_code == expected_code
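Note: `_test_is_loaded` probes a new /v1/models/model/loaded route alongside the existing readiness probe. A usage sketch against a locally running container (host and port taken from these tests):

    import requests

    resp = requests.get("http://localhost:8090/v1/models/model/loaded")
    # 503 while the model is still loading, 200 once load() has finished.
    print(resp.status_code)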
@@ -786,6 +787,7 @@ def test_slow_truss():
         for _ in range(LOAD_TEST_TIME):
             _test_liveness_probe(200)
             _test_readiness_probe(503)
+            _test_is_loaded(503)
             _test_ping(503)
             _test_invocations(503)
             time.sleep(1)
@@ -793,6 +795,7 @@ def test_slow_truss():
         time.sleep(LOAD_BUFFER_TIME)
         _test_liveness_probe(200)
         _test_readiness_probe(200)
+        _test_is_loaded(200)
         _test_ping(200)

         predict_call = Thread(
@@ -805,9 +808,1054 @@ def test_slow_truss():
         for _ in range(PREDICT_TEST_TIME):
             _test_liveness_probe(200)
             _test_readiness_probe(200)
+            _test_is_loaded(200)
             _test_ping(200)
             time.sleep(1)

         predict_call.join()

         _test_invocations(200)
+
+
+@pytest.mark.integration
+def test_init_environment_parameter():
+    # Test a truss deployment that is associated with an environment
+    model = """
+from typing import Optional
+class Model:
+    def __init__(self, **kwargs):
+        self._config = kwargs["config"]
+        self._environment = kwargs["environment"]
+        self.environment_name = self._environment.get("name") if self._environment else None
+
+    def load(self):
+        print(f"Executing model.load with environment: {self.environment_name}")
+
+    def predict(self, model_input):
+        return self.environment_name
+"""
+    config = "model_name: init-environment-truss"
+    with ensure_kill_all(), _temp_truss(model, config) as tr:
+        # Mimic environment changing to staging
+        staging_env = {"name": "staging"}
+        staging_env_str = json.dumps(staging_env)
+        LocalConfigHandler.set_dynamic_config("environment", staging_env_str)
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        assert "Executing model.load with environment: staging" in container.logs()
+        response = requests.post(PREDICT_URL, json={})
+        assert response.json() == "staging"
+        assert response.status_code == 200
+        container.execute(["bash", "-c", "rm -f /etc/b10_dynamic_config/environment"])
+
+    # Test a truss deployment with no associated environment
+    config = "model_name: init-no-environment-truss"
+    with ensure_kill_all(), _temp_truss(model, config) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        assert "Executing model.load with environment: None" in container.logs()
+        response = requests.post(PREDICT_URL, json={})
+        assert response.json() is None
+        assert response.status_code == 200
+
+
+@pytest.mark.integration
+def test_setup_environment():
+    # Test truss that uses setup_environment() without load()
+    model = """
+from typing import Optional
+class Model:
+    def setup_environment(self, environment: Optional[dict]):
+        print("setup_environment called with", environment)
+        self.environment_name = environment.get("name") if environment else None
+        print(f"in {self.environment_name} environment")
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        # Mimic environment changing to beta
+        beta_env = {"name": "beta"}
+        beta_env_str = json.dumps(beta_env)
+        container.execute(
+            [
+                "bash",
+                "-c",
+                f"echo '{beta_env_str}' > /etc/b10_dynamic_config/environment",
+            ]
+        )
+        time.sleep(30)
+        assert (
+            f"Executing model.setup_environment with environment: {beta_env}"
+            in container.logs()
+        )
+        single_quote_beta_env_str = beta_env_str.replace('"', "'")
+        assert (
+            f"setup_environment called with {single_quote_beta_env_str}"
+            in container.logs()
+        )
+        assert "in beta environment" in container.logs()
+        container.execute(["bash", "-c", "rm -f /etc/b10_dynamic_config/environment"])
+
+    # Test a truss that uses the environment in load()
+    model = """
+from typing import Optional
+class Model:
+    def setup_environment(self, environment: Optional[dict]):
+        print("setup_environment called with", environment)
+        self.environment_name = environment.get("name") if environment else None
+        print(f"in {self.environment_name} environment")
+
+    def load(self):
+        print("loading in environment", self.environment_name)
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        # Mimic environment changing to staging
+        staging_env = {"name": "staging"}
+        staging_env_str = json.dumps(staging_env)
+        LocalConfigHandler.set_dynamic_config("environment", staging_env_str)
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        # Don't need to wait here because we explicitly grab the environment from dynamic_config_resolver before calling user's load()
+        assert (
+            f"Executing model.setup_environment with environment: {staging_env}"
+            in container.logs()
+        )
+        single_quote_staging_env_str = staging_env_str.replace('"', "'")
+        assert (
+            f"setup_environment called with {single_quote_staging_env_str}"
+            in container.logs()
+        )
+        assert "in staging environment" in container.logs()
+        assert "loading in environment staging" in container.logs()
+        # Set environment to None
+        no_env = None
+        no_env_str = json.dumps(no_env)
+        container.execute(
+            ["bash", "-c", f"echo '{no_env_str}' > /etc/b10_dynamic_config/environment"]
+        )
+        time.sleep(30)
+        assert (
+            f"Executing model.setup_environment with environment: {no_env}"
+            in container.logs()
+        )
+        assert "setup_environment called with None" in container.logs()
+        container.execute(["bash", "-c", "rm -f /etc/b10_dynamic_config/environment"])
+
+
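Note: both environment tests communicate with the running server by writing JSON to /etc/b10_dynamic_config/environment inside the container. A minimal sketch of the reader side that would produce the observed behavior; the name echoes the `dynamic_config_resolver` mentioned in the comment above, everything else is an assumption:

    import json, pathlib
    from typing import Optional

    DYNAMIC_CONFIG_DIR = pathlib.Path("/etc/b10_dynamic_config")  # path from the tests

    def get_dynamic_config(key: str) -> Optional[str]:
        # Return the raw value for a dynamic config key, if the file exists.
        path = DYNAMIC_CONFIG_DIR / key
        return path.read_text() if path.exists() else None

    environment = json.loads(get_dynamic_config("environment") or "null")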
+@pytest.mark.integration
+def test_health_check_configuration():
+    model = """
+class Model:
+    def predict(self, model_input):
+        return model_input
+"""
+
+    config = """runtime:
+  health_checks:
+    restart_check_delay_seconds: 100
+    restart_threshold_seconds: 1700
+"""
+
+    with ensure_kill_all(), _temp_truss(model, config) as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        assert tr.spec.config.runtime.health_checks.restart_check_delay_seconds == 100
+        assert tr.spec.config.runtime.health_checks.restart_threshold_seconds == 1700
+        assert (
+            tr.spec.config.runtime.health_checks.stop_traffic_threshold_seconds is None
+        )
+
+    config = """runtime:
+  health_checks:
+    restart_check_delay_seconds: 1200
+    restart_threshold_seconds: 90
+    stop_traffic_threshold_seconds: 50
+"""
+
+    with ensure_kill_all(), _temp_truss(model, config) as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        assert tr.spec.config.runtime.health_checks.restart_check_delay_seconds == 1200
+        assert tr.spec.config.runtime.health_checks.restart_threshold_seconds == 90
+        assert tr.spec.config.runtime.health_checks.stop_traffic_threshold_seconds == 50
+
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        assert tr.spec.config.runtime.health_checks.restart_check_delay_seconds is None
+        assert tr.spec.config.runtime.health_checks.restart_threshold_seconds is None
+        assert (
+            tr.spec.config.runtime.health_checks.stop_traffic_threshold_seconds is None
+        )
+
+
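Note: the assertions pin three optional fields on `runtime.health_checks`, all defaulting to None. A sketch of what the corresponding config model might look like, assuming a pydantic-style definition (the real one lives in the truss config module and may differ):

    from typing import Optional
    import pydantic

    class HealthChecks(pydantic.BaseModel):
        # Unset values mean platform defaults apply.
        restart_check_delay_seconds: Optional[int] = None
        restart_threshold_seconds: Optional[int] = None
        stop_traffic_threshold_seconds: Optional[int] = None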
+@pytest.mark.integration
+def test_is_healthy():
+    model = """
+class Model:
+    def load(self):
+        raise Exception("not loaded")
+
+    def is_healthy(self) -> bool:
+        return True
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+
+        truss_server_addr = "http://localhost:8090"
+        for _ in range(5):
+            time.sleep(1)
+            healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+            if healthy.status_code == 503:
+                break
+            assert healthy.status_code == 200
+        assert healthy.status_code == 503
+        diff = container.diff()
+        assert "/root/inference_server_crashed.txt" in diff
+        assert diff["/root/inference_server_crashed.txt"] == "A"
+
+    model = """
+class Model:
+    def is_healthy(self, argument) -> bool:
+        pass
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1)
+        _assert_logs_contain_error(
+            container.logs(),
+            message="Exception while loading model",
+            error="`is_healthy` must have only one argument: `self`",
+        )
+
+    model = """
+class Model:
+    def is_healthy(self) -> bool:
+        raise Exception("not healthy")
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+
+        # Sleep a few seconds to give the server some time to wake up
+        time.sleep(10)
+
+        truss_server_addr = "http://localhost:8090"
+
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 503
+        assert (
+            "Exception while checking if model is healthy: not healthy"
+            in container.logs()
+        )
+        assert "Health check failed." in container.logs()
+
+    model = """
+import time
+
+class Model:
+    def load(self):
+        time.sleep(10)
+
+    def is_healthy(self) -> bool:
+        return False
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        truss_server_addr = "http://localhost:8090"
+
+        time.sleep(5)
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 503
+        # Ensure we only log after model.load is complete
+        assert "Health check failed." not in container.logs()
+
+        # Sleep a few seconds to give the server some time to wake up
+        time.sleep(10)
+
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 503
+        assert container.logs().count("Health check failed.") == 1
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 503
+        assert container.logs().count("Health check failed.") == 2
+
+    model = """
+import time
+
+class Model:
+    def __init__(self, **kwargs):
+        self._healthy = False
+
+    def load(self):
+        time.sleep(10)
+        self._healthy = True
+
+    def is_healthy(self):
+        return self._healthy
+
+    def predict(self, model_input):
+        self._healthy = model_input["healthy"]
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=False)
+        time.sleep(5)
+        truss_server_addr = "http://localhost:8090"
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 503
+        time.sleep(10)
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 200
+
+        healthy_responses = [True, "yessss", 34, {"woo": "hoo"}]
+        for response in healthy_responses:
+            predict_response = requests.post(PREDICT_URL, json={"healthy": response})
+            assert predict_response.status_code == 200
+            healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+            assert healthy.status_code == 200
+
+        not_healthy_responses = [False, "", 0, {}]
+        for response in not_healthy_responses:
+            predict_response = requests.post(PREDICT_URL, json={"healthy": response})
+            assert predict_response.status_code == 200
+            healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+            assert healthy.status_code == 503
+
+    model = """
+class Model:
+    def is_healthy(self) -> bool:
+        return True
+
+    def predict(self, model_input):
+        return model_input
+"""
+    with ensure_kill_all(), _temp_truss(model, "") as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        truss_server_addr = "http://localhost:8090"
+
+        healthy = requests.get(f"{truss_server_addr}/v1/models/model")
+        assert healthy.status_code == 200
+
+
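Note: taken together, these cases imply a simple health gate: any truthy return from the user's `is_healthy()` keeps the health route at 200, while a falsy return, a raised exception, or an unfinished `load()` yields 503. A sketch of that assumed server-side logic:

    def check_health(model) -> bool:
        # Assumed shape; per the tests, the real server also defers checks until
        # load() completes and logs "Health check failed." on each falsy result.
        try:
            return bool(model.is_healthy())
        except Exception:
            # Surfaces as "Exception while checking if model is healthy: ..."
            return False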
+def _patch_termination_timeout(container: Container, seconds: int, truss_container_fs):
+    app_path = truss_container_fs / "app"
+    sys.path.append(str(app_path))
+    import truss_server
+
+    local_server_source = pathlib.Path(truss_server.__file__)
+    container_server_source = "/app/truss_server.py"
+    modified_content = local_server_source.read_text().replace(
+        "TIMEOUT_GRACEFUL_SHUTDOWN = 120", f"TIMEOUT_GRACEFUL_SHUTDOWN = {seconds}"
+    )
+    with tempfile.NamedTemporaryFile() as patched_file:
+        patched_file.write(modified_content.encode("utf-8"))
+        patched_file.flush()
+        container.copy_to(patched_file.name, container_server_source)
+
+
+@pytest.mark.anyio
+@pytest.mark.integration
+async def test_graceful_shutdown(truss_container_fs):
+    model = """
+import time
+class Model:
+    def predict(self, request):
+        print(f"Received {request}")
+        time.sleep(request["seconds"])
+        print(f"Done {request}")
+        return request
+"""
+
+    async def predict_request(data: dict):
+        async with httpx.AsyncClient() as client:
+            response = await client.post(PREDICT_URL, json=data)
+            response.raise_for_status()
+            return response.json()
+
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        await predict_request({"seconds": 0, "task": 0})  # Warm up server.
+
+        # Test starting two requests, each taking 2 seconds, then terminating server.
+        # They should both finish successfully since the server grace period is 120 s.
+        task_0 = asyncio.create_task(predict_request({"seconds": 2, "task": 0}))
+        await asyncio.sleep(0.1)  # Yield to event loop to make above task run.
+        task_1 = asyncio.create_task(predict_request({"seconds": 2, "task": 1}))
+        await asyncio.sleep(0.1)  # Yield to event loop to make above task run.
+
+        t0 = time.perf_counter()
+        # Even though the server has a 120 s grace period, we expect to finish much
+        # faster in the test here, so use 10 s.
+        container.stop(10)
+        stop_time = time.perf_counter() - t0
+        print(f"Stopped in {stop_time} seconds.")
+
+        assert 3 < stop_time < 5
+        assert (await task_0) == {"seconds": 2, "task": 0}
+        assert (await task_1) == {"seconds": 2, "task": 1}
+
+        # Now mess around in the docker container to reduce the grace period to 3 s.
+        # (There's no nice way to patch this...)
+        _patch_termination_timeout(container, 3, truss_container_fs)
+        # Now only one request should complete.
+        container.restart()
+        wait_for_truss("http://localhost:8090", container, True)
+        await predict_request({"seconds": 0, "task": 0})  # Warm up server.
+
+        task_2 = asyncio.create_task(predict_request({"seconds": 2, "task": 2}))
+        await asyncio.sleep(0.1)  # Yield to event loop to make above task run.
+        task_3 = asyncio.create_task(predict_request({"seconds": 2, "task": 3}))
+        await asyncio.sleep(0.1)  # Yield to event loop to make above task run.
+        t0 = time.perf_counter()
+        container.stop(10)
+        stop_time = time.perf_counter() - t0
+        print(f"Stopped in {stop_time} seconds.")
+        assert 3 < stop_time < 5
+        assert (await task_2) == {"seconds": 2, "task": 2}
+        with pytest.raises(httpx.HTTPStatusError):
+            await task_3
+
+
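Note: the asserted 3-5 second stop window is consistent with the two staggered 2-second requests being drained largely back-to-back before the server exits. If the server is uvicorn-based, TIMEOUT_GRACEFUL_SHUTDOWN plausibly maps onto uvicorn's own shutdown timeout; a hedged sketch (the parameter name comes from uvicorn's public Config, the wiring is an assumption):

    import uvicorn

    # Sketch: a 120 s grace period for in-flight requests on SIGTERM.
    config = uvicorn.Config("app:app", port=8090, timeout_graceful_shutdown=120)
    server = uvicorn.Server(config)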
+# Tracing ##############################################################################
+
+
+def _make_otel_headers() -> Mapping[str, str]:
+    """
+    Create and return a mapping with OpenTelemetry trace context headers.
+
+    This function starts a new span and injects the trace context into the headers,
+    which can be used to propagate tracing information in outgoing HTTP requests.
+
+    Returns:
+        Mapping[str, str]: A mapping containing the trace context headers.
+    """
+    # Initialize a tracer
+    tracer = trace.get_tracer(__name__)
+
+    # Create a dictionary to hold the headers
+    headers: dict[str, str] = {}
+
+    # Start a new span
+    with tracer.start_as_current_span("outgoing-request-span"):
+        # Use the TraceContextTextMapPropagator to inject the trace context into the headers
+        propagator = tracecontext.TraceContextTextMapPropagator()
+        propagator.inject(headers, context=context.get_current())
+
+    return headers
+
+
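Note: a quick check of what the helper yields; the W3C trace-context propagator writes a traceparent entry (and possibly tracestate):

    headers = _make_otel_headers()
    assert "traceparent" in headers  # e.g. "00-<trace_id>-<span_id>-01"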
+@pytest.mark.integration
+@pytest.mark.parametrize("enable_tracing_data", [True, False])
+def test_streaming_truss_with_user_tracing(test_data_path, enable_tracing_data):
+    with ensure_kill_all():
+        truss_dir = test_data_path / "test_streaming_truss_with_tracing"
+        tr = TrussHandle(truss_dir)
+
+        def enable_gpu_fn(conf):
+            new_runtime = dataclasses.replace(
+                conf.runtime, enable_tracing_data=enable_tracing_data
+            )
+            return dataclasses.replace(conf, runtime=new_runtime)
+
+        tr._update_config(enable_gpu_fn)
+
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+
+        # A request for which response is not completely read
+        headers_0 = _make_otel_headers()
+        predict_response = requests.post(
+            PREDICT_URL, json={}, stream=True, headers=headers_0
+        )
+        # We just read the first part and leave it hanging here
+        next(predict_response.iter_content())
+
+        headers_1 = _make_otel_headers()
+        predict_response = requests.post(
+            PREDICT_URL, json={}, stream=True, headers=headers_1
+        )
+        assert predict_response.headers.get("transfer-encoding") == "chunked"
+
+        # When accept is set to application/json, the response is not streamed.
+        headers_2 = _make_otel_headers()
+        predict_non_stream_response = requests.post(
+            PREDICT_URL,
+            json={},
+            stream=True,
+            headers={**headers_2, "accept": "application/json"},
+        )
+        assert "transfer-encoding" not in predict_non_stream_response.headers
+        assert predict_non_stream_response.json() == "01234"
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            truss_traces_file = pathlib.Path(tmp_dir) / "otel_traces.ndjson"
+            container.copy_from("/tmp/otel_traces.ndjson", truss_traces_file)
+            truss_traces = [
+                json.loads(s) for s in truss_traces_file.read_text().splitlines()
+            ]
+
+            user_traces_file = pathlib.Path(tmp_dir) / "otel_user_traces.ndjson"
+            container.copy_from("/tmp/otel_user_traces.ndjson", user_traces_file)
+            user_traces = [
+                json.loads(s) for s in user_traces_file.read_text().splitlines()
+            ]
+
+        if not enable_tracing_data:
+            assert len(truss_traces) == 0
+            assert len(user_traces) > 0
+            return
+
+        assert sum(1 for x in truss_traces if x["name"] == "predict-endpoint") == 3
+        assert sum(1 for x in user_traces if x["name"] == "load_model") == 1
+        assert sum(1 for x in user_traces if x["name"] == "predict") == 3
+
+        user_parents = set(x["parent_id"] for x in user_traces)
+        truss_spans = set(x["context"]["span_id"] for x in truss_traces)
+        truss_parents = set(x["parent_id"] for x in truss_traces)
+        # Make sure there is no context creep into user traces. No user trace should
+        # have a truss trace as parent.
+        assert user_parents & truss_spans == set()
+        # But make sure traces have parents at all.
+        assert len(user_parents) > 3
+        assert len(truss_parents) > 3
+
+
+# Returning Response Objects ###########################################################
+
+
+@pytest.mark.integration
+def test_truss_with_response():
+    """Test that user-code can set a custom status code."""
+    model = """
+from fastapi.responses import Response
+
+class Model:
+    def predict(self, inputs):
+        return Response(status_code=inputs["code"])
+"""
+    from fastapi import status
+
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        response = requests.post(PREDICT_URL, json={"code": status.HTTP_204_NO_CONTENT})
+        assert response.status_code == 204
+        assert "x-baseten-error-source" not in response.headers
+        assert "x-baseten-error-code" not in response.headers
+
+        response = requests.post(
+            PREDICT_URL, json={"code": status.HTTP_500_INTERNAL_SERVER_ERROR}
+        )
+        assert response.status_code == 500
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "700"
+
+
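Note: a user-returned 5xx Response is tagged with error code "700", whereas exceptions raised in user code were tagged "600" earlier in this file. A client-side triage sketch built only on the codes asserted in these tests (the mapping is illustrative, not documented API):

    def classify_error(response):
        if response.headers.get("x-baseten-error-source") != "04":
            return None  # not attributed to the model container
        code = response.headers.get("x-baseten-error-code")
        return {"600": "user code raised", "700": "user code returned 5xx"}.get(code)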
+@pytest.mark.integration
+def test_truss_with_streaming_response():
+    # TODO: one issue with this is that (unlike our "builtin" streaming), this keeps
+    # the semaphore claimed potentially longer if the client drops.
+
+    model = """from starlette.responses import StreamingResponse
+class Model:
+    def predict(self, model_input):
+        def text_generator():
+            for i in range(3):
+                yield f"data: {i}\\n\\n"
+        return StreamingResponse(text_generator(), media_type="text/event-stream")
+"""
+
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        # A request for which response is not completely read.
+        predict_response = requests.post(PREDICT_URL, json={}, stream=True)
+        assert (
+            predict_response.headers["Content-Type"]
+            == "text/event-stream; charset=utf-8"
+        )
+
+        lines = predict_response.text.strip().split("\n")
+        assert lines == ["data: 0", "", "data: 1", "", "data: 2"]
+
+
+# Using Request in Model ###############################################################
+
+
+@pytest.mark.integration
+def test_truss_with_request():
+    model = """
+import fastapi
+class Model:
+    async def preprocess(self, request: fastapi.Request):
+        return await request.json()
+
+    async def predict(self, inputs, request: fastapi.Request):
+        inputs["request_size"] = len(await request.body())
+        return inputs
+
+    def postprocess(self, inputs):
+        return {**inputs, "postprocess": "was here"}
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        response = requests.post(PREDICT_URL, json={"test": 123})
+        assert response.status_code == 200
+        assert response.json() == {
+            "test": 123,
+            "request_size": 13,
+            "postprocess": "was here",
+        }
+
+
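Note: the failure cases below outline which `predict` shapes the server accepts. A summary sketch (the comments only restate the asserted error messages, they are not the validator itself):

    import fastapi

    class Model:
        # Accepted: predict(self, inputs)
        # Accepted: predict(self, inputs, request: fastapi.Request)
        # Accepted: predict(self, request: fastapi.Request)  # request-only
        # Rejected: request first with a second argument, or three-plus arguments.
        async def predict(self, inputs, request: fastapi.Request):
            return inputs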
+@pytest.mark.integration
+def test_truss_with_requests_and_invalid_signatures():
+    model = """
+class Model:
+    def predict(self, inputs, invalid_arg): ...
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1.0)  # Wait for logs.
+        _assert_logs_contain_error(
+            container.logs(),
+            "`predict` method with two arguments must have request as second argument",
+            "Exception while loading model",
+        )
+
+    model = """
+import fastapi
+
+class Model:
+    def predict(self, request: fastapi.Request, invalid_arg): ...
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1.0)  # Wait for logs.
+        _assert_logs_contain_error(
+            container.logs(),
+            "`predict` method with two arguments is not allowed to have request as "
+            "first argument",
+            "Exception while loading model",
+        )
+
+    model = """
+import fastapi
+
+class Model:
+    def predict(self, inputs, request: fastapi.Request, something): ...
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1.0)  # Wait for logs.
+        _assert_logs_contain_error(
+            container.logs(),
+            "`predict` method cannot have more than two arguments",
+            "Exception while loading model",
+        )
+
+
+@pytest.mark.integration
+def test_truss_with_requests_and_invalid_argument_combinations():
+    model = """
+import fastapi
+class Model:
+    async def preprocess(self, inputs): ...
+
+    def predict(self, request: fastapi.Request): ...
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1.0)  # Wait for logs.
+        _assert_logs_contain_error(
+            container.logs(),
+            "When using `preprocess`, the predict method cannot only have the request argument",
+            "Exception while loading model",
+        )
+
+    model = """
+import fastapi
+class Model:
+    def preprocess(self, inputs): ...
+
+    async def predict(self, inputs, request: fastapi.Request): ...
+
+    def postprocess(self, request: fastapi.Request): ...
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1.0)  # Wait for logs.
+        _assert_logs_contain_error(
+            container.logs(),
+            "The `postprocess` method cannot only have the request argument",
+            "Exception while loading model",
+        )
+
+    model = """
+import fastapi
+class Model:
+    def preprocess(self, inputs): ...
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=False
+        )
+        time.sleep(1.0)  # Wait for logs.
+        _assert_logs_contain_error(
+            container.logs(),
+            "Truss model must have a `predict` method.",
+            "Exception while loading model",
+        )
+
+
+@pytest.mark.integration
+def test_truss_forbid_postprocessing_with_response():
+    model = """
+import fastapi, json
+class Model:
+    def predict(self, inputs):
+        return fastapi.Response(content=json.dumps(inputs), status_code=200)
+
+    def postprocess(self, inputs):
+        return inputs
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+
+        response = requests.post(PREDICT_URL, json={})
+        assert response.status_code == 500
+        assert response.headers["x-baseten-error-source"] == "04"
+        assert response.headers["x-baseten-error-code"] == "600"
+        _assert_logs_contain_error(
+            container.logs(),
+            "If the predict function returns a response object, you cannot "
+            "use postprocessing.",
+        )
+
+
+@pytest.mark.integration
+def test_async_streaming_with_cancellation():
+    model = """
+import fastapi, asyncio, logging
+
+class Model:
+    async def predict(self, inputs, request: fastapi.Request):
+        await asyncio.sleep(1)
+        if await request.is_disconnected():
+            logging.warning("Cancelled (before gen).")
+            return
+
+        for i in range(5):
+            await asyncio.sleep(1.0)
+            logging.warning(i)
+            yield str(i)
+            if await request.is_disconnected():
+                logging.warning("Cancelled (during gen).")
+                return
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        # For hard cancellation we need to use httpx; requests' timeouts don't work.
+        with pytest.raises(httpx.ReadTimeout):
+            with httpx.Client(
+                timeout=httpx.Timeout(1.0, connect=1.0, read=1.0)
+            ) as client:
+                response = client.post(PREDICT_URL, json={}, timeout=1.0)
+                response.raise_for_status()
+
+        time.sleep(2)  # Wait a bit to get all logs.
+        assert "Cancelled (during gen)." in container.logs()
+
+
+@pytest.mark.integration
+def test_async_non_streaming_with_cancellation():
+    model = """
+import fastapi, asyncio, logging
+
+class Model:
+    async def predict(self, inputs, request: fastapi.Request):
+        logging.info("Start sleep")
+        await asyncio.sleep(2)
+        logging.info("done sleep, check request.")
+        if await request.is_disconnected():
+            logging.warning("Cancelled (before gen).")
+            return
+        logging.info("Not cancelled.")
+        return "Done"
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        container = tr.docker_run(
+            local_port=8090, detach=True, wait_for_server_ready=True
+        )
+        # For hard cancellation we need to use httpx; requests' timeouts don't work.
+        with pytest.raises(httpx.ReadTimeout):
+            with httpx.Client(
+                timeout=httpx.Timeout(1.0, connect=1.0, read=1.0)
+            ) as client:
+                response = client.post(PREDICT_URL, json={}, timeout=1.0)
+                response.raise_for_status()
+
+        time.sleep(2)  # Wait a bit to get all logs.
+        assert "Cancelled (before gen)." in container.logs()
+
+
+@pytest.mark.integration
+def test_limit_concurrency_with_sse():
+    # It seems that the "builtin" functionality of the FastAPI server already buffers
+    # the generator, so that it doesn't keep hanging around if the client doesn't
+    # consume data. `_buffered_response_generator` might be redundant.
+    # This can be observed by waiting for a long time in `make_request`: the server will
+    # print `Done` for the tasks, while we still wait and hold the unconsumed response.
+    # For testing we need to have actually slow generation to keep the server busy.
+    model = """
+import asyncio
+
+class Model:
+    async def predict(self, request):
+        print(f"Starting {request}")
+        for i in range(5):
+            await asyncio.sleep(0.1)
+            yield str(i)
+        print(f"Done {request}")
+"""
+
+    config = """runtime:
+  predict_concurrency: 2"""
+
+    def make_request(consume_chunks, timeout, task_id):
+        t0 = time.time()
+        with httpx.Client() as client:
+            with client.stream(
+                "POST", PREDICT_URL, json={"task_id": task_id}
+            ) as response:
+                assert response.status_code == 200
+                if consume_chunks:
+                    chunks = [chunk for chunk in response.iter_text()]
+                    print(f"consumed chunks ({task_id}): {chunks}")
+                    assert len(chunks) > 0
+                    t1 = time.time()
+                    if t1 - t0 > timeout:
+                        raise httpx.ReadTimeout("Timeout")
+                    return chunks
+                else:
+                    print(f"waiting ({task_id})")
+                    time.sleep(0.5)  # Hold the connection.
+                    print(f"waiting done ({task_id})")
+
+    with ensure_kill_all(), _temp_truss(model, config) as tr:
+        _ = tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+        # Processing a full request takes 0.5 s.
+        print("Make warmup request")
+        make_request(consume_chunks=True, timeout=0.55, task_id=0)
+
+        with ThreadPoolExecutor() as executor:
+            # Start two requests and hold them without consuming all chunks.
+            # Each takes 0.5 s. Semaphore should be claimed, with 0 remaining.
+            print("Start two tasks.")
+            task1 = executor.submit(make_request, False, 0.55, 1)
+            task2 = executor.submit(make_request, False, 0.55, 2)
+            print("Wait for tasks to start.")
+            time.sleep(0.05)
+            print("Make a request while server is busy.")
+            with pytest.raises(httpx.ReadTimeout):
+                make_request(True, timeout=0.55, task_id=3)
+
+            task1.result()
+            task2.result()
+            print("Task 1 and 2 completed. Server should be free again.")
+
+            result = make_request(True, timeout=0.55, task_id=4)
+            print(f"Final chunks: {result}")
+
+
+@pytest.mark.integration
+def test_custom_openai_endpoints():
+    """
+    Test a Truss that exposes an OpenAI compatible endpoint.
+    """
+    model = """
+from typing import Dict
+
+class Model:
+    def __init__(self):
+        pass
+
+    def load(self):
+        self._predict_count = 0
+        self._completions_count = 0
+
+    async def predict(self, inputs: Dict) -> int:
+        self._predict_count += inputs["increment"]
+        return self._predict_count
+
+    async def completions(self, inputs: Dict) -> int:
+        self._completions_count += inputs["increment"]
+        return self._completions_count
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        response = requests.post(PREDICT_URL, json={"increment": 1})
+        assert response.status_code == 200
+        assert response.json() == 1
+
+        response = requests.post(COMPLETIONS_URL, json={"increment": 2})
+        assert response.status_code == 200
+        assert response.json() == 2
+
+        response = requests.post(CHAT_COMPLETIONS_URL, json={"increment": 3})
+        assert response.status_code == 404
+
+
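Note: COMPLETIONS_URL and CHAT_COMPLETIONS_URL are not defined in this hunk. Plausible values, mirroring PREDICT_URL's host/port and OpenAI's route conventions (assumed, not confirmed by the diff):

    COMPLETIONS_URL = "http://localhost:8090/v1/completions"
    CHAT_COMPLETIONS_URL = "http://localhost:8090/v1/chat/completions"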
+@pytest.mark.integration
+def test_postprocess_async_generator_streaming():
+    """
+    Test a Truss whose `postprocess` is an async generator that streams results.
+    """
+    model = """
+from typing import Dict, List, Generator
+
+class Model:
+    def __init__(self):
+        pass
+
+    def load(self):
+        pass
+
+    async def predict(self, inputs: Dict) -> List[str]:
+        nums: List[int] = inputs["nums"]
+        return nums
+
+    async def postprocess(self, nums: List[str]) -> Generator[str, None, None]:
+        for num in nums:
+            yield num
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        response = requests.post(PREDICT_URL, json={"nums": ["1", "2"]}, stream=True)
+        assert response.headers.get("transfer-encoding") == "chunked"
+        assert [
+            byte_string.decode() for byte_string in list(response.iter_content())
+        ] == ["1", "2"]
+
+
+@pytest.mark.integration
+def test_preprocess_async_generator():
+    """
+    Test a Truss whose `preprocess` is an async generator feeding `predict`.
+    """
+    model = """
+from typing import Dict, List, AsyncGenerator
+
+class Model:
+    def __init__(self):
+        pass
+
+    def load(self):
+        pass
+
+    async def preprocess(self, inputs: Dict) -> AsyncGenerator[str, None]:
+        for num in inputs["nums"]:
+            yield num
+
+    async def predict(self, nums: AsyncGenerator[str, None]) -> List[str]:
+        return [num async for num in nums]
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        response = requests.post(PREDICT_URL, json={"nums": ["1", "2"]})
+        assert response.status_code == 200
+        assert response.json() == ["1", "2"]
+
+
+@pytest.mark.integration
+def test_openai_client_streaming():
+    """
+    Test a Truss that streams chat completions to an OpenAI-style client.
+    """
+    model = """
+from typing import Dict, AsyncGenerator
+
+class Model:
+    def __init__(self):
+        pass
+
+    def load(self):
+        pass
+
+    async def chat_completions(self, inputs: Dict) -> AsyncGenerator[str, None]:
+        for num in inputs["nums"]:
+            yield num
+
+    async def predict(self, inputs: Dict):
+        pass
+"""
+    with ensure_kill_all(), _temp_truss(model) as tr:
+        tr.docker_run(local_port=8090, detach=True, wait_for_server_ready=True)
+
+        response = requests.post(
+            CHAT_COMPLETIONS_URL,
+            json={"nums": ["1", "2"]},
+            stream=True,
+            # Despite requesting json, we should still stream results back.
+            headers={
+                "accept": "application/json",
+                "user-agent": "OpenAI/Python 1.61.0",
+            },
+        )
+        assert response.headers.get("transfer-encoding") == "chunked"
+        assert [
+            byte_string.decode() for byte_string in list(response.iter_content())
+        ] == ["1", "2"]