PyPI - llama-stack - Versions diffs - 0.3.0__tar.gz → 0.3.2__tar.gz - Mend

llama-stack 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (634) hide show

{llama_stack-0.3.0/llama_stack.egg-info → llama_stack-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama_stack
-Version: 0.3.0
+Version: 0.3.2
 Summary: Llama Stack
 Author-email: Meta Llama <llama-oss@meta.com>
 License: MIT
@@ -22,7 +22,7 @@ Requires-Dist: fire
 Requires-Dist: httpx
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: jsonschema
-Requires-Dist: llama-stack-client>=0.3.0
+Requires-Dist: llama-stack-client>=0.3.2
 Requires-Dist: openai>=1.107
 Requires-Dist: prompt-toolkit
 Requires-Dist: python-dotenv
@@ -44,7 +44,7 @@ Requires-Dist: sqlalchemy[asyncio]>=2.0.41
 Provides-Extra: ui
 Requires-Dist: streamlit; extra == "ui"
 Requires-Dist: pandas; extra == "ui"
-Requires-Dist: llama-stack-client>=0.3.0; extra == "ui"
+Requires-Dist: llama-stack-client>=0.3.2; extra == "ui"
 Requires-Dist: streamlit-option-menu; extra == "ui"
 Dynamic: license-file

{llama_stack-0.3.0 → llama_stack-0.3.2}/llama_stack/cli/stack/list_deps.py RENAMED Viewed

@@ -46,6 +46,10 @@ class StackListDeps(Subcommand):
     def _run_stack_list_deps_command(self, args: argparse.Namespace) -> None:
         # always keep implementation completely silo-ed away from CLI so CLI
         # can be fast to load and reduces dependencies
+        if not args.config and not args.providers:
+            self.parser.print_help()
+            self.parser.exit()
         from ._list_deps import run_stack_list_deps_command
         return run_stack_list_deps_command(args)

{llama_stack-0.3.0 → llama_stack-0.3.2}/llama_stack/cli/stack/run.py RENAMED Viewed

@@ -106,7 +106,8 @@ class StackRun(Subcommand):
             try:
                 config = parse_and_maybe_upgrade_config(config_dict)
-                if not os.path.exists(str(config.external_providers_dir)):
+                # Create external_providers_dir if it's specified and doesn't exist
+                if config.external_providers_dir and not os.path.exists(str(config.external_providers_dir)):
                     os.makedirs(str(config.external_providers_dir), exist_ok=True)
             except AttributeError as e:
                 self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
@@ -127,7 +128,7 @@ class StackRun(Subcommand):
             config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
         port = args.port or config.server.port
-        host = config.server.host or ["::", "0.0.0.0"]
+        host = config.server.host or "0.0.0.0"
         # Set the config file in environment so create_app can find it
         os.environ["LLAMA_STACK_CONFIG"] = str(config_file)
@@ -139,6 +140,7 @@ class StackRun(Subcommand):
             "lifespan": "on",
             "log_level": logger.getEffectiveLevel(),
             "log_config": logger_config,
+            "workers": config.server.workers,
         }
         keyfile = config.server.tls_keyfile

{llama_stack-0.3.0 → llama_stack-0.3.2}/llama_stack/core/configure.py RENAMED Viewed

@@ -17,7 +17,6 @@ from llama_stack.core.distribution import (
     get_provider_registry,
 )
 from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
-from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
 from llama_stack.log import get_logger
@@ -194,19 +193,11 @@ def upgrade_from_routing_table(
 def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
-    version = config_dict.get("version", None)
-    if version == LLAMA_STACK_RUN_CONFIG_VERSION:
-        processed_config_dict = replace_env_vars(config_dict)
-        return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
     if "routing_table" in config_dict:
         logger.info("Upgrading config...")
         config_dict = upgrade_from_routing_table(config_dict)
     config_dict["version"] = LLAMA_STACK_RUN_CONFIG_VERSION
-    if not config_dict.get("external_providers_dir", None):
-        config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR
     processed_config_dict = replace_env_vars(config_dict)
     return StackRunConfig(**cast_image_name_to_string(processed_config_dict))

{llama_stack-0.3.0 → llama_stack-0.3.2}/llama_stack/core/datatypes.py RENAMED Viewed

@@ -471,6 +471,10 @@ class ServerConfig(BaseModel):
         "- true: Enable localhost CORS for development\n"
         "- {allow_origins: [...], allow_methods: [...], ...}: Full configuration",
     )
+    workers: int = Field(
+        default=1,
+        description="Number of workers to use for the server",
+    )
 class StackRunConfig(BaseModel):

{llama_stack-0.3.0 → llama_stack-0.3.2}/llama_stack/core/routers/inference.py RENAMED Viewed

@@ -105,7 +105,8 @@ class InferenceRouter(Inference):
         prompt_tokens: int,
         completion_tokens: int,
         total_tokens: int,
-        model: Model,
+        fully_qualified_model_id: str,
+        provider_id: str,
     ) -> list[MetricEvent]:
         """Constructs a list of MetricEvent objects containing token usage metrics.
@@ -113,7 +114,8 @@ class InferenceRouter(Inference):
             prompt_tokens: Number of tokens in the prompt
             completion_tokens: Number of tokens in the completion
             total_tokens: Total number of tokens used
-            model: Model object containing model_id and provider_id
+            fully_qualified_model_id:
+            provider_id: The provider identifier
         Returns:
             List of MetricEvent objects with token usage metrics
@@ -139,8 +141,8 @@ class InferenceRouter(Inference):
                     timestamp=datetime.now(UTC),
                     unit="tokens",
                     attributes={
-                        "model_id": model.model_id,
-                        "provider_id": model.provider_id,
+                        "model_id": fully_qualified_model_id,
+                        "provider_id": provider_id,
                     },
                 )
             )
@@ -153,7 +155,9 @@ class InferenceRouter(Inference):
         total_tokens: int,
         model: Model,
     ) -> list[MetricInResponse]:
-        metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
+        metrics = self._construct_metrics(
+            prompt_tokens, completion_tokens, total_tokens, model.model_id, model.provider_id
+        )
         if self.telemetry:
             for metric in metrics:
                 enqueue_event(metric)
@@ -173,14 +177,25 @@ class InferenceRouter(Inference):
             encoded = self.formatter.encode_content(messages)
         return len(encoded.tokens) if encoded and encoded.tokens else 0
-    async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
-        """takes a model id and gets model after ensuring that it is accessible and of the correct type"""
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
+    async def _get_model_provider(self, model_id: str, expected_model_type: str) -> tuple[Inference, str]:
+        model = await self.routing_table.get_object_by_identifier("model", model_id)
+        if model:
+            if model.model_type != expected_model_type:
+                raise ModelTypeError(model_id, model.model_type, expected_model_type)
+            provider = await self.routing_table.get_provider_impl(model.identifier)
+            return provider, model.provider_resource_id
+        splits = model_id.split("/", maxsplit=1)
+        if len(splits) != 2:
+            raise ModelNotFoundError(model_id)
+        provider_id, provider_resource_id = splits
+        if provider_id not in self.routing_table.impls_by_provider_id:
+            logger.warning(f"Provider {provider_id} not found for model {model_id}")
             raise ModelNotFoundError(model_id)
-        if model.model_type != expected_model_type:
-            raise ModelTypeError(model_id, model.model_type, expected_model_type)
-        return model
+        return self.routing_table.impls_by_provider_id[provider_id], provider_resource_id
     async def openai_completion(
         self,
@@ -189,24 +204,24 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_completion: model={params.model}, stream={params.stream}, prompt={params.prompt}",
         )
-        model_obj = await self._get_model(params.model, ModelType.llm)
-        # Update params with the resolved model identifier
-        params.model = model_obj.identifier
+        request_model_id = params.model
+        provider, provider_resource_id = await self._get_model_provider(params.model, ModelType.llm)
+        params.model = provider_resource_id
-        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
         if params.stream:
             return await provider.openai_completion(params)
             # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
             # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
         response = await provider.openai_completion(params)
+        response.model = request_model_id
         if self.telemetry:
             metrics = self._construct_metrics(
                 prompt_tokens=response.usage.prompt_tokens,
                 completion_tokens=response.usage.completion_tokens,
                 total_tokens=response.usage.total_tokens,
-                model=model_obj,
+                fully_qualified_model_id=request_model_id,
+                provider_id=provider.__provider_id__,
             )
             for metric in metrics:
                 enqueue_event(metric)
@@ -224,7 +239,9 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_chat_completion: model={params.model}, stream={params.stream}, messages={params.messages}",
         )
-        model_obj = await self._get_model(params.model, ModelType.llm)
+        request_model_id = params.model
+        provider, provider_resource_id = await self._get_model_provider(params.model, ModelType.llm)
+        params.model = provider_resource_id
         # Use the OpenAI client for a bit of extra input validation without
         # exposing the OpenAI client itself as part of our API surface
@@ -242,10 +259,6 @@ class InferenceRouter(Inference):
             params.tool_choice = None
             params.tools = None
-        # Update params with the resolved model identifier
-        params.model = model_obj.identifier
-        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
         if params.stream:
             response_stream = await provider.openai_chat_completion(params)
@@ -253,11 +266,13 @@ class InferenceRouter(Inference):
             # We need to add metrics to each chunk and store the final completion
             return self.stream_tokens_and_compute_metrics_openai_chat(
                 response=response_stream,
-                model=model_obj,
+                fully_qualified_model_id=request_model_id,
+                provider_id=provider.__provider_id__,
                 messages=params.messages,
             )
         response = await self._nonstream_openai_chat_completion(provider, params)
+        response.model = request_model_id
         # Store the response with the ID that will be returned to the client
         if self.store:
@@ -268,7 +283,8 @@ class InferenceRouter(Inference):
                 prompt_tokens=response.usage.prompt_tokens,
                 completion_tokens=response.usage.completion_tokens,
                 total_tokens=response.usage.total_tokens,
-                model=model_obj,
+                fully_qualified_model_id=request_model_id,
+                provider_id=provider.__provider_id__,
             )
             for metric in metrics:
                 enqueue_event(metric)
@@ -285,13 +301,13 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_embeddings: model={params.model}, input_type={type(params.input)}, encoding_format={params.encoding_format}, dimensions={params.dimensions}",
         )
-        model_obj = await self._get_model(params.model, ModelType.embedding)
-        # Update model to use resolved identifier
-        params.model = model_obj.identifier
+        request_model_id = params.model
+        provider, provider_resource_id = await self._get_model_provider(params.model, ModelType.embedding)
+        params.model = provider_resource_id
-        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
-        return await provider.openai_embeddings(params)
+        response = await provider.openai_embeddings(params)
+        response.model = request_model_id
+        return response
     async def list_chat_completions(
         self,
@@ -347,7 +363,8 @@ class InferenceRouter(Inference):
         self,
         response,
         prompt_tokens,
-        model,
+        fully_qualified_model_id: str,
+        provider_id: str,
         tool_prompt_format: ToolPromptFormat | None = None,
     ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
         completion_text = ""
@@ -385,7 +402,8 @@ class InferenceRouter(Inference):
                         prompt_tokens=prompt_tokens,
                         completion_tokens=completion_tokens,
                         total_tokens=total_tokens,
-                        model=model,
+                        fully_qualified_model_id=fully_qualified_model_id,
+                        provider_id=provider_id,
                     )
                     for metric in completion_metrics:
                         if metric.metric in [
@@ -405,7 +423,8 @@ class InferenceRouter(Inference):
                         prompt_tokens or 0,
                         completion_tokens or 0,
                         total_tokens,
-                        model,
+                        fully_qualified_model_id=fully_qualified_model_id,
+                        provider_id=provider_id,
                     )
                     async_metrics = [
                         MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
@@ -417,7 +436,8 @@ class InferenceRouter(Inference):
         self,
         response: ChatCompletionResponse | CompletionResponse,
         prompt_tokens,
-        model,
+        fully_qualified_model_id: str,
+        provider_id: str,
         tool_prompt_format: ToolPromptFormat | None = None,
     ):
         if isinstance(response, ChatCompletionResponse):
@@ -434,7 +454,8 @@ class InferenceRouter(Inference):
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
-                model=model,
+                fully_qualified_model_id=fully_qualified_model_id,
+                provider_id=provider_id,
             )
             for metric in completion_metrics:
                 if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
@@ -448,14 +469,16 @@ class InferenceRouter(Inference):
             prompt_tokens or 0,
             completion_tokens or 0,
             total_tokens,
-            model,
+            fully_qualified_model_id=fully_qualified_model_id,
+            provider_id=provider_id,
         )
         return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
     async def stream_tokens_and_compute_metrics_openai_chat(
         self,
         response: AsyncIterator[OpenAIChatCompletionChunk],
-        model: Model,
+        fully_qualified_model_id: str,
+        provider_id: str,
         messages: list[OpenAIMessageParam] | None = None,
     ) -> AsyncIterator[OpenAIChatCompletionChunk]:
         """Stream OpenAI chat completion chunks, compute metrics, and store the final completion."""
@@ -475,6 +498,8 @@ class InferenceRouter(Inference):
                 if created is None and chunk.created:
                     created = chunk.created
+                chunk.model = fully_qualified_model_id
                 # Accumulate choice data for final assembly
                 if chunk.choices:
                     for choice_delta in chunk.choices:
@@ -531,7 +556,8 @@ class InferenceRouter(Inference):
                             prompt_tokens=chunk.usage.prompt_tokens,
                             completion_tokens=chunk.usage.completion_tokens,
                             total_tokens=chunk.usage.total_tokens,
-                            model=model,
+                            model_id=fully_qualified_model_id,
+                            provider_id=provider_id,
                         )
                         for metric in metrics:
                             enqueue_event(metric)
@@ -579,7 +605,7 @@ class InferenceRouter(Inference):
                     id=id,
                     choices=assembled_choices,
                     created=created or int(time.time()),
-                    model=model.identifier,
+                    model=fully_qualified_model_id,
                     object="chat.completion",
                 )
                 logger.debug(f"InferenceRouter.completion_response: {final_response}")

llama_stack-0.3.2/llama_stack/core/utils/context.py ADDED Viewed

@@ -0,0 +1,84 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from collections.abc import AsyncGenerator
+from contextvars import ContextVar
+from llama_stack.providers.utils.telemetry.tracing import CURRENT_TRACE_CONTEXT
+_MISSING = object()
+def preserve_contexts_async_generator[T](
+    gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
+) -> AsyncGenerator[T, None]:
+    """
+    Wraps an async generator to preserve context variables across iterations.
+    This is needed because we start a new asyncio event loop for each streaming request,
+    and we need to preserve the context across the event loop boundary.
+    """
+    # Capture initial context values
+    initial_context_values = {context_var.name: context_var.get() for context_var in context_vars}
+    async def wrapper() -> AsyncGenerator[T, None]:
+        while True:
+            previous_values: dict[ContextVar, object] = {}
+            tokens: dict[ContextVar, object] = {}
+            # Restore ALL context values before any await and capture previous state
+            # This is needed to propagate context across async generator boundaries
+            for context_var in context_vars:
+                try:
+                    previous_values[context_var] = context_var.get()
+                except LookupError:
+                    previous_values[context_var] = _MISSING
+                tokens[context_var] = context_var.set(initial_context_values[context_var.name])
+            def _restore_context_var(context_var: ContextVar, *, _tokens=tokens, _prev=previous_values) -> None:
+                token = _tokens.get(context_var)
+                previous_value = _prev.get(context_var, _MISSING)
+                if token is not None:
+                    try:
+                        context_var.reset(token)
+                        return
+                    except (RuntimeError, ValueError):
+                        pass
+                if previous_value is _MISSING:
+                    context_var.set(None)
+                else:
+                    context_var.set(previous_value)
+            try:
+                item = await gen.__anext__()
+            except StopAsyncIteration:
+                # Restore all context vars before exiting to prevent leaks
+                # Use _restore_context_var for all vars to properly restore to previous values
+                for context_var in context_vars:
+                    _restore_context_var(context_var)
+                break
+            except Exception:
+                # Restore all context vars on exception
+                for context_var in context_vars:
+                    _restore_context_var(context_var)
+                raise
+            try:
+                yield item
+                # Update our tracked values with any changes made during this iteration
+                # Only for non-trace context vars - trace context must persist across yields
+                # to allow nested span tracking for telemetry
+                for context_var in context_vars:
+                    if context_var is not CURRENT_TRACE_CONTEXT:
+                        initial_context_values[context_var.name] = context_var.get()
+            finally:
+                # Restore non-trace context vars after each yield to prevent leaks between requests
+                # CURRENT_TRACE_CONTEXT is NOT restored here to preserve telemetry span stack
+                for context_var in context_vars:
+                    if context_var is not CURRENT_TRACE_CONTEXT:
+                        _restore_context_var(context_var)
+    return wrapper()

{llama_stack-0.3.0 → llama_stack-0.3.2}/llama_stack/distributions/starter/build.yaml RENAMED Viewed

@@ -57,4 +57,5 @@ image_type: venv
 additional_pip_packages:
 - aiosqlite
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]

llama-stack 0.3.0__tar.gz → 0.3.2__tar.gz

llama-stack 0.3.0__tar.gz → 0.3.2__tar.gz