PyPI - llama-stack - Versions diffs - 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl - Mend

llama-stack 0.4.3py3-none-any.whl → 0.5.0rc1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (307) hide show

llama_stack/cli/stack/_list_deps.py +11 -7
llama_stack/cli/stack/run.py +3 -25
llama_stack/core/access_control/datatypes.py +78 -0
llama_stack/core/configure.py +2 -2
{llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
llama_stack/core/connectors/connectors.py +162 -0
llama_stack/core/conversations/conversations.py +61 -58
llama_stack/core/datatypes.py +54 -8
llama_stack/core/library_client.py +60 -13
llama_stack/core/prompts/prompts.py +43 -42
llama_stack/core/routers/datasets.py +20 -17
llama_stack/core/routers/eval_scoring.py +143 -53
llama_stack/core/routers/inference.py +20 -9
llama_stack/core/routers/safety.py +30 -42
llama_stack/core/routers/vector_io.py +15 -7
llama_stack/core/routing_tables/models.py +42 -3
llama_stack/core/routing_tables/scoring_functions.py +19 -19
llama_stack/core/routing_tables/shields.py +20 -17
llama_stack/core/routing_tables/vector_stores.py +8 -5
llama_stack/core/server/auth.py +192 -17
llama_stack/core/server/fastapi_router_registry.py +40 -5
llama_stack/core/server/server.py +24 -5
llama_stack/core/stack.py +54 -10
llama_stack/core/storage/datatypes.py +9 -0
llama_stack/core/store/registry.py +1 -1
llama_stack/core/utils/exec.py +2 -2
llama_stack/core/utils/type_inspection.py +16 -2
llama_stack/distributions/dell/config.yaml +4 -1
llama_stack/distributions/dell/doc_template.md +209 -0
llama_stack/distributions/dell/run-with-safety.yaml +4 -1
llama_stack/distributions/nvidia/config.yaml +4 -1
llama_stack/distributions/nvidia/doc_template.md +170 -0
llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
llama_stack/distributions/oci/config.yaml +4 -1
llama_stack/distributions/oci/doc_template.md +140 -0
llama_stack/distributions/open-benchmark/config.yaml +9 -1
llama_stack/distributions/postgres-demo/config.yaml +1 -1
llama_stack/distributions/starter/build.yaml +62 -0
llama_stack/distributions/starter/config.yaml +22 -3
llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
llama_stack/distributions/starter/starter.py +13 -1
llama_stack/distributions/starter-gpu/build.yaml +62 -0
llama_stack/distributions/starter-gpu/config.yaml +22 -3
llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
llama_stack/distributions/template.py +10 -2
llama_stack/distributions/watsonx/config.yaml +4 -1
llama_stack/log.py +1 -0
llama_stack/models/llama/resources/dog.jpg +0 -0
llama_stack/models/llama/resources/pasta.jpeg +0 -0
llama_stack/models/llama/resources/small_dog.jpg +0 -0
llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +183 -60
llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
llama_stack/providers/inline/batches/reference/batches.py +2 -1
llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
llama_stack/providers/registry/agents.py +1 -0
llama_stack/providers/registry/inference.py +1 -9
llama_stack/providers/registry/vector_io.py +136 -16
llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
llama_stack/providers/remote/eval/nvidia/README.md +134 -0
llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
llama_stack/providers/remote/files/s3/README.md +266 -0
llama_stack/providers/remote/files/s3/config.py +5 -3
llama_stack/providers/remote/files/s3/files.py +2 -2
llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
llama_stack/providers/remote/inference/openai/openai.py +2 -0
llama_stack/providers/remote/inference/together/together.py +4 -0
llama_stack/providers/remote/inference/vertexai/config.py +3 -3
llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
llama_stack/providers/remote/inference/vllm/config.py +37 -18
llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
llama_stack/providers/remote/safety/nvidia/README.md +78 -0
llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
llama_stack/providers/remote/vector_io/oci/config.py +41 -0
llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
llama_stack/providers/utils/bedrock/client.py +3 -3
llama_stack/providers/utils/bedrock/config.py +7 -7
llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
llama_stack/providers/utils/inference/http_client.py +239 -0
llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
llama_stack/providers/utils/inference/model_registry.py +148 -2
llama_stack/providers/utils/inference/openai_compat.py +2 -1
llama_stack/providers/utils/inference/openai_mixin.py +41 -2
llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
llama_stack/providers/utils/memory/vector_store.py +46 -19
llama_stack/providers/utils/responses/responses_store.py +40 -6
llama_stack/providers/utils/safety.py +114 -0
llama_stack/providers/utils/tools/mcp.py +44 -3
llama_stack/testing/api_recorder.py +9 -3
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +131 -275
llama_stack-0.5.0rc1.dist-info/top_level.txt +1 -0
llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
llama_stack/models/llama/hadamard_utils.py +0 -88
llama_stack/models/llama/llama3/args.py +0 -74
llama_stack/models/llama/llama3/generation.py +0 -378
llama_stack/models/llama/llama3/model.py +0 -304
llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
llama_stack/models/llama/llama3/quantization/loader.py +0 -316
llama_stack/models/llama/llama3_1/__init__.py +0 -12
llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
llama_stack/models/llama/llama3_1/prompts.py +0 -258
llama_stack/models/llama/llama3_2/__init__.py +0 -5
llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
llama_stack/models/llama/llama3_3/__init__.py +0 -5
llama_stack/models/llama/llama3_3/prompts.py +0 -259
llama_stack/models/llama/llama4/args.py +0 -107
llama_stack/models/llama/llama4/ffn.py +0 -58
llama_stack/models/llama/llama4/moe.py +0 -214
llama_stack/models/llama/llama4/preprocess.py +0 -435
llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
llama_stack/models/llama/llama4/quantization/loader.py +0 -226
llama_stack/models/llama/llama4/vision/__init__.py +0 -5
llama_stack/models/llama/llama4/vision/embedding.py +0 -210
llama_stack/models/llama/llama4/vision/encoder.py +0 -412
llama_stack/models/llama/quantize_impls.py +0 -316
llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
llama_stack-0.4.3.dist-info/top_level.txt +0 -2
llama_stack_api/__init__.py +0 -945
llama_stack_api/admin/__init__.py +0 -45
llama_stack_api/admin/api.py +0 -72
llama_stack_api/admin/fastapi_routes.py +0 -117
llama_stack_api/admin/models.py +0 -113
llama_stack_api/agents.py +0 -173
llama_stack_api/batches/__init__.py +0 -40
llama_stack_api/batches/api.py +0 -53
llama_stack_api/batches/fastapi_routes.py +0 -113
llama_stack_api/batches/models.py +0 -78
llama_stack_api/benchmarks/__init__.py +0 -43
llama_stack_api/benchmarks/api.py +0 -39
llama_stack_api/benchmarks/fastapi_routes.py +0 -109
llama_stack_api/benchmarks/models.py +0 -109
llama_stack_api/common/__init__.py +0 -5
llama_stack_api/common/content_types.py +0 -101
llama_stack_api/common/errors.py +0 -95
llama_stack_api/common/job_types.py +0 -38
llama_stack_api/common/responses.py +0 -77
llama_stack_api/common/training_types.py +0 -47
llama_stack_api/common/type_system.py +0 -146
llama_stack_api/connectors.py +0 -146
llama_stack_api/conversations.py +0 -270
llama_stack_api/datasetio.py +0 -55
llama_stack_api/datasets/__init__.py +0 -61
llama_stack_api/datasets/api.py +0 -35
llama_stack_api/datasets/fastapi_routes.py +0 -104
llama_stack_api/datasets/models.py +0 -152
llama_stack_api/datatypes.py +0 -373
llama_stack_api/eval.py +0 -137
llama_stack_api/file_processors/__init__.py +0 -27
llama_stack_api/file_processors/api.py +0 -64
llama_stack_api/file_processors/fastapi_routes.py +0 -78
llama_stack_api/file_processors/models.py +0 -42
llama_stack_api/files/__init__.py +0 -35
llama_stack_api/files/api.py +0 -51
llama_stack_api/files/fastapi_routes.py +0 -124
llama_stack_api/files/models.py +0 -107
llama_stack_api/inference.py +0 -1169
llama_stack_api/inspect_api/__init__.py +0 -37
llama_stack_api/inspect_api/api.py +0 -25
llama_stack_api/inspect_api/fastapi_routes.py +0 -76
llama_stack_api/inspect_api/models.py +0 -28
llama_stack_api/internal/kvstore.py +0 -28
llama_stack_api/internal/sqlstore.py +0 -81
llama_stack_api/llama_stack_api/__init__.py +0 -945
llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
llama_stack_api/llama_stack_api/admin/api.py +0 -72
llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
llama_stack_api/llama_stack_api/admin/models.py +0 -113
llama_stack_api/llama_stack_api/agents.py +0 -173
llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
llama_stack_api/llama_stack_api/batches/api.py +0 -53
llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
llama_stack_api/llama_stack_api/batches/models.py +0 -78
llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
llama_stack_api/llama_stack_api/common/__init__.py +0 -5
llama_stack_api/llama_stack_api/common/content_types.py +0 -101
llama_stack_api/llama_stack_api/common/errors.py +0 -95
llama_stack_api/llama_stack_api/common/job_types.py +0 -38
llama_stack_api/llama_stack_api/common/responses.py +0 -77
llama_stack_api/llama_stack_api/common/training_types.py +0 -47
llama_stack_api/llama_stack_api/common/type_system.py +0 -146
llama_stack_api/llama_stack_api/connectors.py +0 -146
llama_stack_api/llama_stack_api/conversations.py +0 -270
llama_stack_api/llama_stack_api/datasetio.py +0 -55
llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
llama_stack_api/llama_stack_api/datasets/api.py +0 -35
llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
llama_stack_api/llama_stack_api/datasets/models.py +0 -152
llama_stack_api/llama_stack_api/datatypes.py +0 -373
llama_stack_api/llama_stack_api/eval.py +0 -137
llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
llama_stack_api/llama_stack_api/files/__init__.py +0 -35
llama_stack_api/llama_stack_api/files/api.py +0 -51
llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
llama_stack_api/llama_stack_api/files/models.py +0 -107
llama_stack_api/llama_stack_api/inference.py +0 -1169
llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
llama_stack_api/llama_stack_api/models.py +0 -171
llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
llama_stack_api/llama_stack_api/post_training.py +0 -370
llama_stack_api/llama_stack_api/prompts.py +0 -203
llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
llama_stack_api/llama_stack_api/providers/api.py +0 -16
llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
llama_stack_api/llama_stack_api/providers/models.py +0 -24
llama_stack_api/llama_stack_api/py.typed +0 -0
llama_stack_api/llama_stack_api/rag_tool.py +0 -168
llama_stack_api/llama_stack_api/resource.py +0 -37
llama_stack_api/llama_stack_api/router_utils.py +0 -160
llama_stack_api/llama_stack_api/safety.py +0 -132
llama_stack_api/llama_stack_api/schema_utils.py +0 -208
llama_stack_api/llama_stack_api/scoring.py +0 -93
llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
llama_stack_api/llama_stack_api/shields.py +0 -93
llama_stack_api/llama_stack_api/tools.py +0 -226
llama_stack_api/llama_stack_api/vector_io.py +0 -941
llama_stack_api/llama_stack_api/vector_stores.py +0 -53
llama_stack_api/llama_stack_api/version.py +0 -9
llama_stack_api/models.py +0 -171
llama_stack_api/openai_responses.py +0 -1468
llama_stack_api/post_training.py +0 -370
llama_stack_api/prompts.py +0 -203
llama_stack_api/providers/__init__.py +0 -33
llama_stack_api/providers/api.py +0 -16
llama_stack_api/providers/fastapi_routes.py +0 -57
llama_stack_api/providers/models.py +0 -24
llama_stack_api/py.typed +0 -0
llama_stack_api/rag_tool.py +0 -168
llama_stack_api/resource.py +0 -37
llama_stack_api/router_utils.py +0 -160
llama_stack_api/safety.py +0 -132
llama_stack_api/schema_utils.py +0 -208
llama_stack_api/scoring.py +0 -93
llama_stack_api/scoring_functions.py +0 -211
llama_stack_api/shields.py +0 -93
llama_stack_api/tools.py +0 -226
llama_stack_api/vector_io.py +0 -941
llama_stack_api/vector_stores.py +0 -53
llama_stack_api/version.py +0 -9
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0

llama_stack/models/llama/llama4/vision/encoder.py DELETED Viewed

@@ -1,412 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from collections.abc import Callable
-from typing import Any
-import fairscale.nn.model_parallel.initialize as fs_init
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from torch import einsum
-from ..args import ModelArgs
-from ..model import Attention
-class LayerNorm(nn.LayerNorm):
-    """Subclass torch's LayerNorm to handle fp16."""
-    def forward(self, x: torch.Tensor):
-        x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
-        return x
-class ColumnParallelConv2dPatch(torch.nn.Module):
-    """Conv2D Patching layer with model parallelism.
-    Column parallel over unfolded input.
-    Arguments:
-        in_channels: Input channels.
-        out_channels: Output channels.
-        kernel_size: Size of convolution kernel.
-        stride (default 1): Stride for convolution.
-        bias (default False): Use bias in Conv2d.
-    Input: (bsz, in_channels, height, width)
-    Output: (bsz, num_tokens, out_channels)
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int | tuple[int, int],
-        stride: int | tuple[int, int],
-        bias: bool | None = False,
-    ) -> None:
-        super().__init__()
-        if isinstance(kernel_size, int):
-            kernel_size = (kernel_size, kernel_size)
-        self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride)
-        self._linear = ColumnParallelLinear(
-            in_channels * kernel_size[0] * kernel_size[1],
-            out_channels,
-            bias=bias,
-        )
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self._unfold(x)
-        x = x.permute(0, 2, 1)
-        x = self._linear(x)
-        return x
-class _FeedForward(torch.nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        dropout: float,
-        act_layer: Callable = nn.GELU,
-    ):
-        super().__init__()
-        # layers
-        self.c_fc = ColumnParallelLinear(
-            dim,
-            hidden_dim,
-            bias=True,
-            gather_output=False,
-            init_method=lambda x: x,
-        )
-        self.c_proj = RowParallelLinear(
-            hidden_dim,
-            dim,
-            bias=True,
-            input_is_parallel=True,
-            init_method=lambda x: x,
-        )
-        self.non_linearity = act_layer()
-        self.dropout = dropout
-    def forward(self, x):
-        hidden = self.c_fc(x)
-        hidden = self.non_linearity(hidden)
-        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
-        return self.c_proj(hidden)
-class _TransformerBlock(nn.Module):
-    def __init__(
-        self,
-        d_model: int,
-        n_head: int,
-        mlp_ratio: float = 4.0,
-        act_layer: Callable = nn.GELU,
-        gated: bool = False,
-    ):
-        super().__init__()
-        assert d_model % n_head == 0
-        self.n_heads = n_head
-        self.head_dim = d_model // self.n_heads
-        attn_args = ModelArgs(
-            dim=d_model,
-            head_dim=self.head_dim,
-            n_heads=self.n_heads,
-            n_kv_heads=self.n_heads,
-        )
-        self.attn = Attention(attn_args, use_rope=True, use_qk_norm=False, add_bias=True)
-        self.ln_1 = LayerNorm(d_model)
-        self.mlp = _FeedForward(
-            dim=d_model,
-            hidden_dim=int(mlp_ratio * d_model),
-            dropout=0.0,
-            act_layer=act_layer,
-        )
-        self.ln_2 = LayerNorm(d_model)
-        self.gated = gated
-        if gated:
-            self.gate_attn = nn.Parameter(torch.zeros(1))
-            self.gate_ffn = nn.Parameter(torch.zeros(1))
-    def attention(
-        self,
-        x: torch.Tensor,
-        freq_cis: torch.Tensor | None = None,
-    ):
-        return self.attn(x=x, start_pos=0, freqs_cis=freq_cis)
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: torch.Tensor | None = None,
-        freq_cis: torch.Tensor | None = None,
-    ):
-        _gate_attn = 1 if not self.gated else self.gate_attn.tanh()
-        _gate_ffn = 1 if not self.gated else self.gate_ffn.tanh()
-        x = x + _gate_attn * self.attention(self.ln_1(x), freq_cis=freq_cis)
-        x = x + _gate_ffn * self.mlp(self.ln_2(x))
-        return x
-class _Transformer(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        layers: int,
-        heads: int,
-        mlp_ratio: float = 4.0,
-        act_layer: Callable = nn.GELU,
-        gated: bool = False,
-    ):
-        super().__init__()
-        self.resblocks = nn.ModuleList(
-            [
-                _TransformerBlock(
-                    d_model=dim,
-                    n_head=heads,
-                    mlp_ratio=mlp_ratio,
-                    act_layer=act_layer,
-                    gated=gated,
-                )
-                for _ in range(layers)
-            ]
-        )
-    def forward(self, x: torch.Tensor, return_intermediate=None, mask=None, freq_cis=None):
-        out = []
-        for idx, r in enumerate(self.resblocks):
-            if return_intermediate is not None and idx in return_intermediate:
-                out.append(x)
-            x = r(x, mask=mask, freq_cis=freq_cis)
-        if return_intermediate is not None:
-            return x, torch.stack(out, dim=-1)
-        return x
-class PackingIndex:
-    Z = 0  # Z (time) coordinate of the token in the original sample
-    Y = 1  # Y (height) coordinate of the token in the original sample
-    X = 2  # X (width) coordinate of the token in the original sample
-    TIME = 3  # Total number of time units (frames) in the original sample
-    HEIGHT = 4  # Height of the original sample
-    WIDTH = 5  # Width of the original sample
-    # USE INDEX TO CHECK THE TYPE OF THE TOKEN (see ID fields below)
-    IDX = 6  # Full index of the token in the original sample (x + y * w + z * w * h)
-    BATCH_IDX = 7  # Which batch element this token belongs to. Note the batch idx of padding tokens is BATCH_SIZE
-    # Total size of the enum, remember to update this!
-    NUM_METADATA = 8
-    # Note: For padding tokens IDX = -1
-    #       For cls tokens,    IDX = -2
-    ID_CLS_TOKEN = -2
-    ID_PAD_TOKEN = -1
-class VisionEncoder(nn.Module):
-    def __init__(
-        self,
-        image_size: tuple[int, int],
-        patch_size: tuple[int, int],
-        dim: int,
-        layers: int,
-        heads: int,
-        mlp_ratio: float,
-        in_channels: int = 3,
-    ):
-        super().__init__()
-        self.image_size = image_size
-        self.patch_size = patch_size
-        self.grid_size = (
-            self.image_size[0] // self.patch_size[0],
-            self.image_size[1] // self.patch_size[1],
-        )
-        self.conv1 = ColumnParallelConv2dPatch(
-            in_channels=in_channels,
-            out_channels=dim,
-            kernel_size=patch_size,
-            stride=patch_size,
-            bias=False,
-        )
-        scale = dim**-0.5
-        self.class_embedding = nn.Parameter(scale * torch.randn(dim))
-        self.positional_embedding_vlm = nn.Parameter(
-            scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, dim)
-        )
-        self.ln_pre = LayerNorm(dim)
-        self.ln_post = LayerNorm(dim)
-        self.transformer = _Transformer(
-            dim,
-            layers,
-            heads,
-            mlp_ratio,
-            act_layer=nn.GELU,
-        )
-        # NOTE: hack for the fixed res
-        image_h, image_w = self.image_size
-        patch_h, patch_w = self.patch_size
-        idx_h, idx_w = image_h // patch_h, image_w // patch_w
-        img_idx = torch.arange(image_h * image_w // (patch_h * patch_w), dtype=torch.int32)
-        img_idx = img_idx.reshape(idx_h * idx_w, 1)
-        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
-        img_idx[-1, -1] = PackingIndex.ID_CLS_TOKEN
-        packed_img_idx = torch.empty(
-            img_idx.shape[0],
-            img_idx.shape[1],
-            PackingIndex.NUM_METADATA - 1,
-            dtype=torch.int32,
-        )
-        packed_img_idx[:, :, PackingIndex.Y] = img_idx // idx_w
-        packed_img_idx[:, :, PackingIndex.X] = img_idx % idx_w
-        packed_img_idx[:, :, PackingIndex.HEIGHT].fill_(idx_h)
-        packed_img_idx[:, :, PackingIndex.WIDTH].fill_(idx_w)
-        packed_img_idx[:, :, PackingIndex.IDX] = img_idx
-        packed_img_idx = packed_img_idx.reshape(1, -1, PackingIndex.NUM_METADATA - 1)
-        self.packed_img_idx = packed_img_idx  # for positional embedding load hook
-        # compute rope freqs
-        rope_freq = self.get_rope_freqs(dim // heads // 2)
-        freqs_x = self.compute_rope_freqs(rope_freq, packed_img_idx[:, :, PackingIndex.X] + 1)
-        freqs_y = self.compute_rope_freqs(rope_freq, packed_img_idx[:, :, PackingIndex.Y] + 1)
-        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
-        # disable RoPE for padding and cls tokens
-        freqs = freqs.masked_fill(packed_img_idx[:, :, PackingIndex.IDX, None] < 0, 0)
-        # compute complex freqs
-        self.freq_cis = torch.view_as_complex(torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
-        # xlf automatically broadcasts
-        self.freq_cis = self.freq_cis.squeeze(0)
-        self.n_heads = heads // fs_init.get_model_parallel_world_size()
-        self._register_load_state_dict_pre_hook(self.load_hook)
-    def get_rope_freqs(self, dim, theta=10000):
-        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
-        return freqs
-    @torch.amp.autocast("cuda", enabled=False)
-    def compute_rope_freqs(self, freqs, t):
-        freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
-        freqs = freqs.repeat_interleave(2, dim=-1)
-        return freqs
-    def load_hook(
-        self,
-        state_dict: dict[str, Any],
-        prefix: str,
-        local_metadata: dict[str, Any],
-        strict: bool = True,
-        missing_keys: list[str] = None,
-        unexpected_keys: list[str] = None,
-        error_msgs: list[str] = None,
-        return_state_dict: bool = False,
-    ) -> None:
-        orig_pos_embed = state_dict.get(prefix + "positional_embedding")
-        if orig_pos_embed is not None and orig_pos_embed.shape[-2:] != self.positional_embedding_vlm.shape[-2:]:
-            raise ValueError(
-                f"Positional embedding shape {orig_pos_embed.shape} does not match expected shape {self.positional_embedding_vlm.shape}"
-            )
-        batch_size, token_per_image, _ = self.packed_img_idx.shape
-        # Input points for idx are [x, y, w, h]
-        idx = self.packed_img_idx.reshape(batch_size * token_per_image, 1, -1)
-        total_windows, window_size, _ = idx.shape
-        # Grid values are [-1, 1] and coords are w, h
-        grid = (
-            (idx[:, :, [PackingIndex.X, PackingIndex.Y]] / idx[:, :, [PackingIndex.WIDTH, PackingIndex.HEIGHT]]) * 2 - 1
-        )[None, ...]
-        # In this mode, cls token has no position embedding
-        if orig_pos_embed is not None:
-            posemb = (
-                orig_pos_embed[1:].view(1, self.grid_size[0], self.grid_size[1], -1).permute(0, 3, 1, 2).contiguous()
-            )
-            posemb = posemb.to(device=grid.device, dtype=grid.dtype)
-            sample = F.grid_sample(
-                posemb, grid, padding_mode="zeros"
-            )  # padding tokens / class token will get zero for posemb
-            sample = sample.view(-1, total_windows, window_size).permute(1, 2, 0).contiguous()
-            sample = torch.where(
-                idx[:, :, PackingIndex.IDX, None] == PackingIndex.ID_CLS_TOKEN,
-                orig_pos_embed[0].view(1, 1, -1).to(device=sample.device, dtype=sample.dtype),
-                sample,
-            )
-            new_pos_embed = sample.reshape(batch_size, token_per_image, -1)
-            state_dict[prefix + "positional_embedding_vlm"] = new_pos_embed.squeeze(0)
-        if return_state_dict:
-            return state_dict
-    def apply_class_embedding(self, x):
-        x = torch.cat(
-            [
-                x,
-                self.class_embedding.to(x.dtype)
-                + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
-            ],
-            dim=1,
-        )  # shape = [*, grid ** 2 + 1, width]
-        return x
-    def forward(self, images: torch.Tensor) -> torch.Tensor:
-        # NOTE: in Llama4 bsz=bsz*num_tiles, num_chunks=1
-        if images.ndim == 5:
-            num_concurrent_media = 1
-            bsz, num_chunks, nch, h, w = images.shape
-        else:
-            bsz, num_concurrent_media, num_chunks, nch, h, w = images.shape
-        images = images.reshape(bsz * num_concurrent_media * num_chunks, nch, h, w)
-        # patch embedding
-        x = images.reshape(bsz * num_concurrent_media * num_chunks, nch, h, w)
-        x = self.conv1(x)  # shape = [*, width, grid ** 2]
-        _, ntok, dim = x.shape
-        x = x.reshape(bsz * num_concurrent_media * num_chunks, ntok, dim)
-        # apply cls token
-        x = self.apply_class_embedding(x)
-        ntok += 1
-        # apply position embeddings
-        if self.positional_embedding_vlm is not None:
-            x = x + self.positional_embedding_vlm.to(x.dtype)
-        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok, dim)
-        x = self.ln_pre(x)
-        x = x.view(bsz * num_concurrent_media, -1, dim)
-        freq_cis = self.freq_cis.to(images.device)
-        tf_output = self.transformer(
-            x,
-            freq_cis=freq_cis,
-        )
-        int_x = None
-        if isinstance(tf_output, tuple):
-            x, int_x = tf_output
-        else:
-            x = tf_output
-        x = self.ln_post(x)
-        # remove cls token output
-        x = x[:, :-1, :]
-        # add and output x + int_x features
-        if int_x is not None:
-            int_x = int_x[:, :-1, :, :]
-            int_x = int_x.reshape(bsz * num_concurrent_media, ntok - 1, -1)
-            x = torch.cat([x, int_x], dim=-1)
-        return x

llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

llama-stack 0.4.3py3-none-any.whl → 0.5.0rc1py3-none-any.whl