PyPI - llama-stack - Versions diffs - 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl - Mend

llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (307) hide show

llama_stack/cli/stack/_list_deps.py +11 -7
llama_stack/cli/stack/run.py +3 -25
llama_stack/core/access_control/datatypes.py +78 -0
llama_stack/core/configure.py +2 -2
{llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
llama_stack/core/connectors/connectors.py +162 -0
llama_stack/core/conversations/conversations.py +61 -58
llama_stack/core/datatypes.py +54 -8
llama_stack/core/library_client.py +60 -13
llama_stack/core/prompts/prompts.py +43 -42
llama_stack/core/routers/datasets.py +20 -17
llama_stack/core/routers/eval_scoring.py +143 -53
llama_stack/core/routers/inference.py +20 -9
llama_stack/core/routers/safety.py +30 -42
llama_stack/core/routers/vector_io.py +15 -7
llama_stack/core/routing_tables/models.py +42 -3
llama_stack/core/routing_tables/scoring_functions.py +19 -19
llama_stack/core/routing_tables/shields.py +20 -17
llama_stack/core/routing_tables/vector_stores.py +8 -5
llama_stack/core/server/auth.py +192 -17
llama_stack/core/server/fastapi_router_registry.py +40 -5
llama_stack/core/server/server.py +24 -5
llama_stack/core/stack.py +54 -10
llama_stack/core/storage/datatypes.py +9 -0
llama_stack/core/store/registry.py +1 -1
llama_stack/core/utils/exec.py +2 -2
llama_stack/core/utils/type_inspection.py +16 -2
llama_stack/distributions/dell/config.yaml +4 -1
llama_stack/distributions/dell/doc_template.md +209 -0
llama_stack/distributions/dell/run-with-safety.yaml +4 -1
llama_stack/distributions/nvidia/config.yaml +4 -1
llama_stack/distributions/nvidia/doc_template.md +170 -0
llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
llama_stack/distributions/oci/config.yaml +4 -1
llama_stack/distributions/oci/doc_template.md +140 -0
llama_stack/distributions/open-benchmark/config.yaml +9 -1
llama_stack/distributions/postgres-demo/config.yaml +1 -1
llama_stack/distributions/starter/build.yaml +62 -0
llama_stack/distributions/starter/config.yaml +22 -3
llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
llama_stack/distributions/starter/starter.py +13 -1
llama_stack/distributions/starter-gpu/build.yaml +62 -0
llama_stack/distributions/starter-gpu/config.yaml +22 -3
llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
llama_stack/distributions/template.py +10 -2
llama_stack/distributions/watsonx/config.yaml +4 -1
llama_stack/log.py +1 -0
llama_stack/models/llama/resources/dog.jpg +0 -0
llama_stack/models/llama/resources/pasta.jpeg +0 -0
llama_stack/models/llama/resources/small_dog.jpg +0 -0
llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +183 -60
llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
llama_stack/providers/inline/batches/reference/batches.py +2 -1
llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
llama_stack/providers/registry/agents.py +1 -0
llama_stack/providers/registry/inference.py +1 -9
llama_stack/providers/registry/vector_io.py +136 -16
llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
llama_stack/providers/remote/eval/nvidia/README.md +134 -0
llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
llama_stack/providers/remote/files/s3/README.md +266 -0
llama_stack/providers/remote/files/s3/config.py +5 -3
llama_stack/providers/remote/files/s3/files.py +2 -2
llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
llama_stack/providers/remote/inference/openai/openai.py +2 -0
llama_stack/providers/remote/inference/together/together.py +4 -0
llama_stack/providers/remote/inference/vertexai/config.py +3 -3
llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
llama_stack/providers/remote/inference/vllm/config.py +37 -18
llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
llama_stack/providers/remote/safety/nvidia/README.md +78 -0
llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
llama_stack/providers/remote/vector_io/oci/config.py +41 -0
llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
llama_stack/providers/utils/bedrock/client.py +3 -3
llama_stack/providers/utils/bedrock/config.py +7 -7
llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
llama_stack/providers/utils/inference/http_client.py +239 -0
llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
llama_stack/providers/utils/inference/model_registry.py +148 -2
llama_stack/providers/utils/inference/openai_compat.py +2 -1
llama_stack/providers/utils/inference/openai_mixin.py +41 -2
llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
llama_stack/providers/utils/memory/vector_store.py +46 -19
llama_stack/providers/utils/responses/responses_store.py +40 -6
llama_stack/providers/utils/safety.py +114 -0
llama_stack/providers/utils/tools/mcp.py +44 -3
llama_stack/testing/api_recorder.py +9 -3
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +131 -275
llama_stack-0.5.0rc1.dist-info/top_level.txt +1 -0
llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
llama_stack/models/llama/hadamard_utils.py +0 -88
llama_stack/models/llama/llama3/args.py +0 -74
llama_stack/models/llama/llama3/generation.py +0 -378
llama_stack/models/llama/llama3/model.py +0 -304
llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
llama_stack/models/llama/llama3/quantization/loader.py +0 -316
llama_stack/models/llama/llama3_1/__init__.py +0 -12
llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
llama_stack/models/llama/llama3_1/prompts.py +0 -258
llama_stack/models/llama/llama3_2/__init__.py +0 -5
llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
llama_stack/models/llama/llama3_3/__init__.py +0 -5
llama_stack/models/llama/llama3_3/prompts.py +0 -259
llama_stack/models/llama/llama4/args.py +0 -107
llama_stack/models/llama/llama4/ffn.py +0 -58
llama_stack/models/llama/llama4/moe.py +0 -214
llama_stack/models/llama/llama4/preprocess.py +0 -435
llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
llama_stack/models/llama/llama4/quantization/loader.py +0 -226
llama_stack/models/llama/llama4/vision/__init__.py +0 -5
llama_stack/models/llama/llama4/vision/embedding.py +0 -210
llama_stack/models/llama/llama4/vision/encoder.py +0 -412
llama_stack/models/llama/quantize_impls.py +0 -316
llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
llama_stack-0.4.3.dist-info/top_level.txt +0 -2
llama_stack_api/__init__.py +0 -945
llama_stack_api/admin/__init__.py +0 -45
llama_stack_api/admin/api.py +0 -72
llama_stack_api/admin/fastapi_routes.py +0 -117
llama_stack_api/admin/models.py +0 -113
llama_stack_api/agents.py +0 -173
llama_stack_api/batches/__init__.py +0 -40
llama_stack_api/batches/api.py +0 -53
llama_stack_api/batches/fastapi_routes.py +0 -113
llama_stack_api/batches/models.py +0 -78
llama_stack_api/benchmarks/__init__.py +0 -43
llama_stack_api/benchmarks/api.py +0 -39
llama_stack_api/benchmarks/fastapi_routes.py +0 -109
llama_stack_api/benchmarks/models.py +0 -109
llama_stack_api/common/__init__.py +0 -5
llama_stack_api/common/content_types.py +0 -101
llama_stack_api/common/errors.py +0 -95
llama_stack_api/common/job_types.py +0 -38
llama_stack_api/common/responses.py +0 -77
llama_stack_api/common/training_types.py +0 -47
llama_stack_api/common/type_system.py +0 -146
llama_stack_api/connectors.py +0 -146
llama_stack_api/conversations.py +0 -270
llama_stack_api/datasetio.py +0 -55
llama_stack_api/datasets/__init__.py +0 -61
llama_stack_api/datasets/api.py +0 -35
llama_stack_api/datasets/fastapi_routes.py +0 -104
llama_stack_api/datasets/models.py +0 -152
llama_stack_api/datatypes.py +0 -373
llama_stack_api/eval.py +0 -137
llama_stack_api/file_processors/__init__.py +0 -27
llama_stack_api/file_processors/api.py +0 -64
llama_stack_api/file_processors/fastapi_routes.py +0 -78
llama_stack_api/file_processors/models.py +0 -42
llama_stack_api/files/__init__.py +0 -35
llama_stack_api/files/api.py +0 -51
llama_stack_api/files/fastapi_routes.py +0 -124
llama_stack_api/files/models.py +0 -107
llama_stack_api/inference.py +0 -1169
llama_stack_api/inspect_api/__init__.py +0 -37
llama_stack_api/inspect_api/api.py +0 -25
llama_stack_api/inspect_api/fastapi_routes.py +0 -76
llama_stack_api/inspect_api/models.py +0 -28
llama_stack_api/internal/kvstore.py +0 -28
llama_stack_api/internal/sqlstore.py +0 -81
llama_stack_api/llama_stack_api/__init__.py +0 -945
llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
llama_stack_api/llama_stack_api/admin/api.py +0 -72
llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
llama_stack_api/llama_stack_api/admin/models.py +0 -113
llama_stack_api/llama_stack_api/agents.py +0 -173
llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
llama_stack_api/llama_stack_api/batches/api.py +0 -53
llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
llama_stack_api/llama_stack_api/batches/models.py +0 -78
llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
llama_stack_api/llama_stack_api/common/__init__.py +0 -5
llama_stack_api/llama_stack_api/common/content_types.py +0 -101
llama_stack_api/llama_stack_api/common/errors.py +0 -95
llama_stack_api/llama_stack_api/common/job_types.py +0 -38
llama_stack_api/llama_stack_api/common/responses.py +0 -77
llama_stack_api/llama_stack_api/common/training_types.py +0 -47
llama_stack_api/llama_stack_api/common/type_system.py +0 -146
llama_stack_api/llama_stack_api/connectors.py +0 -146
llama_stack_api/llama_stack_api/conversations.py +0 -270
llama_stack_api/llama_stack_api/datasetio.py +0 -55
llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
llama_stack_api/llama_stack_api/datasets/api.py +0 -35
llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
llama_stack_api/llama_stack_api/datasets/models.py +0 -152
llama_stack_api/llama_stack_api/datatypes.py +0 -373
llama_stack_api/llama_stack_api/eval.py +0 -137
llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
llama_stack_api/llama_stack_api/files/__init__.py +0 -35
llama_stack_api/llama_stack_api/files/api.py +0 -51
llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
llama_stack_api/llama_stack_api/files/models.py +0 -107
llama_stack_api/llama_stack_api/inference.py +0 -1169
llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
llama_stack_api/llama_stack_api/models.py +0 -171
llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
llama_stack_api/llama_stack_api/post_training.py +0 -370
llama_stack_api/llama_stack_api/prompts.py +0 -203
llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
llama_stack_api/llama_stack_api/providers/api.py +0 -16
llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
llama_stack_api/llama_stack_api/providers/models.py +0 -24
llama_stack_api/llama_stack_api/py.typed +0 -0
llama_stack_api/llama_stack_api/rag_tool.py +0 -168
llama_stack_api/llama_stack_api/resource.py +0 -37
llama_stack_api/llama_stack_api/router_utils.py +0 -160
llama_stack_api/llama_stack_api/safety.py +0 -132
llama_stack_api/llama_stack_api/schema_utils.py +0 -208
llama_stack_api/llama_stack_api/scoring.py +0 -93
llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
llama_stack_api/llama_stack_api/shields.py +0 -93
llama_stack_api/llama_stack_api/tools.py +0 -226
llama_stack_api/llama_stack_api/vector_io.py +0 -941
llama_stack_api/llama_stack_api/vector_stores.py +0 -53
llama_stack_api/llama_stack_api/version.py +0 -9
llama_stack_api/models.py +0 -171
llama_stack_api/openai_responses.py +0 -1468
llama_stack_api/post_training.py +0 -370
llama_stack_api/prompts.py +0 -203
llama_stack_api/providers/__init__.py +0 -33
llama_stack_api/providers/api.py +0 -16
llama_stack_api/providers/fastapi_routes.py +0 -57
llama_stack_api/providers/models.py +0 -24
llama_stack_api/py.typed +0 -0
llama_stack_api/rag_tool.py +0 -168
llama_stack_api/resource.py +0 -37
llama_stack_api/router_utils.py +0 -160
llama_stack_api/safety.py +0 -132
llama_stack_api/schema_utils.py +0 -208
llama_stack_api/scoring.py +0 -93
llama_stack_api/scoring_functions.py +0 -211
llama_stack_api/shields.py +0 -93
llama_stack_api/tools.py +0 -226
llama_stack_api/vector_io.py +0 -941
llama_stack_api/vector_stores.py +0 -53
llama_stack_api/version.py +0 -9
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
{llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0

llama_stack/models/llama/llama4/quantization/loader.py DELETED Viewed

@@ -1,226 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import os
-from collections.abc import Callable
-import torch
-from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
-from torch import Tensor, nn
-from torch.nn import functional as F
-from llama_stack.log import get_logger
-from ...datatypes import QuantizationMode
-from ..model import Transformer, TransformerBlock
-from ..moe import MoE
-log = get_logger(name=__name__, category="models::llama")
-def swiglu_wrapper_no_reduce(
-    self,
-    x: Tensor,
-):
-    from ...quantize_impls import ffn_swiglu
-    return ffn_swiglu(x, self.w1.weight, self.w3.weight, self.w2.weight)
-def experts_batched_swiglu_wrapper(
-    self,
-    x: Tensor,  # (e, g, D)
-    w1: Tensor,  # (e, D, F)
-    w3: Tensor,  # (e, D, F)
-    w2: Tensor,  # (e, F, D)
-) -> torch.Tensor:
-    from ...quantize_impls import bmm_nt
-    middle_out_egF = F.silu(bmm_nt(x, w1)) * bmm_nt(x, w3)  # noqa: N806
-    return bmm_nt(middle_out_egF, w2)
-def convert_to_quantized_model(
-    model: Transformer,
-    checkpoint_dir: str,
-    quantization_mode: str | None = None,
-    fp8_activation_scale_ub: float | None = 1200.0,
-    use_rich_progress: bool = True,
-) -> Transformer:
-    from ...quantize_impls import (
-        Fp8ScaledWeights,
-        Int4ScaledWeights,
-        load_fp8,
-        load_int4,
-        quantize_fp8,
-        quantize_int4,
-    )
-    rank = get_model_parallel_rank()
-    def should_quantize_block(block: nn.Module) -> bool:
-        if not isinstance(block, TransformerBlock):
-            return False
-        is_moe = isinstance(block.feed_forward, MoE)
-        if quantization_mode == QuantizationMode.fp8_mixed:
-            # skip quantization on first and last layers
-            return is_moe and not (block.layer_id == 0 or block.layer_id == (model.n_layers - 1))
-        return is_moe
-    use_rich_progress = use_rich_progress and rank == 0
-    progress, log_status, update_status = logging_callbacks(use_rich_progress, rank, model, should_quantize_block)
-    if quantization_mode == QuantizationMode.int4_mixed:
-        int4_scales_path = os.path.join(checkpoint_dir, f"int4_scales_{rank}.pt")
-        if os.path.isfile(int4_scales_path):
-            log_status(f"Rank {rank}: Loading int4 scales")
-            int4_scales = torch.load(int4_scales_path, weights_only=True)
-            def apply_quantization(key, weight):
-                scale = int4_scales[key]
-                return load_int4(
-                    weight,
-                    scale,
-                    output_device=torch.device("cuda"),
-                )
-        else:
-            log_status(f"Rank {rank}: Quantizing int4 weights from bf16")
-            def apply_quantization(_, weight):
-                return quantize_int4(weight, output_device=torch.device("cuda"))
-    else:
-        fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{rank}.pt")
-        if os.path.isfile(fp8_scales_path):
-            log_status(f"Rank {rank}: Loading fp8 scales")
-            fp8_scales = torch.load(fp8_scales_path, weights_only=True)
-            def apply_quantization(key, weight):
-                scale = fp8_scales[key]
-                return load_fp8(
-                    weight,
-                    scale,
-                    fp8_activation_scale_ub,
-                    output_device=torch.device("cuda"),
-                )
-        else:
-            log_status(f"Rank {rank}: Quantizing fp8 weights from bf16")
-            def apply_quantization(_, weight):
-                return quantize_fp8(weight, fp8_activation_scale_ub, output_device=torch.device("cuda"))
-    processed_blocks = 0
-    try:
-        if use_rich_progress:
-            progress.start()
-        for _, block in model.named_modules():
-            if not should_quantize_block(block):
-                continue
-            update_status(f"Rank {rank} - Layer {block.layer_id}")
-            # Quantize only routed experts, not shared
-            prefix = f"layers.{block.layer_id}.feed_forward"
-            moe = block.feed_forward
-            moe.experts.batched_swiglu = experts_batched_swiglu_wrapper.__get__(moe.experts)
-            for key in ("w1", "w3", "w2"):
-                param = getattr(moe.experts, key)
-                update_status(f"Rank {rank} - Layer {block.layer_id} - MoE {key}")
-                setattr(
-                    moe.experts,
-                    key,
-                    apply_quantization(
-                        f"{prefix}.experts.{key}",
-                        param.transpose(1, 2).contiguous(),
-                    ),
-                )
-            if quantization_mode == QuantizationMode.int4_mixed:
-                # Quantize shared experts
-                moe.shared_expert.forward = swiglu_wrapper_no_reduce.__get__(moe.shared_expert)
-                for key in ("w1", "w3", "w2"):
-                    param = getattr(moe.shared_expert, key)
-                    update_status(f"Rank {rank} - Layer {block.layer_id} - MoE shared expert {key}")
-                    param.weight = apply_quantization(f"{prefix}.shared_expert.{key}", param.weight)
-            processed_blocks += 1
-            update_status(message=None, completed=processed_blocks)
-        update_status(f"Rank {rank} - Moving parameters to CUDA")
-        param_count = 0
-        for _, parameter in model.named_parameters():
-            if not isinstance(parameter, Fp8ScaledWeights) and not isinstance(parameter, Int4ScaledWeights):
-                parameter.data = parameter.to(device="cuda")
-                param_count += 1
-        update_status(f"Rank {rank} - Completed - moved {param_count} parameters to CUDA")
-    finally:
-        if use_rich_progress:
-            progress.stop()
-    return model
-# fp8/int4 loading can be very slow so we add progress bars to make life slightly better
-def logging_callbacks(
-    use_rich_progress: bool,
-    rank: int,
-    model: Transformer,
-    should_quantize_block: Callable[[nn.Module], bool],
-):
-    console = None
-    if use_rich_progress:
-        from rich.console import Console
-        console = Console(highlight=False)
-    def log_status(message: str) -> None:
-        if use_rich_progress:
-            console.print(message)
-        elif rank == 0:  # Only log from rank 0 for non-rich logging
-            log.info(message)
-    total_blocks = sum(1 for _, block in model.named_modules() if should_quantize_block(block))
-    progress = None
-    if use_rich_progress:
-        from rich.progress import (
-            BarColumn,
-            Progress,
-            SpinnerColumn,
-            TextColumn,
-            TimeElapsedColumn,
-            TimeRemainingColumn,
-        )
-        progress = Progress(
-            SpinnerColumn(),
-            BarColumn(complete_style="green", finished_style="bright_green"),
-            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
-            TimeElapsedColumn(),
-            TextColumn("ETA:"),
-            TimeRemainingColumn(),
-            TextColumn("[bold]{task.fields[status]}"),
-            console=console,
-            expand=True,
-        )
-        task_id = progress.add_task("[blue]Converting layers...", total=total_blocks, status="Starting")
-    def update_status(message: str | None, completed: int | None = None) -> None:
-        if use_rich_progress:
-            if message is not None:
-                progress.update(task_id, status=message)
-            if completed is not None:
-                progress.update(task_id, completed=completed)
-        elif rank == 0 and completed and completed % 10 == 0:
-            log.info(f"Rank {rank}: {completed}/{total_blocks} blocks completed")
-    return progress, log_status, update_status

llama_stack/models/llama/llama4/vision/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.

llama_stack/models/llama/llama4/vision/embedding.py DELETED Viewed

@@ -1,210 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import math
-from collections.abc import Callable
-from typing import Any
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from ..args import VisionArgs
-from .encoder import VisionEncoder
-class PixelShuffle(nn.Module):
-    def __init__(self, ps_ratio):
-        super().__init__()
-        self.ps_ratio = ps_ratio
-    def forward(self, x):
-        # x: [B, N, C], N = number of patches
-        assert self.ps_ratio is not None, "ps_ratio is required for pixel shuffle"
-        assert x.dim() == 3, "pixel shuffle requires encoded patches [B, N, C]"
-        hh = ww = int(math.sqrt(x.shape[1]))
-        x = x.reshape(x.shape[0], hh, ww, -1)
-        x = pixel_shuffle_op(x, ps_ratio=self.ps_ratio)
-        pixel_shuffle_patches = x.reshape(x.shape[0], -1, x.shape[-1])
-        return pixel_shuffle_patches
-def pixel_shuffle_op(input_x, ps_ratio):
-    n, w, h, c = input_x.size()
-    input_x = input_x.view(n, w, int(h * ps_ratio), int(c / ps_ratio))
-    input_x = input_x.permute(0, 2, 1, 3).contiguous()
-    input_x = input_x.view(
-        n,
-        int(h * ps_ratio),
-        int(w * ps_ratio),
-        int(c / (ps_ratio * ps_ratio)),
-    )
-    input_x = input_x.permute(0, 2, 1, 3).contiguous()
-    return input_x
-class SimpleMLP(torch.nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        bias: bool = True,
-        dropout: float = 0.0,
-        act_layer: Callable = nn.GELU,
-    ):
-        super().__init__()
-        # layers
-        self.c_fc = ColumnParallelLinear(
-            dim,
-            hidden_dim,
-            bias=bias,
-            gather_output=False,
-        )
-        self.c_proj = RowParallelLinear(
-            hidden_dim,
-            hidden_dim,
-            bias=bias,
-            input_is_parallel=True,
-        )
-        self.non_linearity = act_layer()
-        self.dropout = dropout
-    def forward(self, x):
-        hidden = self.c_fc(x)
-        hidden = self.non_linearity(hidden)
-        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
-        return self.non_linearity(self.c_proj(hidden))
-class PixelShuffleMLP(torch.nn.Module):
-    def __init__(
-        self,
-        ps_ratio: float,
-        input_dim: int,
-        output_dim: int = 4096,
-        add_fc: bool = False,
-    ):
-        super().__init__()
-        self.pixel_shuffle = PixelShuffle(ps_ratio)
-        self.mlp = SimpleMLP(
-            int(input_dim // (ps_ratio**2)),
-            output_dim,
-            bias=False,
-            dropout=0.0,
-            act_layer=nn.GELU,
-        )
-        self.fc = nn.Identity()
-        if add_fc:
-            self.fc = ColumnParallelLinear(
-                output_dim,
-                output_dim,
-                bias=False,
-            )
-    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
-        encoded_patches = self.pixel_shuffle(encoded_patches)
-        return self.fc(self.mlp(encoded_patches))
-class VisionEmbeddings(torch.nn.Module):
-    def __init__(self, args: VisionArgs):
-        super().__init__()
-        self.args = args
-        image_size = args.image_size
-        patch_size = args.patch_size
-        self.vision_encoder = VisionEncoder(
-            image_size=(image_size.height, image_size.width),
-            patch_size=(patch_size.height, patch_size.width),
-            dim=args.dim,
-            layers=args.n_layers,
-            heads=args.n_heads,
-            mlp_ratio=args.mlp_ratio,
-        )
-        self.vision_encoder = self.vision_encoder.to(torch.bfloat16)
-        self.vision_adapter = PixelShuffleMLP(
-            ps_ratio=args.pixel_shuffle_ratio,
-            input_dim=args.dim,
-            output_dim=args.output_dim,
-        )
-        self.output_dim = args.output_dim
-        self._register_load_state_dict_pre_hook(self.load_hook)
-    def load_hook(
-        self,
-        state_dict: dict[str, Any],
-        prefix: str,
-        local_metadata: dict[str, Any],
-        strict: bool = True,
-        missing_keys: list[str] = None,
-        unexpected_keys: list[str] = None,
-        error_msgs: list[str] = None,
-        return_state_dict: bool = False,
-    ) -> None:
-        original_sd = self.state_dict()
-        for k in state_dict:
-            if k.startswith(prefix) and len(state_dict[k].shape) == 1 and state_dict[k].shape[0] == 0:
-                state_dict[k] = state_dict[k].reshape(original_sd[k[len(prefix) :]].shape)
-    def _get_empty_sequence(self, h):
-        return torch.zeros(
-            h.shape[0],
-            h.shape[1],
-            self.output_dim,
-            device=h.device,
-            dtype=h.dtype,
-        )
-    # x_images is batched; each batch sample contains a list of images. so this is List[List[torch.Tensor]]
-    # each image is a tensor of shape [num_tiles, C, H, W]
-    def forward(
-        self,
-        image_batch: list[list[torch.Tensor]],
-        image_mask: torch.Tensor,
-        h_ref: torch.Tensor,
-    ) -> torch.Tensor:
-        images_flattened = [image for sample in image_batch for image in sample]
-        images_flattened = torch.vstack(images_flattened).unsqueeze(1).to(h_ref.dtype).to(h_ref.device)
-        embedding = self.vision_encoder(images_flattened)
-        projected_embedding = self.vision_adapter(embedding)
-        h_image = self._get_empty_sequence(h_ref)
-        return scatter_embeddings(image_batch, image_mask, h_image, projected_embedding)
-def scatter_embeddings(image_batch, image_mask, h_image, encoded_patches_proj):
-    # If dynamic transform is used and the batch contains 2 images (where image_1 has 2 chunks and image_2 has 3 chunks),
-    # `num_images_per_sequence` now records the number of chunks per image as `[2, 3]`.
-    # `encoded_patches_proj.split` will then split the image chunks into 2 groups: `[image_1_chunks, image_2_chunks]`.
-    num_images_per_sequence = [sum(image.size(0) for image in sample_images) for sample_images in image_batch]
-    assert not torch.isnan(encoded_patches_proj).any()
-    assert sum(num_images_per_sequence) == encoded_patches_proj.size(0), (
-        f"{sum(num_images_per_sequence)=} != {encoded_patches_proj.shape=}"
-    )
-    encoded_patches_list = encoded_patches_proj.split(num_images_per_sequence, dim=0)
-    for index in range(h_image.size(0)):
-        encoded_patches_per_sample = encoded_patches_list[index]
-        sample_image_mask = image_mask[index]
-        if encoded_patches_per_sample.numel() == 0:
-            continue
-        encoded_patches_per_sample = encoded_patches_per_sample.contiguous().view(
-            -1, encoded_patches_per_sample.size(-1)
-        )
-        n_tokens_to_fill = sample_image_mask.sum()
-        assert n_tokens_to_fill <= encoded_patches_per_sample.size(0)
-        h_image[index].masked_scatter_(
-            sample_image_mask.expand(-1, h_image.size(-1)),
-            encoded_patches_per_sample[:n_tokens_to_fill],
-        )
-    return h_image

llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl