llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (307)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/doc_template.md +209 -0
  30. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  31. llama_stack/distributions/nvidia/config.yaml +4 -1
  32. llama_stack/distributions/nvidia/doc_template.md +170 -0
  33. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  34. llama_stack/distributions/oci/config.yaml +4 -1
  35. llama_stack/distributions/oci/doc_template.md +140 -0
  36. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  37. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  38. llama_stack/distributions/starter/build.yaml +62 -0
  39. llama_stack/distributions/starter/config.yaml +22 -3
  40. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  41. llama_stack/distributions/starter/starter.py +13 -1
  42. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  43. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  44. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  45. llama_stack/distributions/template.py +10 -2
  46. llama_stack/distributions/watsonx/config.yaml +4 -1
  47. llama_stack/log.py +1 -0
  48. llama_stack/models/llama/resources/dog.jpg +0 -0
  49. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  50. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  51. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  52. llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
  53. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +183 -60
  54. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
  55. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  56. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  57. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  58. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  59. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  60. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  61. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  62. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  63. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  64. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  65. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  66. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  67. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  68. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  69. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  70. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  71. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  72. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  73. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
  74. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  75. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  76. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  77. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  78. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  79. llama_stack/providers/registry/agents.py +1 -0
  80. llama_stack/providers/registry/inference.py +1 -9
  81. llama_stack/providers/registry/vector_io.py +136 -16
  82. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  83. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  84. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  85. llama_stack/providers/remote/files/s3/README.md +266 -0
  86. llama_stack/providers/remote/files/s3/config.py +5 -3
  87. llama_stack/providers/remote/files/s3/files.py +2 -2
  88. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  89. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  90. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  91. llama_stack/providers/remote/inference/together/together.py +4 -0
  92. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  93. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  94. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  95. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  96. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  97. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  98. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  99. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  100. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  101. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  102. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  103. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  104. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  105. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  106. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  107. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  108. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  109. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  110. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  111. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  112. llama_stack/providers/utils/bedrock/client.py +3 -3
  113. llama_stack/providers/utils/bedrock/config.py +7 -7
  114. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  115. llama_stack/providers/utils/inference/http_client.py +239 -0
  116. llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
  117. llama_stack/providers/utils/inference/model_registry.py +148 -2
  118. llama_stack/providers/utils/inference/openai_compat.py +2 -1
  119. llama_stack/providers/utils/inference/openai_mixin.py +41 -2
  120. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  121. llama_stack/providers/utils/memory/vector_store.py +46 -19
  122. llama_stack/providers/utils/responses/responses_store.py +40 -6
  123. llama_stack/providers/utils/safety.py +114 -0
  124. llama_stack/providers/utils/tools/mcp.py +44 -3
  125. llama_stack/testing/api_recorder.py +9 -3
  126. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
  127. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +131 -275
  128. llama_stack-0.5.0rc1.dist-info/top_level.txt +1 -0
  129. llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
  130. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  131. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  132. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  133. llama_stack/models/llama/hadamard_utils.py +0 -88
  134. llama_stack/models/llama/llama3/args.py +0 -74
  135. llama_stack/models/llama/llama3/generation.py +0 -378
  136. llama_stack/models/llama/llama3/model.py +0 -304
  137. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  138. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  139. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  140. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  141. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  142. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  143. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  144. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  145. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  146. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  147. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  148. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  149. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  150. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  151. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  152. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  153. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  154. llama_stack/models/llama/llama4/args.py +0 -107
  155. llama_stack/models/llama/llama4/ffn.py +0 -58
  156. llama_stack/models/llama/llama4/moe.py +0 -214
  157. llama_stack/models/llama/llama4/preprocess.py +0 -435
  158. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  159. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  160. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  161. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  162. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  163. llama_stack/models/llama/quantize_impls.py +0 -316
  164. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  165. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  166. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  167. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  168. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  169. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  170. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  171. llama_stack-0.4.3.dist-info/top_level.txt +0 -2
  172. llama_stack_api/__init__.py +0 -945
  173. llama_stack_api/admin/__init__.py +0 -45
  174. llama_stack_api/admin/api.py +0 -72
  175. llama_stack_api/admin/fastapi_routes.py +0 -117
  176. llama_stack_api/admin/models.py +0 -113
  177. llama_stack_api/agents.py +0 -173
  178. llama_stack_api/batches/__init__.py +0 -40
  179. llama_stack_api/batches/api.py +0 -53
  180. llama_stack_api/batches/fastapi_routes.py +0 -113
  181. llama_stack_api/batches/models.py +0 -78
  182. llama_stack_api/benchmarks/__init__.py +0 -43
  183. llama_stack_api/benchmarks/api.py +0 -39
  184. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  185. llama_stack_api/benchmarks/models.py +0 -109
  186. llama_stack_api/common/__init__.py +0 -5
  187. llama_stack_api/common/content_types.py +0 -101
  188. llama_stack_api/common/errors.py +0 -95
  189. llama_stack_api/common/job_types.py +0 -38
  190. llama_stack_api/common/responses.py +0 -77
  191. llama_stack_api/common/training_types.py +0 -47
  192. llama_stack_api/common/type_system.py +0 -146
  193. llama_stack_api/connectors.py +0 -146
  194. llama_stack_api/conversations.py +0 -270
  195. llama_stack_api/datasetio.py +0 -55
  196. llama_stack_api/datasets/__init__.py +0 -61
  197. llama_stack_api/datasets/api.py +0 -35
  198. llama_stack_api/datasets/fastapi_routes.py +0 -104
  199. llama_stack_api/datasets/models.py +0 -152
  200. llama_stack_api/datatypes.py +0 -373
  201. llama_stack_api/eval.py +0 -137
  202. llama_stack_api/file_processors/__init__.py +0 -27
  203. llama_stack_api/file_processors/api.py +0 -64
  204. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  205. llama_stack_api/file_processors/models.py +0 -42
  206. llama_stack_api/files/__init__.py +0 -35
  207. llama_stack_api/files/api.py +0 -51
  208. llama_stack_api/files/fastapi_routes.py +0 -124
  209. llama_stack_api/files/models.py +0 -107
  210. llama_stack_api/inference.py +0 -1169
  211. llama_stack_api/inspect_api/__init__.py +0 -37
  212. llama_stack_api/inspect_api/api.py +0 -25
  213. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  214. llama_stack_api/inspect_api/models.py +0 -28
  215. llama_stack_api/internal/kvstore.py +0 -28
  216. llama_stack_api/internal/sqlstore.py +0 -81
  217. llama_stack_api/llama_stack_api/__init__.py +0 -945
  218. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  219. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  220. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  221. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  222. llama_stack_api/llama_stack_api/agents.py +0 -173
  223. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  224. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  225. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  226. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  227. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  228. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  229. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  230. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  231. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  232. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  233. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  234. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  235. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  236. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  237. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  238. llama_stack_api/llama_stack_api/connectors.py +0 -146
  239. llama_stack_api/llama_stack_api/conversations.py +0 -270
  240. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  241. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  242. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  243. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  244. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  245. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  246. llama_stack_api/llama_stack_api/eval.py +0 -137
  247. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  248. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  249. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  250. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  251. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  252. llama_stack_api/llama_stack_api/files/api.py +0 -51
  253. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  254. llama_stack_api/llama_stack_api/files/models.py +0 -107
  255. llama_stack_api/llama_stack_api/inference.py +0 -1169
  256. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  257. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  258. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  259. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  260. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  261. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
  262. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
  263. llama_stack_api/llama_stack_api/models.py +0 -171
  264. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  265. llama_stack_api/llama_stack_api/post_training.py +0 -370
  266. llama_stack_api/llama_stack_api/prompts.py +0 -203
  267. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  268. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  269. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  270. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  271. llama_stack_api/llama_stack_api/py.typed +0 -0
  272. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  273. llama_stack_api/llama_stack_api/resource.py +0 -37
  274. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  275. llama_stack_api/llama_stack_api/safety.py +0 -132
  276. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  277. llama_stack_api/llama_stack_api/scoring.py +0 -93
  278. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  279. llama_stack_api/llama_stack_api/shields.py +0 -93
  280. llama_stack_api/llama_stack_api/tools.py +0 -226
  281. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  282. llama_stack_api/llama_stack_api/vector_stores.py +0 -53
  283. llama_stack_api/llama_stack_api/version.py +0 -9
  284. llama_stack_api/models.py +0 -171
  285. llama_stack_api/openai_responses.py +0 -1468
  286. llama_stack_api/post_training.py +0 -370
  287. llama_stack_api/prompts.py +0 -203
  288. llama_stack_api/providers/__init__.py +0 -33
  289. llama_stack_api/providers/api.py +0 -16
  290. llama_stack_api/providers/fastapi_routes.py +0 -57
  291. llama_stack_api/providers/models.py +0 -24
  292. llama_stack_api/py.typed +0 -0
  293. llama_stack_api/rag_tool.py +0 -168
  294. llama_stack_api/resource.py +0 -37
  295. llama_stack_api/router_utils.py +0 -160
  296. llama_stack_api/safety.py +0 -132
  297. llama_stack_api/schema_utils.py +0 -208
  298. llama_stack_api/scoring.py +0 -93
  299. llama_stack_api/scoring_functions.py +0 -211
  300. llama_stack_api/shields.py +0 -93
  301. llama_stack_api/tools.py +0 -226
  302. llama_stack_api/vector_io.py +0 -941
  303. llama_stack_api/vector_stores.py +0 -53
  304. llama_stack_api/version.py +0 -9
  305. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
  306. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
  307. {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
@@ -1,378 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the terms described in the LICENSE file in
5
- # the root directory of this source tree.
6
-
7
- # Copyright (c) Meta Platforms, Inc. and affiliates.
8
- # All rights reserved.
9
- #
10
- # This source code is licensed under the terms described in the LICENSE file in
11
- # top-level folder for each specific model found within the models/ directory at
12
- # the top-level of this source tree.
13
-
14
- import json
15
- import os
16
- import sys
17
- import time
18
- from collections.abc import Callable, Generator
19
- from pathlib import Path
20
-
21
- import torch
22
- import torch.nn.functional as F
23
- from fairscale.nn.model_parallel.initialize import (
24
- initialize_model_parallel,
25
- model_parallel_is_initialized,
26
- )
27
- from termcolor import cprint
28
-
29
- from llama_stack.models.llama.datatypes import ToolPromptFormat
30
-
31
- from ..checkpoint import maybe_reshard_state_dict
32
- from ..datatypes import GenerationResult, QuantizationMode, RawContent, RawMessage
33
- from .args import ModelArgs
34
- from .chat_format import ChatFormat, LLMInput
35
- from .model import Transformer
36
- from .multimodal.model import CrossAttentionTransformer
37
- from .tokenizer import Tokenizer
38
-
39
-
40
- class Llama3:
41
- @staticmethod
42
- def build(
43
- ckpt_dir: str,
44
- max_seq_len: int,
45
- max_batch_size: int,
46
- world_size: int | None = None,
47
- quantization_mode: QuantizationMode | None = None,
48
- seed: int = 1,
49
- device: str = "cuda",
50
- ):
51
- device = torch.device(device)
52
- if (
53
- device.type == "cuda"
54
- and not torch.cuda.is_available()
55
- or device.type == "xpu"
56
- and not torch.xpu.is_available()
57
- ):
58
- raise RuntimeError(f"PyTorch backend for {device.type} device type is not available")
59
-
60
- if not torch.distributed.is_initialized():
61
- if device.type == "cuda":
62
- torch.distributed.init_process_group("nccl")
63
- else:
64
- torch.distributed.init_process_group("gloo")
65
-
66
- if not model_parallel_is_initialized():
67
- if world_size is None:
68
- world_size = int(os.environ.get("WORLD_SIZE", 1))
69
- initialize_model_parallel(world_size)
70
-
71
- local_rank = int(os.environ.get("LOCAL_RANK", 0))
72
- if device.type == "cuda":
73
- torch.cuda.set_device(local_rank)
74
- elif device.type == "xpu":
75
- torch.xpu.set_device(local_rank)
76
-
77
- torch.manual_seed(seed)
78
-
79
- if local_rank > 0:
80
- sys.stdout = open(os.devnull, "w")
81
-
82
- start_time = time.time()
83
-
84
- ckpt_paths = sorted(Path(ckpt_dir).glob("*.pth"))
85
- assert len(ckpt_paths) > 0, f"no checkpoint files found in {ckpt_dir}"
86
- print(f"Loading a checkpoint (shards={len(ckpt_paths)}, current-mp-size={world_size})")
87
- with open(Path(ckpt_dir) / "params.json") as f:
88
- params = json.loads(f.read())
89
-
90
- model_args: ModelArgs = ModelArgs(
91
- max_seq_len=max_seq_len,
92
- max_batch_size=max_batch_size,
93
- **params,
94
- )
95
- tokenizer = Tokenizer.get_instance()
96
-
97
- state_dict = maybe_reshard_state_dict(
98
- ckpt_paths,
99
- n_kv_heads=model_args.n_kv_heads if model_args.n_kv_heads else model_args.n_heads,
100
- )
101
-
102
- assert model_args.vocab_size == tokenizer.n_words
103
-
104
- def build_model():
105
- if model_args.vision_chunk_size > 0:
106
- model = CrossAttentionTransformer(model_args)
107
- model.setup_cache(model_args.max_batch_size, device=device, dtype=torch.get_default_dtype())
108
- else:
109
- model = Transformer(model_args)
110
- return model
111
-
112
- if quantization_mode == QuantizationMode.fp8_mixed or quantization_mode == QuantizationMode.int4_mixed:
113
- from .quantization.loader import convert_to_quantized_model
114
-
115
- torch.set_default_tensor_type(torch.BFloat16Tensor)
116
- model = build_model()
117
- print("Loading state dict...")
118
- model.load_state_dict(state_dict, strict=False)
119
- print("Done...")
120
- model = convert_to_quantized_model(model, ckpt_dir, quantization_mode, device=device)
121
- torch.set_default_device(device)
122
- else:
123
- print(f"Setting default device to {device}")
124
- if device.type == "cuda":
125
- if torch.cuda.is_bf16_supported():
126
- torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
127
- else:
128
- torch.set_default_tensor_type(torch.cuda.Float16Tensor)
129
- elif device.type == "xpu":
130
- if torch.xpu.is_bf16_supported():
131
- torch.set_default_tensor_type(torch.xpu.BFloat16Tensor)
132
- else:
133
- torch.set_default_tensor_type(torch.xpu.Float16Tensor)
134
-
135
- model = build_model()
136
- print("Loading state dict...")
137
- model.load_state_dict(state_dict, strict=True)
138
- model.to(device)
139
- print("Done...")
140
-
141
- print(f"Loaded in {time.time() - start_time:.2f} seconds")
142
-
143
- return Llama3(model, tokenizer, model_args)
144
-
145
- def __init__(
146
- self,
147
- model: Transformer | CrossAttentionTransformer,
148
- tokenizer: Tokenizer,
149
- args: ModelArgs,
150
- ):
151
- self.args = args
152
- self.model = model
153
- self.tokenizer = tokenizer
154
- self.formatter = ChatFormat(tokenizer)
155
-
156
- @torch.inference_mode()
157
- def generate(
158
- self,
159
- llm_inputs: list[LLMInput],
160
- temperature: float = 0.6,
161
- top_p: float = 0.9,
162
- max_gen_len: int | None = None,
163
- logprobs: bool = False,
164
- echo: bool = False,
165
- print_model_input: bool = False,
166
- logits_processor: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
167
- ) -> Generator[list[GenerationResult], None, None]:
168
- if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
169
- max_gen_len = self.args.max_seq_len - 1
170
- params = self.model.params
171
-
172
- print_model_input = print_model_input or os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1"
173
- if print_model_input:
174
- for inp in llm_inputs:
175
- tokens_to_print = [self.formatter.vision_token if t == 128256 else t for t in inp.tokens]
176
- cprint(
177
- "Input to model:\n" + self.tokenizer.decode(tokens_to_print) + "\n",
178
- "red",
179
- file=sys.stderr,
180
- )
181
- prompt_tokens = [inp.tokens for inp in llm_inputs]
182
-
183
- bsz = len(llm_inputs)
184
- assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
185
-
186
- min_prompt_len = min(len(t) for t in prompt_tokens)
187
- max_prompt_len = max(len(t) for t in prompt_tokens)
188
-
189
- if max_prompt_len >= params.max_seq_len:
190
- cprint(
191
- f"Out of token budget {max_prompt_len} vs {params.max_seq_len}",
192
- color="red",
193
- file=sys.stderr,
194
- )
195
- return
196
-
197
- total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
198
-
199
- pad_id = self.tokenizer.pad_id
200
- tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long)
201
- for k, t in enumerate(prompt_tokens):
202
- tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long)
203
- if logprobs:
204
- token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
205
-
206
- is_vision = not isinstance(self.model, Transformer)
207
- if is_vision:
208
- images = [inp.vision.images if inp.vision is not None else [] for inp in llm_inputs]
209
- mask = [inp.vision.mask if inp.vision is not None else [] for inp in llm_inputs]
210
-
211
- xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = self.model.compute_vision_tokens_masks(
212
- batch_images=images,
213
- batch_masks=mask,
214
- total_len=total_len,
215
- device=tokens.device,
216
- )
217
-
218
- eos_reached = torch.tensor([False] * bsz)
219
- input_text_mask = tokens != pad_id
220
-
221
- if echo:
222
- for i in range(max_prompt_len):
223
- results = []
224
- for j, t in enumerate(tokens[:, i]):
225
- results.append(
226
- GenerationResult(
227
- token=t.item(),
228
- text=self.tokenizer.decode([t.item()]),
229
- source="input",
230
- logprobs=(token_logprobs[j, i : i + 1].tolist() if logprobs else None),
231
- batch_idx=j,
232
- finished=False,
233
- ignore_token=t.item() == pad_id,
234
- )
235
- )
236
- yield results
237
-
238
- stop_tokens = torch.tensor(self.tokenizer.stop_tokens)
239
-
240
- prev_pos = 0
241
- for cur_pos in range(min_prompt_len, total_len):
242
- if is_vision:
243
- position_ids = torch.arange(prev_pos, cur_pos, dtype=torch.long)
244
- text_only_inference = all(inp.vision is None for inp in llm_inputs)
245
- logits = self.model.forward(
246
- position_ids,
247
- tokens,
248
- cross_attention_masks,
249
- full_text_row_masked_out_mask,
250
- xattn_caches,
251
- text_only_inference,
252
- )
253
- else:
254
- logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
255
-
256
- if logits_processor is not None:
257
- logits = logits_processor(tokens[:, :cur_pos], logits)
258
-
259
- if temperature > 0:
260
- probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
261
- next_token = sample_top_p(probs, top_p)
262
- else:
263
- next_token = torch.argmax(logits[:, -1], dim=-1)
264
-
265
- next_token = next_token.reshape(-1)
266
- # only replace token if prompt has already been generated
267
- next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token)
268
- tokens[:, cur_pos] = next_token
269
-
270
- target = tokens[:, prev_pos + 1 : cur_pos + 1]
271
- if is_vision:
272
- # the logits space (num_classes) is designed to never contain a media_token
273
- # however our input token stream does contain them. we need to nuke them here
274
- # or else the CUDA kernels will crash with an illegal memory access
275
- vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256]
276
- masks = [target.eq(t) for t in vision_tokens]
277
- if len(masks) > 1:
278
- mask = torch.logical_or(*masks)
279
- else:
280
- mask = masks[0]
281
- target[mask] = 0
282
-
283
- if logprobs:
284
- token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
285
- input=logits.transpose(1, 2),
286
- target=target,
287
- reduction="none",
288
- ignore_index=pad_id,
289
- )
290
- eos_reached |= (~input_text_mask[:, cur_pos]) & (torch.isin(next_token, stop_tokens))
291
- results = []
292
- for idx, t in enumerate(next_token):
293
- results.append(
294
- GenerationResult(
295
- token=t.item(),
296
- text=self.tokenizer.decode([t.item()]),
297
- source="output",
298
- logprobs=(token_logprobs[idx, cur_pos : cur_pos + 1].tolist() if logprobs else None),
299
- batch_idx=idx,
300
- finished=eos_reached[idx].item(),
301
- ignore_token=cur_pos < len(prompt_tokens[idx]),
302
- )
303
- )
304
- yield results
305
-
306
- prev_pos = cur_pos
307
- if all(eos_reached):
308
- break
309
-
310
- def completion(
311
- self,
312
- contents: list[RawContent],
313
- temperature: float = 0.6,
314
- top_p: float = 0.9,
315
- max_gen_len: int | None = None,
316
- logprobs: bool = False,
317
- echo: bool = False,
318
- ) -> Generator[list[GenerationResult], None, None]:
319
- model_inputs = [self.formatter.encode_content(c) for c in contents]
320
- for result in self.generate(
321
- model_inputs=model_inputs,
322
- temperature=temperature,
323
- top_p=top_p,
324
- max_gen_len=max_gen_len,
325
- logprobs=logprobs,
326
- echo=echo,
327
- ):
328
- yield result
329
- if all(r.finished for r in result):
330
- break
331
-
332
- def chat_completion(
333
- self,
334
- messages_batch: list[list[RawMessage]],
335
- temperature: float = 0.6,
336
- top_p: float = 0.9,
337
- max_gen_len: int | None = None,
338
- logprobs: bool = False,
339
- tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
340
- echo: bool = False,
341
- ) -> Generator[list[GenerationResult], None, None]:
342
- model_inputs = [self.formatter.encode_dialog_prompt(messages) for messages in messages_batch]
343
- for result in self.generate(
344
- model_inputs=model_inputs,
345
- temperature=temperature,
346
- top_p=top_p,
347
- max_gen_len=max_gen_len,
348
- logprobs=logprobs,
349
- echo=echo,
350
- ):
351
- yield result
352
- if all(r.finished for r in result):
353
- break
354
-
355
-
356
- def sample_top_p(probs, p):
357
- """
358
- Perform top-p (nucleus) sampling on a probability distribution.
359
-
360
- Args:
361
- probs (torch.Tensor): Probability distribution tensor.
362
- p (float): Probability threshold for top-p sampling.
363
-
364
- Returns:
365
- torch.Tensor: Sampled token indices.
366
-
367
- Note:
368
- Top-p sampling selects the smallest set of tokens whose cumulative probability mass
369
- exceeds the threshold p. The distribution is renormalized based on the selected tokens.
370
- """
371
- probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
372
- probs_sum = torch.cumsum(probs_sort, dim=-1)
373
- mask = probs_sum - probs_sort > p
374
- probs_sort[mask] = 0.0
375
- probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
376
- next_token = torch.multinomial(probs_sort, num_samples=1)
377
- next_token = torch.gather(probs_idx, -1, next_token)
378
- return next_token
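The generation.py hunk above ends with the nucleus (top-p) sampling helper. As a standalone point of reference only (not part of the diff), here is a minimal runnable sketch of that routine, assuming nothing beyond an installed PyTorch:

```python
import torch

def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
    # Sort probabilities in descending order and accumulate their mass.
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # Zero out tokens once the cumulative mass excluding the current token exceeds p.
    probs_sort[probs_sum - probs_sort > p] = 0.0
    # Renormalize the surviving tokens and sample one index per row.
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    # Map the sampled position back to the original vocabulary index.
    return torch.gather(probs_idx, -1, next_token)

# Example: sample from a softmaxed logits row with p = 0.9.
logits = torch.randn(1, 128)
token = sample_top_p(torch.softmax(logits, dim=-1), p=0.9)
```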
@@ -1,304 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the terms described in the LICENSE file in
5
- # the root directory of this source tree.
6
-
7
- import math
8
-
9
- import fairscale.nn.model_parallel.initialize as fs_init
10
- import torch
11
- import torch.nn.functional as F
12
- from fairscale.nn.model_parallel.layers import (
13
- ColumnParallelLinear,
14
- RowParallelLinear,
15
- VocabParallelEmbedding,
16
- )
17
- from torch import nn
18
-
19
- from .args import ModelArgs
20
-
21
- # **NOTE**: This code is not runnable without installing `torch` and `fairscale`
22
- # dependencies. These dependencies are not part of the default dependencies
23
- # (requirements.txt) of the `llama-models` package.
24
-
25
-
26
- class RMSNorm(torch.nn.Module):
27
- def __init__(self, dim: int, eps: float = 1e-6):
28
- super().__init__()
29
- self.eps = eps
30
- self.weight = nn.Parameter(torch.ones(dim))
31
-
32
- def _norm(self, x):
33
- return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
34
-
35
- def forward(self, x):
36
- output = self._norm(x.float()).type_as(x)
37
- return output * self.weight
38
-
39
-
40
- def apply_scaling(freqs: torch.Tensor) -> torch.Tensor:
41
- # Values obtained from grid search
42
- scale_factor = 8
43
- low_freq_factor = 1
44
- high_freq_factor = 4
45
- old_context_len = 8192 # original llama3 length
46
-
47
- low_freq_wavelen = old_context_len / low_freq_factor
48
- high_freq_wavelen = old_context_len / high_freq_factor
49
-
50
- wavelen = 2 * torch.pi / freqs
51
- new_freqs = torch.where(wavelen > low_freq_wavelen, freqs / scale_factor, freqs)
52
- smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
53
- return torch.where(
54
- (wavelen >= high_freq_wavelen) & (wavelen <= low_freq_wavelen),
55
- (1 - smooth) * new_freqs / scale_factor + smooth * new_freqs,
56
- new_freqs,
57
- )
58
-
59
-
60
- def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, use_scaled: bool = False):
61
- freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
62
- t = torch.arange(end, device=freqs.device, dtype=torch.float32)
63
- if use_scaled:
64
- freqs = apply_scaling(freqs)
65
- freqs = torch.outer(t, freqs)
66
- freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
67
- return freqs_cis
68
-
69
-
70
- def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
71
- ndim = x.ndim
72
- assert 0 <= 1 < ndim
73
- assert freqs_cis.shape == (x.shape[1], x.shape[-1])
74
- shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
75
- return freqs_cis.view(*shape)
76
-
77
-
78
- def apply_rotary_emb(
79
- xq: torch.Tensor,
80
- xk: torch.Tensor,
81
- freqs_cis: torch.Tensor,
82
- ) -> tuple[torch.Tensor, torch.Tensor]:
83
- xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
84
- xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
85
- freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
86
- xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
87
- xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
88
- return xq_out.type_as(xq), xk_out.type_as(xk)
89
-
90
-
91
- def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
92
- """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
93
- bs, slen, n_kv_heads, head_dim = x.shape
94
- if n_rep == 1:
95
- return x
96
- return (
97
- x[:, :, :, None, :]
98
- .expand(bs, slen, n_kv_heads, n_rep, head_dim)
99
- .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
100
- )
101
-
102
-
103
- class Attention(nn.Module):
104
- def __init__(self, args: ModelArgs):
105
- super().__init__()
106
- self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
107
- world_size = fs_init.get_model_parallel_world_size()
108
- self.n_local_heads = args.n_heads // world_size
109
- self.n_local_kv_heads = self.n_kv_heads // world_size
110
- self.n_rep = self.n_local_heads // self.n_local_kv_heads
111
- self.head_dim = args.dim // args.n_heads
112
-
113
- self.wq = ColumnParallelLinear(
114
- args.dim,
115
- args.n_heads * self.head_dim,
116
- bias=False,
117
- gather_output=False,
118
- init_method=lambda x: x,
119
- )
120
- self.wk = ColumnParallelLinear(
121
- args.dim,
122
- self.n_kv_heads * self.head_dim,
123
- bias=False,
124
- gather_output=False,
125
- init_method=lambda x: x,
126
- )
127
- self.wv = ColumnParallelLinear(
128
- args.dim,
129
- self.n_kv_heads * self.head_dim,
130
- bias=False,
131
- gather_output=False,
132
- init_method=lambda x: x,
133
- )
134
- self.wo = RowParallelLinear(
135
- args.n_heads * self.head_dim,
136
- args.dim,
137
- bias=False,
138
- input_is_parallel=True,
139
- init_method=lambda x: x,
140
- )
141
-
142
- self.cache_k = torch.zeros(
143
- (
144
- args.max_batch_size,
145
- args.max_seq_len,
146
- self.n_local_kv_heads,
147
- self.head_dim,
148
- )
149
- )
150
- self.cache_v = torch.zeros(
151
- (
152
- args.max_batch_size,
153
- args.max_seq_len,
154
- self.n_local_kv_heads,
155
- self.head_dim,
156
- )
157
- )
158
-
159
- def forward(
160
- self,
161
- x: torch.Tensor,
162
- start_pos: int,
163
- freqs_cis: torch.Tensor,
164
- mask: torch.Tensor | None,
165
- ):
166
- bsz, seqlen, _ = x.shape
167
- xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
168
-
169
- xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
170
- xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
171
- xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
172
-
173
- xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
174
-
175
- self.cache_k = self.cache_k.to(xq)
176
- self.cache_v = self.cache_v.to(xq)
177
-
178
- self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
179
- self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv
180
-
181
- keys = self.cache_k[:bsz, : start_pos + seqlen]
182
- values = self.cache_v[:bsz, : start_pos + seqlen]
183
-
184
- # repeat k/v heads if n_kv_heads < n_heads
185
- keys = repeat_kv(keys, self.n_rep) # (bs, cache_len + seqlen, n_local_heads, head_dim)
186
- values = repeat_kv(values, self.n_rep) # (bs, cache_len + seqlen, n_local_heads, head_dim)
187
-
188
- xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
189
- keys = keys.transpose(1, 2) # (bs, n_local_heads, cache_len + seqlen, head_dim)
190
- values = values.transpose(1, 2) # (bs, n_local_heads, cache_len + seqlen, head_dim)
191
- scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
192
- if mask is not None:
193
- scores = scores + mask # (bs, n_local_heads, seqlen, cache_len + seqlen)
194
- scores = F.softmax(scores.float(), dim=-1).type_as(xq)
195
- output = torch.matmul(scores, values) # (bs, n_local_heads, seqlen, head_dim)
196
- output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
197
- return self.wo(output)
198
-
199
-
200
- class FeedForward(nn.Module):
201
- def __init__(
202
- self,
203
- dim: int,
204
- hidden_dim: int,
205
- multiple_of: int,
206
- ffn_dim_multiplier: float | None,
207
- ):
208
- super().__init__()
209
- hidden_dim = int(2 * hidden_dim / 3)
210
- # custom dim factor multiplier
211
- if ffn_dim_multiplier is not None:
212
- hidden_dim = int(ffn_dim_multiplier * hidden_dim)
213
- hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
214
-
215
- self.w1 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
216
- self.w2 = RowParallelLinear(hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x)
217
- self.w3 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
218
-
219
- def forward(self, x):
220
- return self.w2(F.silu(self.w1(x)) * self.w3(x))
221
-
222
-
223
- class TransformerBlock(nn.Module):
224
- def __init__(self, layer_id: int, args: ModelArgs):
225
- super().__init__()
226
- self.n_heads = args.n_heads
227
- self.dim = args.dim
228
- self.head_dim = args.dim // args.n_heads
229
- self.attention = Attention(args)
230
- self.feed_forward = FeedForward(
231
- dim=args.dim,
232
- hidden_dim=4 * args.dim,
233
- multiple_of=args.multiple_of,
234
- ffn_dim_multiplier=args.ffn_dim_multiplier,
235
- )
236
- self.layer_id = layer_id
237
- self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
238
- self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
239
-
240
- def forward(
241
- self,
242
- x: torch.Tensor,
243
- start_pos: int,
244
- freqs_cis: torch.Tensor,
245
- mask: torch.Tensor | None,
246
- ):
247
- h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
248
- out = h + self.feed_forward(self.ffn_norm(h))
249
- return out
250
-
251
-
252
- class Transformer(nn.Module):
253
- def __init__(self, params: ModelArgs):
254
- super().__init__()
255
- self.params = params
256
- self.vocab_size = params.vocab_size
257
- self.n_layers = params.n_layers
258
-
259
- self.tok_embeddings = VocabParallelEmbedding(params.vocab_size, params.dim, init_method=lambda x: x)
260
-
261
- self.layers = torch.nn.ModuleList()
262
- for layer_id in range(params.n_layers):
263
- self.layers.append(TransformerBlock(layer_id, params))
264
-
265
- self.norm = RMSNorm(params.dim, eps=params.norm_eps)
266
- self.output = ColumnParallelLinear(params.dim, params.vocab_size, bias=False, init_method=lambda x: x)
267
-
268
- self.freqs_cis = precompute_freqs_cis(
269
- params.dim // params.n_heads,
270
- params.max_seq_len * 2,
271
- params.rope_theta,
272
- params.use_scaled_rope,
273
- )
274
-
275
- @torch.inference_mode()
276
- def forward(self, tokens: torch.Tensor, start_pos: int):
277
- _bsz, seqlen = tokens.shape
278
- h = self.tok_embeddings(tokens)
279
- self.freqs_cis = self.freqs_cis.to(h.device)
280
- freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
281
-
282
- mask = None
283
- if seqlen > 1:
284
- mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device)
285
-
286
- mask = torch.triu(mask, diagonal=1)
287
-
288
- # https://github.com/pytorch/pytorch/issues/100005
289
- # torch.triu is buggy when the device is mps: filled values are
290
- # nan instead of 0.
291
- if mask.device.type == torch.device("mps").type:
292
- mask = torch.nan_to_num(mask, nan=0.0)
293
-
294
- # When performing key-value caching, we compute the attention scores
295
- # only for the new sequence. Thus, the matrix of scores is of size
296
- # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
297
- # j > cache_len + i, since row i corresponds to token cache_len + i.
298
- mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
299
-
300
- for layer in self.layers:
301
- h = layer(h, start_pos, freqs_cis, mask)
302
- h = self.norm(h)
303
- output = self.output(h).float()
304
- return output
@@ -1,12 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # top-level folder for each specific model found within the models/ directory at
- # the top-level of this source tree.
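The model.py hunk above likewise removes the rotary position embedding (RoPE) helpers. As a standalone point of reference only (not part of the diff), a minimal sketch of the precompute/apply pair follows, assuming PyTorch is installed and ignoring the Llama 3.1 frequency-scaling branch:

```python
import torch

def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
    # Inverse frequencies for each even/odd channel pair, combined with positions
    # via an outer product and expressed as unit-magnitude complex phases.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
    t = torch.arange(end, dtype=torch.float32)
    angles = torch.outer(t, freqs)
    return torch.polar(torch.ones_like(angles), angles)  # complex64, shape (end, dim // 2)

def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor):
    # View the head dimension as complex pairs and rotate by the precomputed phases.
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    # Broadcast freqs_cis over the batch and head dimensions: (1, seqlen, 1, dim // 2).
    freqs = freqs_cis[: xq_.shape[1]].view(1, xq_.shape[1], 1, xq_.shape[-1])
    xq_out = torch.view_as_real(xq_ * freqs).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)

# Example: rotate query/key tensors of shape (batch, seqlen, n_heads, head_dim).
xq = torch.randn(2, 16, 8, 64)
xk = torch.randn(2, 16, 8, 64)
freqs_cis = precompute_freqs_cis(dim=64, end=16)
xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis)
```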