PyPI - llama-stack - Versions diffs - 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (738) hide show

llama_stack/apis/evals/evals.py DELETED Viewed

@@ -1,122 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from enum import Enum
-from typing import List, Protocol
-from llama_models.schema_utils import webmethod
-from pydantic import BaseModel
-from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.dataset import *  # noqa: F403
-from llama_stack.apis.common.training_types import *  # noqa: F403
-class TextGenerationMetric(Enum):
-    perplexity = "perplexity"
-    rouge = "rouge"
-    bleu = "bleu"
-class QuestionAnsweringMetric(Enum):
-    em = "em"
-    f1 = "f1"
-class SummarizationMetric(Enum):
-    rouge = "rouge"
-    bleu = "bleu"
-class EvaluationJob(BaseModel):
-    job_uuid: str
-class EvaluationJobLogStream(BaseModel):
-    job_uuid: str
-class EvaluateTaskRequestCommon(BaseModel):
-    job_uuid: str
-    dataset: TrainEvalDataset
-    checkpoint: Checkpoint
-    # generation params
-    sampling_params: SamplingParams = SamplingParams()
-@json_schema_type
-class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon):
-    """Request to evaluate text generation."""
-    metrics: List[TextGenerationMetric]
-@json_schema_type
-class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon):
-    """Request to evaluate question answering."""
-    metrics: List[QuestionAnsweringMetric]
-@json_schema_type
-class EvaluateSummarizationRequest(EvaluateTaskRequestCommon):
-    """Request to evaluate summarization."""
-    metrics: List[SummarizationMetric]
-class EvaluationJobStatusResponse(BaseModel):
-    job_uuid: str
-@json_schema_type
-class EvaluationJobArtifactsResponse(BaseModel):
-    """Artifacts of a evaluation job."""
-    job_uuid: str
-class Evaluations(Protocol):
-    @webmethod(route="/evaluate/text_generation/")
-    def evaluate_text_generation(
-        self,
-        metrics: List[TextGenerationMetric],
-    ) -> EvaluationJob: ...
-    @webmethod(route="/evaluate/question_answering/")
-    def evaluate_question_answering(
-        self,
-        metrics: List[QuestionAnsweringMetric],
-    ) -> EvaluationJob: ...
-    @webmethod(route="/evaluate/summarization/")
-    def evaluate_summarization(
-        self,
-        metrics: List[SummarizationMetric],
-    ) -> EvaluationJob: ...
-    @webmethod(route="/evaluate/jobs")
-    def get_evaluation_jobs(self) -> List[EvaluationJob]: ...
-    @webmethod(route="/evaluate/job/status")
-    def get_evaluation_job_status(
-        self, job_uuid: str
-    ) -> EvaluationJobStatusResponse: ...
-    # sends SSE stream of logs
-    @webmethod(route="/evaluate/job/logs")
-    def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...
-    @webmethod(route="/evaluate/job/cancel")
-    def cancel_evaluation_job(self, job_uuid: str) -> None: ...
-    @webmethod(route="/evaluate/job/artifacts")
-    def get_evaluation_job_artifacts(
-        self, job_uuid: str
-    ) -> EvaluationJobArtifactsResponse: ...

llama_stack/apis/inference/client.py DELETED Viewed

@@ -1,197 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import asyncio
-import json
-from typing import Any, AsyncGenerator, List, Optional
-import fire
-import httpx
-from llama_models.llama3.api.datatypes import ImageMedia, URL
-from pydantic import BaseModel
-from llama_models.llama3.api import *  # noqa: F403
-from llama_stack.apis.inference import *  # noqa: F403
-from termcolor import cprint
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from .event_logger import EventLogger
-async def get_client_impl(config: RemoteProviderConfig, _deps: Any) -> Inference:
-    return InferenceClient(config.url)
-def encodable_dict(d: BaseModel):
-    return json.loads(d.json())
-class InferenceClient(Inference):
-    def __init__(self, base_url: str):
-        self.base_url = base_url
-    async def initialize(self) -> None:
-        pass
-    async def shutdown(self) -> None:
-        pass
-    def completion(self, request: CompletionRequest) -> AsyncGenerator:
-        raise NotImplementedError()
-    def chat_completion(
-        self,
-        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        request = ChatCompletionRequest(
-            model=model,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            stream=stream,
-            logprobs=logprobs,
-        )
-        if stream:
-            return self._stream_chat_completion(request)
-        else:
-            return self._nonstream_chat_completion(request)
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest
-    ) -> ChatCompletionResponse:
-        async with httpx.AsyncClient() as client:
-            response = await client.post(
-                f"{self.base_url}/inference/chat_completion",
-                json=encodable_dict(request),
-                headers={"Content-Type": "application/json"},
-                timeout=20,
-            )
-            response.raise_for_status()
-            j = response.json()
-            return ChatCompletionResponse(**j)
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest
-    ) -> AsyncGenerator:
-        async with httpx.AsyncClient() as client:
-            async with client.stream(
-                "POST",
-                f"{self.base_url}/inference/chat_completion",
-                json=encodable_dict(request),
-                headers={"Content-Type": "application/json"},
-                timeout=20,
-            ) as response:
-                if response.status_code != 200:
-                    content = await response.aread()
-                    cprint(
-                        f"Error: HTTP {response.status_code} {content.decode()}",
-                        "red",
-                    )
-                    return
-                async for line in response.aiter_lines():
-                    if line.startswith("data:"):
-                        data = line[len("data: ") :]
-                        try:
-                            if "error" in data:
-                                cprint(data, "red")
-                                continue
-                            yield ChatCompletionResponseStreamChunk(**json.loads(data))
-                        except Exception as e:
-                            print(data)
-                            print(f"Error with parsing or validation: {e}")
-async def run_main(
-    host: str, port: int, stream: bool, model: Optional[str], logprobs: bool
-):
-    client = InferenceClient(f"http://{host}:{port}")
-    if not model:
-        model = "Llama3.1-8B-Instruct"
-    message = UserMessage(
-        content="hello world, write me a 2 sentence poem about the moon"
-    )
-    cprint(f"User>{message.content}", "green")
-    if logprobs:
-        logprobs_config = LogProbConfig(
-            top_k=1,
-        )
-    else:
-        logprobs_config = None
-    iterator = client.chat_completion(
-        model=model,
-        messages=[message],
-        stream=stream,
-        logprobs=logprobs_config,
-    )
-    if logprobs:
-        async for chunk in iterator:
-            cprint(f"Response: {chunk}", "red")
-    else:
-        async for log in EventLogger().log(iterator):
-            log.print()
-async def run_mm_main(
-    host: str, port: int, stream: bool, path: Optional[str], model: Optional[str]
-):
-    client = InferenceClient(f"http://{host}:{port}")
-    if not model:
-        model = "Llama3.2-11B-Vision-Instruct"
-    message = UserMessage(
-        content=[
-            ImageMedia(image=URL(uri=f"file://{path}")),
-            "Describe this image in two sentences",
-        ],
-    )
-    cprint(f"User>{message.content}", "green")
-    iterator = client.chat_completion(
-        model=model,
-        messages=[message],
-        stream=stream,
-    )
-    async for log in EventLogger().log(iterator):
-        log.print()
-def main(
-    host: str,
-    port: int,
-    stream: bool = True,
-    mm: bool = False,
-    logprobs: bool = False,
-    file: Optional[str] = None,
-    model: Optional[str] = None,
-):
-    if mm:
-        asyncio.run(run_mm_main(host, port, stream, file, model))
-    else:
-        asyncio.run(run_main(host, port, stream, model, logprobs))
-if __name__ == "__main__":
-    fire.Fire(main)

llama_stack/apis/inspect/client.py DELETED Viewed

@@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import asyncio
-from typing import List
-import fire
-import httpx
-from termcolor import cprint
-from .inspect import *  # noqa: F403
-class InspectClient(Inspect):
-    def __init__(self, base_url: str):
-        self.base_url = base_url
-    async def initialize(self) -> None:
-        pass
-    async def shutdown(self) -> None:
-        pass
-    async def list_providers(self) -> Dict[str, ProviderInfo]:
-        async with httpx.AsyncClient() as client:
-            response = await client.get(
-                f"{self.base_url}/providers/list",
-                headers={"Content-Type": "application/json"},
-            )
-            response.raise_for_status()
-            print(response.json())
-            return {
-                k: [ProviderInfo(**vi) for vi in v] for k, v in response.json().items()
-            }
-    async def list_routes(self) -> Dict[str, List[RouteInfo]]:
-        async with httpx.AsyncClient() as client:
-            response = await client.get(
-                f"{self.base_url}/routes/list",
-                headers={"Content-Type": "application/json"},
-            )
-            response.raise_for_status()
-            return {
-                k: [RouteInfo(**vi) for vi in v] for k, v in response.json().items()
-            }
-    async def health(self) -> HealthInfo:
-        async with httpx.AsyncClient() as client:
-            response = await client.get(
-                f"{self.base_url}/health",
-                headers={"Content-Type": "application/json"},
-            )
-            response.raise_for_status()
-            j = response.json()
-            if j is None:
-                return None
-            return HealthInfo(**j)
-async def run_main(host: str, port: int):
-    client = InspectClient(f"http://{host}:{port}")
-    response = await client.list_providers()
-    cprint(f"list_providers response={response}", "green")
-    response = await client.list_routes()
-    cprint(f"list_routes response={response}", "blue")
-    response = await client.health()
-    cprint(f"health response={response}", "yellow")
-def main(host: str, port: int):
-    asyncio.run(run_main(host, port))
-if __name__ == "__main__":
-    fire.Fire(main)

llama_stack/apis/memory/client.py DELETED Viewed

@@ -1,155 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import asyncio
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-import fire
-import httpx
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.apis.memory import *  # noqa: F403
-from llama_stack.apis.memory_banks.client import MemoryBanksClient
-from llama_stack.providers.utils.memory.file_utils import data_url_from_file
-async def get_client_impl(config: RemoteProviderConfig, _deps: Any) -> Memory:
-    return MemoryClient(config.url)
-class MemoryClient(Memory):
-    def __init__(self, base_url: str):
-        self.base_url = base_url
-    async def initialize(self) -> None:
-        pass
-    async def shutdown(self) -> None:
-        pass
-    async def insert_documents(
-        self,
-        bank_id: str,
-        documents: List[MemoryBankDocument],
-    ) -> None:
-        async with httpx.AsyncClient() as client:
-            r = await client.post(
-                f"{self.base_url}/memory/insert",
-                json={
-                    "bank_id": bank_id,
-                    "documents": [d.dict() for d in documents],
-                },
-                headers={"Content-Type": "application/json"},
-                timeout=20,
-            )
-            r.raise_for_status()
-    async def query_documents(
-        self,
-        bank_id: str,
-        query: InterleavedTextMedia,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> QueryDocumentsResponse:
-        async with httpx.AsyncClient() as client:
-            r = await client.post(
-                f"{self.base_url}/memory/query",
-                json={
-                    "bank_id": bank_id,
-                    "query": query,
-                    "params": params,
-                },
-                headers={"Content-Type": "application/json"},
-                timeout=20,
-            )
-            r.raise_for_status()
-            return QueryDocumentsResponse(**r.json())
-async def run_main(host: str, port: int, stream: bool):
-    banks_client = MemoryBanksClient(f"http://{host}:{port}")
-    bank = VectorMemoryBankDef(
-        identifier="test_bank",
-        provider_id="",
-        embedding_model="all-MiniLM-L6-v2",
-        chunk_size_in_tokens=512,
-        overlap_size_in_tokens=64,
-    )
-    await banks_client.register_memory_bank(bank)
-    retrieved_bank = await banks_client.get_memory_bank(bank.identifier)
-    assert retrieved_bank is not None
-    assert retrieved_bank.embedding_model == "all-MiniLM-L6-v2"
-    urls = [
-        "memory_optimizations.rst",
-        "chat.rst",
-        "llama3.rst",
-        "datasets.rst",
-        "qat_finetune.rst",
-        "lora_finetune.rst",
-    ]
-    documents = [
-        MemoryBankDocument(
-            document_id=f"num-{i}",
-            content=URL(
-                uri=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}"
-            ),
-            mime_type="text/plain",
-        )
-        for i, url in enumerate(urls)
-    ]
-    this_dir = os.path.dirname(__file__)
-    files = [Path(this_dir).parent.parent.parent / "CONTRIBUTING.md"]
-    documents += [
-        MemoryBankDocument(
-            document_id=f"num-{i}",
-            content=data_url_from_file(path),
-        )
-        for i, path in enumerate(files)
-    ]
-    client = MemoryClient(f"http://{host}:{port}")
-    # insert some documents
-    await client.insert_documents(
-        bank_id=bank.identifier,
-        documents=documents,
-    )
-    # query the documents
-    response = await client.query_documents(
-        bank_id=bank.identifier,
-        query=[
-            "How do I use Lora?",
-        ],
-    )
-    for chunk, score in zip(response.chunks, response.scores):
-        print(f"Score: {score}")
-        print(f"Chunk:\n========\n{chunk}\n========\n")
-    response = await client.query_documents(
-        bank_id=bank.identifier,
-        query=[
-            "Tell me more about llama3 and torchtune",
-        ],
-    )
-    for chunk, score in zip(response.chunks, response.scores):
-        print(f"Score: {score}")
-        print(f"Chunk:\n========\n{chunk}\n========\n")
-def main(host: str, port: int, stream: bool = True):
-    asyncio.run(run_main(host, port, stream))
-if __name__ == "__main__":
-    fire.Fire(main)

llama_stack/apis/memory/memory.py DELETED Viewed

@@ -1,65 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import List, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.memory_banks import *  # noqa: F403
-@json_schema_type
-class MemoryBankDocument(BaseModel):
-    document_id: str
-    content: InterleavedTextMedia | URL
-    mime_type: str | None = None
-    metadata: Dict[str, Any] = Field(default_factory=dict)
-class Chunk(BaseModel):
-    content: InterleavedTextMedia
-    token_count: int
-    document_id: str
-@json_schema_type
-class QueryDocumentsResponse(BaseModel):
-    chunks: List[Chunk]
-    scores: List[float]
-class MemoryBankStore(Protocol):
-    def get_memory_bank(self, bank_id: str) -> Optional[MemoryBankDef]: ...
-@runtime_checkable
-class Memory(Protocol):
-    memory_bank_store: MemoryBankStore
-    # this will just block now until documents are inserted, but it should
-    # probably return a Job instance which can be polled for completion
-    @webmethod(route="/memory/insert")
-    async def insert_documents(
-        self,
-        bank_id: str,
-        documents: List[MemoryBankDocument],
-        ttl_seconds: Optional[int] = None,
-    ) -> None: ...
-    @webmethod(route="/memory/query")
-    async def query_documents(
-        self,
-        bank_id: str,
-        query: InterleavedTextMedia,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> QueryDocumentsResponse: ...

llama_stack/apis/memory_banks/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from .memory_banks import *  # noqa: F401 F403

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl