llama-stack 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (460)
  1. llama_stack/__init__.py +0 -5
  2. llama_stack/cli/llama.py +3 -3
  3. llama_stack/cli/stack/_list_deps.py +12 -23
  4. llama_stack/cli/stack/list_stacks.py +37 -18
  5. llama_stack/cli/stack/run.py +121 -11
  6. llama_stack/cli/stack/utils.py +0 -127
  7. llama_stack/core/access_control/access_control.py +69 -28
  8. llama_stack/core/access_control/conditions.py +15 -5
  9. llama_stack/core/admin.py +267 -0
  10. llama_stack/core/build.py +6 -74
  11. llama_stack/core/client.py +1 -1
  12. llama_stack/core/configure.py +6 -6
  13. llama_stack/core/conversations/conversations.py +28 -25
  14. llama_stack/core/datatypes.py +271 -79
  15. llama_stack/core/distribution.py +15 -16
  16. llama_stack/core/external.py +3 -3
  17. llama_stack/core/inspect.py +98 -15
  18. llama_stack/core/library_client.py +73 -61
  19. llama_stack/core/prompts/prompts.py +12 -11
  20. llama_stack/core/providers.py +17 -11
  21. llama_stack/core/resolver.py +65 -56
  22. llama_stack/core/routers/__init__.py +8 -12
  23. llama_stack/core/routers/datasets.py +1 -4
  24. llama_stack/core/routers/eval_scoring.py +7 -4
  25. llama_stack/core/routers/inference.py +55 -271
  26. llama_stack/core/routers/safety.py +52 -24
  27. llama_stack/core/routers/tool_runtime.py +6 -48
  28. llama_stack/core/routers/vector_io.py +130 -51
  29. llama_stack/core/routing_tables/benchmarks.py +24 -20
  30. llama_stack/core/routing_tables/common.py +1 -4
  31. llama_stack/core/routing_tables/datasets.py +22 -22
  32. llama_stack/core/routing_tables/models.py +119 -6
  33. llama_stack/core/routing_tables/scoring_functions.py +7 -7
  34. llama_stack/core/routing_tables/shields.py +1 -2
  35. llama_stack/core/routing_tables/toolgroups.py +17 -7
  36. llama_stack/core/routing_tables/vector_stores.py +51 -16
  37. llama_stack/core/server/auth.py +5 -3
  38. llama_stack/core/server/auth_providers.py +36 -20
  39. llama_stack/core/server/fastapi_router_registry.py +84 -0
  40. llama_stack/core/server/quota.py +2 -2
  41. llama_stack/core/server/routes.py +79 -27
  42. llama_stack/core/server/server.py +102 -87
  43. llama_stack/core/stack.py +235 -62
  44. llama_stack/core/storage/datatypes.py +26 -3
  45. llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
  46. llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
  47. llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
  48. llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
  49. llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
  50. llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
  51. llama_stack/core/storage/sqlstore/__init__.py +17 -0
  52. llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
  53. llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
  54. llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
  55. llama_stack/core/store/registry.py +1 -1
  56. llama_stack/core/utils/config.py +8 -2
  57. llama_stack/core/utils/config_resolution.py +32 -29
  58. llama_stack/core/utils/context.py +4 -10
  59. llama_stack/core/utils/exec.py +9 -0
  60. llama_stack/core/utils/type_inspection.py +45 -0
  61. llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
  62. llama_stack/distributions/dell/dell.py +2 -2
  63. llama_stack/distributions/dell/run-with-safety.yaml +3 -2
  64. llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
  65. llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
  66. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
  67. llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
  68. llama_stack/distributions/nvidia/nvidia.py +1 -1
  69. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
  70. llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
  71. llama_stack/distributions/oci/config.yaml +134 -0
  72. llama_stack/distributions/oci/oci.py +108 -0
  73. llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
  74. llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
  75. llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
  76. llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
  77. llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
  78. llama_stack/distributions/starter/starter.py +8 -5
  79. llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
  80. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
  81. llama_stack/distributions/template.py +13 -69
  82. llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
  83. llama_stack/distributions/watsonx/watsonx.py +1 -1
  84. llama_stack/log.py +28 -11
  85. llama_stack/models/llama/checkpoint.py +6 -6
  86. llama_stack/models/llama/hadamard_utils.py +2 -0
  87. llama_stack/models/llama/llama3/generation.py +3 -1
  88. llama_stack/models/llama/llama3/interface.py +2 -5
  89. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
  90. llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
  91. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
  92. llama_stack/models/llama/llama3/tool_utils.py +2 -1
  93. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
  94. llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
  95. llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
  96. llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
  97. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
  98. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
  99. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
  100. llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
  101. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
  102. llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
  103. llama_stack/providers/inline/batches/reference/__init__.py +2 -4
  104. llama_stack/providers/inline/batches/reference/batches.py +78 -60
  105. llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
  106. llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
  107. llama_stack/providers/inline/files/localfs/files.py +37 -28
  108. llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
  109. llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
  110. llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
  111. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
  112. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
  113. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
  114. llama_stack/providers/inline/post_training/common/validator.py +1 -5
  115. llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
  116. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
  117. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
  118. llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
  119. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
  120. llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
  121. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
  122. llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
  123. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
  124. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
  125. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
  126. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
  127. llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
  128. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
  129. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
  130. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
  131. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
  132. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
  133. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
  134. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
  135. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
  136. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
  137. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
  138. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
  139. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
  140. llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
  141. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
  142. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
  143. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
  144. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
  145. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
  146. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
  147. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
  148. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
  149. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
  150. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
  151. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
  152. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
  153. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
  154. llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
  155. llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
  156. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
  157. llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
  158. llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
  159. llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
  160. llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
  161. llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
  162. llama_stack/providers/inline/vector_io/faiss/faiss.py +46 -28
  163. llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
  164. llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
  165. llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
  166. llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
  167. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
  168. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +44 -33
  169. llama_stack/providers/registry/agents.py +8 -3
  170. llama_stack/providers/registry/batches.py +1 -1
  171. llama_stack/providers/registry/datasetio.py +1 -1
  172. llama_stack/providers/registry/eval.py +1 -1
  173. llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
  174. llama_stack/providers/registry/files.py +11 -2
  175. llama_stack/providers/registry/inference.py +22 -3
  176. llama_stack/providers/registry/post_training.py +1 -1
  177. llama_stack/providers/registry/safety.py +1 -1
  178. llama_stack/providers/registry/scoring.py +1 -1
  179. llama_stack/providers/registry/tool_runtime.py +2 -2
  180. llama_stack/providers/registry/vector_io.py +7 -7
  181. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
  182. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
  183. llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
  184. llama_stack/providers/remote/files/openai/__init__.py +19 -0
  185. llama_stack/providers/remote/files/openai/config.py +28 -0
  186. llama_stack/providers/remote/files/openai/files.py +253 -0
  187. llama_stack/providers/remote/files/s3/files.py +52 -30
  188. llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
  189. llama_stack/providers/remote/inference/anthropic/config.py +1 -1
  190. llama_stack/providers/remote/inference/azure/azure.py +1 -3
  191. llama_stack/providers/remote/inference/azure/config.py +8 -7
  192. llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
  193. llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
  194. llama_stack/providers/remote/inference/bedrock/config.py +24 -3
  195. llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
  196. llama_stack/providers/remote/inference/cerebras/config.py +12 -5
  197. llama_stack/providers/remote/inference/databricks/config.py +13 -6
  198. llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
  199. llama_stack/providers/remote/inference/fireworks/config.py +5 -5
  200. llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
  201. llama_stack/providers/remote/inference/gemini/config.py +1 -1
  202. llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
  203. llama_stack/providers/remote/inference/groq/config.py +5 -5
  204. llama_stack/providers/remote/inference/groq/groq.py +1 -1
  205. llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
  206. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
  207. llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
  208. llama_stack/providers/remote/inference/nvidia/config.py +21 -11
  209. llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
  210. llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
  211. llama_stack/providers/remote/inference/oci/__init__.py +17 -0
  212. llama_stack/providers/remote/inference/oci/auth.py +79 -0
  213. llama_stack/providers/remote/inference/oci/config.py +75 -0
  214. llama_stack/providers/remote/inference/oci/oci.py +162 -0
  215. llama_stack/providers/remote/inference/ollama/config.py +7 -5
  216. llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
  217. llama_stack/providers/remote/inference/openai/config.py +4 -4
  218. llama_stack/providers/remote/inference/openai/openai.py +1 -1
  219. llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
  220. llama_stack/providers/remote/inference/passthrough/config.py +5 -10
  221. llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
  222. llama_stack/providers/remote/inference/runpod/config.py +12 -5
  223. llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
  224. llama_stack/providers/remote/inference/sambanova/config.py +5 -5
  225. llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
  226. llama_stack/providers/remote/inference/tgi/config.py +7 -6
  227. llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
  228. llama_stack/providers/remote/inference/together/config.py +5 -5
  229. llama_stack/providers/remote/inference/together/together.py +15 -12
  230. llama_stack/providers/remote/inference/vertexai/config.py +1 -1
  231. llama_stack/providers/remote/inference/vllm/config.py +5 -5
  232. llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
  233. llama_stack/providers/remote/inference/watsonx/config.py +4 -4
  234. llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
  235. llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
  236. llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
  237. llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
  238. llama_stack/providers/remote/safety/bedrock/config.py +1 -1
  239. llama_stack/providers/remote/safety/nvidia/config.py +1 -1
  240. llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
  241. llama_stack/providers/remote/safety/sambanova/config.py +1 -1
  242. llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
  243. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
  244. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
  245. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
  246. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
  247. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
  248. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
  249. llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
  250. llama_stack/providers/remote/vector_io/chroma/chroma.py +131 -23
  251. llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
  252. llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
  253. llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
  254. llama_stack/providers/remote/vector_io/milvus/milvus.py +37 -28
  255. llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
  256. llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
  257. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +37 -25
  258. llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
  259. llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
  260. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +147 -30
  261. llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
  262. llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
  263. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +31 -26
  264. llama_stack/providers/utils/common/data_schema_validator.py +1 -5
  265. llama_stack/providers/utils/files/form_data.py +1 -1
  266. llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
  267. llama_stack/providers/utils/inference/inference_store.py +7 -8
  268. llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
  269. llama_stack/providers/utils/inference/model_registry.py +1 -3
  270. llama_stack/providers/utils/inference/openai_compat.py +44 -1171
  271. llama_stack/providers/utils/inference/openai_mixin.py +68 -42
  272. llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
  273. llama_stack/providers/utils/inference/stream_utils.py +23 -0
  274. llama_stack/providers/utils/memory/__init__.py +2 -0
  275. llama_stack/providers/utils/memory/file_utils.py +1 -1
  276. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
  277. llama_stack/providers/utils/memory/vector_store.py +39 -38
  278. llama_stack/providers/utils/pagination.py +1 -1
  279. llama_stack/providers/utils/responses/responses_store.py +15 -25
  280. llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
  281. llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
  282. llama_stack/providers/utils/tools/mcp.py +93 -11
  283. llama_stack/providers/utils/vector_io/__init__.py +16 -0
  284. llama_stack/providers/utils/vector_io/vector_utils.py +36 -0
  285. llama_stack/telemetry/constants.py +27 -0
  286. llama_stack/telemetry/helpers.py +43 -0
  287. llama_stack/testing/api_recorder.py +25 -16
  288. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/METADATA +57 -55
  289. llama_stack-0.4.1.dist-info/RECORD +588 -0
  290. llama_stack-0.4.1.dist-info/top_level.txt +2 -0
  291. llama_stack_api/__init__.py +945 -0
  292. llama_stack_api/admin/__init__.py +45 -0
  293. llama_stack_api/admin/api.py +72 -0
  294. llama_stack_api/admin/fastapi_routes.py +117 -0
  295. llama_stack_api/admin/models.py +113 -0
  296. llama_stack_api/agents.py +173 -0
  297. llama_stack_api/batches/__init__.py +40 -0
  298. llama_stack_api/batches/api.py +53 -0
  299. llama_stack_api/batches/fastapi_routes.py +113 -0
  300. llama_stack_api/batches/models.py +78 -0
  301. llama_stack_api/benchmarks/__init__.py +43 -0
  302. llama_stack_api/benchmarks/api.py +39 -0
  303. llama_stack_api/benchmarks/fastapi_routes.py +109 -0
  304. llama_stack_api/benchmarks/models.py +109 -0
  305. {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
  306. {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
  307. {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
  308. llama_stack_api/common/responses.py +77 -0
  309. {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
  310. {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
  311. llama_stack_api/connectors.py +146 -0
  312. {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
  313. {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
  314. llama_stack_api/datasets/__init__.py +61 -0
  315. llama_stack_api/datasets/api.py +35 -0
  316. llama_stack_api/datasets/fastapi_routes.py +104 -0
  317. llama_stack_api/datasets/models.py +152 -0
  318. {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
  319. {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
  320. llama_stack_api/file_processors/__init__.py +27 -0
  321. llama_stack_api/file_processors/api.py +64 -0
  322. llama_stack_api/file_processors/fastapi_routes.py +78 -0
  323. llama_stack_api/file_processors/models.py +42 -0
  324. llama_stack_api/files/__init__.py +35 -0
  325. llama_stack_api/files/api.py +51 -0
  326. llama_stack_api/files/fastapi_routes.py +124 -0
  327. llama_stack_api/files/models.py +107 -0
  328. {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
  329. llama_stack_api/inspect_api/__init__.py +37 -0
  330. llama_stack_api/inspect_api/api.py +25 -0
  331. llama_stack_api/inspect_api/fastapi_routes.py +76 -0
  332. llama_stack_api/inspect_api/models.py +28 -0
  333. {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
  334. llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
  335. llama_stack_api/internal/sqlstore.py +79 -0
  336. {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
  337. {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
  338. {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
  339. {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
  340. llama_stack_api/providers/__init__.py +33 -0
  341. llama_stack_api/providers/api.py +16 -0
  342. llama_stack_api/providers/fastapi_routes.py +57 -0
  343. llama_stack_api/providers/models.py +24 -0
  344. {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
  345. {llama_stack/apis → llama_stack_api}/resource.py +1 -1
  346. llama_stack_api/router_utils.py +160 -0
  347. {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
  348. {llama_stack → llama_stack_api}/schema_utils.py +94 -4
  349. {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
  350. {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
  351. {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
  352. {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
  353. {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
  354. {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
  355. llama_stack/apis/agents/agents.py +0 -894
  356. llama_stack/apis/batches/__init__.py +0 -9
  357. llama_stack/apis/batches/batches.py +0 -100
  358. llama_stack/apis/benchmarks/__init__.py +0 -7
  359. llama_stack/apis/benchmarks/benchmarks.py +0 -108
  360. llama_stack/apis/common/responses.py +0 -36
  361. llama_stack/apis/conversations/__init__.py +0 -31
  362. llama_stack/apis/datasets/datasets.py +0 -251
  363. llama_stack/apis/datatypes.py +0 -160
  364. llama_stack/apis/eval/__init__.py +0 -7
  365. llama_stack/apis/files/__init__.py +0 -7
  366. llama_stack/apis/files/files.py +0 -199
  367. llama_stack/apis/inference/__init__.py +0 -7
  368. llama_stack/apis/inference/event_logger.py +0 -43
  369. llama_stack/apis/inspect/__init__.py +0 -7
  370. llama_stack/apis/inspect/inspect.py +0 -94
  371. llama_stack/apis/models/__init__.py +0 -7
  372. llama_stack/apis/post_training/__init__.py +0 -7
  373. llama_stack/apis/prompts/__init__.py +0 -9
  374. llama_stack/apis/providers/__init__.py +0 -7
  375. llama_stack/apis/providers/providers.py +0 -69
  376. llama_stack/apis/safety/__init__.py +0 -7
  377. llama_stack/apis/scoring/__init__.py +0 -7
  378. llama_stack/apis/scoring_functions/__init__.py +0 -7
  379. llama_stack/apis/shields/__init__.py +0 -7
  380. llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
  381. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
  382. llama_stack/apis/telemetry/__init__.py +0 -7
  383. llama_stack/apis/telemetry/telemetry.py +0 -423
  384. llama_stack/apis/tools/__init__.py +0 -8
  385. llama_stack/apis/vector_io/__init__.py +0 -7
  386. llama_stack/apis/vector_stores/__init__.py +0 -7
  387. llama_stack/core/server/tracing.py +0 -80
  388. llama_stack/core/ui/app.py +0 -55
  389. llama_stack/core/ui/modules/__init__.py +0 -5
  390. llama_stack/core/ui/modules/api.py +0 -32
  391. llama_stack/core/ui/modules/utils.py +0 -42
  392. llama_stack/core/ui/page/__init__.py +0 -5
  393. llama_stack/core/ui/page/distribution/__init__.py +0 -5
  394. llama_stack/core/ui/page/distribution/datasets.py +0 -18
  395. llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
  396. llama_stack/core/ui/page/distribution/models.py +0 -18
  397. llama_stack/core/ui/page/distribution/providers.py +0 -27
  398. llama_stack/core/ui/page/distribution/resources.py +0 -48
  399. llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
  400. llama_stack/core/ui/page/distribution/shields.py +0 -19
  401. llama_stack/core/ui/page/evaluations/__init__.py +0 -5
  402. llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
  403. llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
  404. llama_stack/core/ui/page/playground/__init__.py +0 -5
  405. llama_stack/core/ui/page/playground/chat.py +0 -130
  406. llama_stack/core/ui/page/playground/tools.py +0 -352
  407. llama_stack/distributions/dell/build.yaml +0 -33
  408. llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
  409. llama_stack/distributions/nvidia/build.yaml +0 -29
  410. llama_stack/distributions/open-benchmark/build.yaml +0 -36
  411. llama_stack/distributions/postgres-demo/__init__.py +0 -7
  412. llama_stack/distributions/postgres-demo/build.yaml +0 -23
  413. llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
  414. llama_stack/distributions/starter/build.yaml +0 -61
  415. llama_stack/distributions/starter-gpu/build.yaml +0 -61
  416. llama_stack/distributions/watsonx/build.yaml +0 -33
  417. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
  418. llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
  419. llama_stack/providers/inline/telemetry/__init__.py +0 -5
  420. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
  421. llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
  422. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
  423. llama_stack/providers/remote/inference/bedrock/models.py +0 -29
  424. llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
  425. llama_stack/providers/utils/sqlstore/__init__.py +0 -5
  426. llama_stack/providers/utils/sqlstore/api.py +0 -128
  427. llama_stack/providers/utils/telemetry/__init__.py +0 -5
  428. llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
  429. llama_stack/providers/utils/telemetry/tracing.py +0 -384
  430. llama_stack/strong_typing/__init__.py +0 -19
  431. llama_stack/strong_typing/auxiliary.py +0 -228
  432. llama_stack/strong_typing/classdef.py +0 -440
  433. llama_stack/strong_typing/core.py +0 -46
  434. llama_stack/strong_typing/deserializer.py +0 -877
  435. llama_stack/strong_typing/docstring.py +0 -409
  436. llama_stack/strong_typing/exception.py +0 -23
  437. llama_stack/strong_typing/inspection.py +0 -1085
  438. llama_stack/strong_typing/mapping.py +0 -40
  439. llama_stack/strong_typing/name.py +0 -182
  440. llama_stack/strong_typing/schema.py +0 -792
  441. llama_stack/strong_typing/serialization.py +0 -97
  442. llama_stack/strong_typing/serializer.py +0 -500
  443. llama_stack/strong_typing/slots.py +0 -27
  444. llama_stack/strong_typing/topological.py +0 -89
  445. llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
  446. llama_stack-0.3.5.dist-info/RECORD +0 -625
  447. llama_stack-0.3.5.dist-info/top_level.txt +0 -1
  448. /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
  449. /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
  450. /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
  451. /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
  452. /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
  453. /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
  454. /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
  455. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/WHEEL +0 -0
  456. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/entry_points.txt +0 -0
  457. {llama_stack-0.3.5.dist-info → llama_stack-0.4.1.dist-info}/licenses/LICENSE +0 -0
  458. {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
  459. {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
  460. {llama_stack/apis → llama_stack_api}/version.py +0 -0
llama_stack/providers/utils/memory/openai_vector_store_mixin.py

@@ -15,21 +15,37 @@ from typing import Annotated, Any
 from fastapi import Body
 from pydantic import TypeAdapter
 
-from llama_stack.apis.common.errors import VectorStoreNotFoundError
-from llama_stack.apis.files import Files, OpenAIFileObject
-from llama_stack.apis.vector_io import (
+from llama_stack.core.datatypes import VectorStoresConfig
+from llama_stack.core.id_generation import generate_object_id
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    interleaved_content_as_str,
+)
+from llama_stack.providers.utils.memory.vector_store import (
+    ChunkForDeletion,
+    content_from_data_and_mime_type,
+    make_overlapped_chunks,
+)
+from llama_stack_api import (
     Chunk,
+    EmbeddedChunk,
+    Files,
+    Inference,
     OpenAICreateVectorStoreFileBatchRequestWithExtraBody,
     OpenAICreateVectorStoreRequestWithExtraBody,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    OpenAIFileObject,
     QueryChunksResponse,
     SearchRankingOptions,
+    VectorStore,
     VectorStoreChunkingStrategy,
     VectorStoreChunkingStrategyAuto,
     VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
     VectorStoreContent,
     VectorStoreDeleteResponse,
     VectorStoreFileBatchObject,
-    VectorStoreFileContentsResponse,
+    VectorStoreFileContentResponse,
     VectorStoreFileCounts,
     VectorStoreFileDeleteResponse,
     VectorStoreFileLastError,
@@ -38,29 +54,22 @@ from llama_stack.apis.vector_io import (
     VectorStoreFileStatus,
     VectorStoreListFilesResponse,
     VectorStoreListResponse,
+    VectorStoreNotFoundError,
     VectorStoreObject,
     VectorStoreSearchResponse,
     VectorStoreSearchResponsePage,
 )
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.core.id_generation import generate_object_id
-from llama_stack.log import get_logger
-from llama_stack.providers.utils.kvstore.api import KVStore
-from llama_stack.providers.utils.memory.vector_store import (
-    ChunkForDeletion,
-    content_from_data_and_mime_type,
-    make_overlapped_chunks,
+from llama_stack_api.files.models import (
+    RetrieveFileContentRequest,
+    RetrieveFileRequest,
 )
+from llama_stack_api.internal.kvstore import KVStore
 
 EMBEDDING_DIMENSION = 768
 
 logger = get_logger(name=__name__, category="providers::utils")
 
 # Constants for OpenAI vector stores
-CHUNK_MULTIPLIER = 5
-FILE_BATCH_CLEANUP_INTERVAL_SECONDS = 24 * 60 * 60  # 1 day in seconds
-MAX_CONCURRENT_FILES_PER_BATCH = 3  # Maximum concurrent file processing within a batch
-FILE_BATCH_CHUNK_SIZE = 10  # Process files in chunks of this size
 
 VERSION = "v3"
 VECTOR_DBS_PREFIX = f"vector_stores:{VERSION}::"
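Both import hunks reflect the same 0.4.x reorganization: the llama_stack.apis.* modules are removed and their public types are re-exported from the new top-level llama_stack_api package, while the kvstore plumbing moves to llama_stack_api.internal. A minimal migration sketch for downstream code, using only symbols that appear in this diff:

    # 0.3.x imports (these modules are removed in 0.4.x):
    # from llama_stack.apis.common.errors import VectorStoreNotFoundError
    # from llama_stack.apis.files import Files, OpenAIFileObject
    # from llama_stack.apis.vector_io import Chunk

    # 0.4.x: one flat package re-exports the same names
    from llama_stack_api import (
        Chunk,
        EmbeddedChunk,  # new type: a Chunk that carries its embedding
        Files,
        OpenAIFileObject,
        VectorStoreNotFoundError,
    )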
@@ -81,15 +90,29 @@ class OpenAIVectorStoreMixin(ABC):
     # to properly initialize the mixin attributes.
     def __init__(
         self,
+        inference_api: Inference,
         files_api: Files | None = None,
         kvstore: KVStore | None = None,
+        vector_stores_config: VectorStoresConfig | None = None,
     ):
+        if not inference_api:
+            raise RuntimeError("Inference API is required for vector store operations")
+
+        self.inference_api = inference_api
         self.openai_vector_stores: dict[str, dict[str, Any]] = {}
         self.openai_file_batches: dict[str, dict[str, Any]] = {}
         self.files_api = files_api
         self.kvstore = kvstore
+        self.vector_stores_config = vector_stores_config or VectorStoresConfig()
         self._last_file_batch_cleanup_time = 0
         self._file_batch_tasks: dict[str, asyncio.Task[None]] = {}
+        self._vector_store_locks: dict[str, asyncio.Lock] = {}
+
+    def _get_vector_store_lock(self, vector_store_id: str) -> asyncio.Lock:
+        """Get or create a lock for a specific vector store."""
+        if vector_store_id not in self._vector_store_locks:
+            self._vector_store_locks[vector_store_id] = asyncio.Lock()
+        return self._vector_store_locks[vector_store_id]
 
     async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
         """Save vector store metadata to persistent storage."""
@@ -333,8 +356,8 @@ class OpenAIVectorStoreMixin(ABC):
     @abstractmethod
     async def insert_chunks(
         self,
-        vector_db_id: str,
-        chunks: list[Chunk],
+        vector_store_id: str,
+        chunks: list[EmbeddedChunk],
         ttl_seconds: int | None = None,
     ) -> None:
         """Insert chunks into a vector database (provider-specific implementation)."""
@@ -342,7 +365,7 @@ class OpenAIVectorStoreMixin(ABC):
 
     @abstractmethod
     async def query_chunks(
-        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
         """Query chunks from a vector database (provider-specific implementation)."""
         pass
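These two hunks change the provider contract: the keyword is now vector_store_id (was vector_db_id) and insert_chunks receives chunks that already carry embeddings. A hypothetical subclass stub under the new signatures (MyVectorIOAdapter is illustrative, backend calls are elided, and the mixin's other abstract persistence hooks are omitted here):

    from typing import Any

    from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
    from llama_stack_api import EmbeddedChunk, QueryChunksResponse


    class MyVectorIOAdapter(OpenAIVectorStoreMixin):
        async def insert_chunks(
            self,
            vector_store_id: str,
            chunks: list[EmbeddedChunk],
            ttl_seconds: int | None = None,
        ) -> None:
            # Embeddings arrive precomputed; write them straight to the backend.
            for chunk in chunks:
                ...  # backend-specific upsert of (chunk.chunk_id, chunk.embedding, chunk.content)

        async def query_chunks(
            self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
        ) -> QueryChunksResponse:
            ...  # backend-specific similarity search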
@@ -414,6 +437,10 @@ class OpenAIVectorStoreMixin(ABC):
             in_progress=0,
             total=0,
         )
+        if not params.chunking_strategy or params.chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
+        else:
+            chunking_strategy = params.chunking_strategy
         store_info: dict[str, Any] = {
             "id": vector_store_id,
             "object": "vector_store",
@@ -426,7 +453,7 @@ class OpenAIVectorStoreMixin(ABC):
             "expires_at": None,
             "last_active_at": created_at,
             "file_ids": [],
-            "chunking_strategy": params.chunking_strategy,
+            "chunking_strategy": chunking_strategy.model_dump(),
         }
 
         # Add provider information to metadata if provided
@@ -434,6 +461,11 @@ class OpenAIVectorStoreMixin(ABC):
             metadata["provider_id"] = provider_id
         if provider_vector_store_id:
             metadata["provider_vector_store_id"] = provider_vector_store_id
+
+        # Add embedding configuration to metadata for file processing
+        metadata["embedding_model"] = embedding_model
+        metadata["embedding_dimension"] = str(embedding_dimension)
+
         store_info["metadata"] = metadata
 
         # Save to persistent storage (provider-specific)
@@ -445,7 +477,13 @@ class OpenAIVectorStoreMixin(ABC):
         # Now that our vector store is created, attach any files that were provided
         file_ids = params.file_ids or []
         tasks = [self.openai_attach_file_to_vector_store(vector_store_id, file_id) for file_id in file_ids]
-        await asyncio.gather(*tasks)
+        # Use return_exceptions=True to handle individual file attachment failures gracefully
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Log any exceptions but don't fail the vector store creation
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.warning(f"Failed to attach file {file_ids[i]} to vector store {vector_store_id}: {result}")
 
         # Get the updated store info and return it
         store_info = self.openai_vector_stores[vector_store_id]
@@ -579,7 +617,11 @@ class OpenAIVectorStoreMixin(ABC):
             str | None
         ) = "vector",  # Using str instead of Literal due to OpenAPI schema generator limitations
     ) -> VectorStoreSearchResponsePage:
-        """Search for chunks in a vector store."""
+        """Search for chunks in a vector store.
+
+        Note: Query rewriting is handled at the router level, not here.
+        The rewrite_query parameter is kept for API compatibility but is ignored.
+        """
         max_num_results = max_num_results or 10
 
         # Validate search_mode
@@ -602,21 +644,24 @@ class OpenAIVectorStoreMixin(ABC):
                 else 0.0
             )
             params = {
-                "max_chunks": max_num_results * CHUNK_MULTIPLIER,
+                "max_chunks": max_num_results * self.vector_stores_config.chunk_retrieval_params.chunk_multiplier,
                 "score_threshold": score_threshold,
                 "mode": search_mode,
             }
             # TODO: Add support for ranking_options.ranker
 
             response = await self.query_chunks(
-                vector_db_id=vector_store_id,
+                vector_store_id=vector_store_id,
                 query=search_query,
                 params=params,
             )
 
             # Convert response to OpenAI format
             data = []
-            for chunk, score in zip(response.chunks, response.scores, strict=False):
+            for embedded_chunk, score in zip(response.chunks, response.scores, strict=False):
+                # EmbeddedChunk inherits from Chunk, so use it directly
+                chunk = embedded_chunk
+
                 # Apply filters if provided
                 if filters:
                     # Simple metadata filtering
@@ -637,7 +682,7 @@ class OpenAIVectorStoreMixin(ABC):
                     break
 
             return VectorStoreSearchResponsePage(
-                search_query=search_query,
+                search_query=query if isinstance(query, list) else [query],
                 data=data,
                 has_more=False,  # For simplicity, we don't implement pagination here
                 next_page=None,
@@ -647,7 +692,7 @@ class OpenAIVectorStoreMixin(ABC):
             logger.error(f"Error searching vector store {vector_store_id}: {e}")
             # Return empty results on error
             return VectorStoreSearchResponsePage(
-                search_query=search_query,
+                search_query=query if isinstance(query, list) else [query],
                 data=[],
                 has_more=False,
                 next_page=None,
@@ -699,34 +744,35 @@ class OpenAIVectorStoreMixin(ABC):
             # Unknown filter type, default to no match
             raise ValueError(f"Unsupported filter type: {filter_type}")
 
-    def _chunk_to_vector_store_content(self, chunk: Chunk) -> list[VectorStoreContent]:
-        # content is InterleavedContent
+    def _chunk_to_vector_store_content(
+        self, chunk: EmbeddedChunk, include_embeddings: bool = False, include_metadata: bool = False
+    ) -> list[VectorStoreContent]:
+        def extract_fields() -> dict:
+            """Extract metadata fields from chunk based on include flags."""
+            return {
+                "chunk_metadata": chunk.chunk_metadata if include_metadata else None,
+                "metadata": chunk.metadata if include_metadata else None,
+                "embedding": chunk.embedding if include_embeddings else None,
+            }
+
+        fields = extract_fields()
+
         if isinstance(chunk.content, str):
-            content = [
-                VectorStoreContent(
-                    type="text",
-                    text=chunk.content,
-                )
-            ]
+            content_item = VectorStoreContent(type="text", text=chunk.content, **fields)
+            content = [content_item]
         elif isinstance(chunk.content, list):
             # TODO: Add support for other types of content
-            content = [
-                VectorStoreContent(
-                    type="text",
-                    text=item.text,
-                )
-                for item in chunk.content
-                if item.type == "text"
-            ]
+            content = []
+            for item in chunk.content:
+                if item.type == "text":
+                    content_item = VectorStoreContent(type="text", text=item.text, **fields)
+                    content.append(content_item)
         else:
             if chunk.content.type != "text":
                 raise ValueError(f"Unsupported content type: {chunk.content.type}")
-            content = [
-                VectorStoreContent(
-                    type="text",
-                    text=chunk.content.text,
-                )
-            ]
+
+            content_item = VectorStoreContent(type="text", text=chunk.content.text, **fields)
+            content = [content_item]
         return content
 
     async def openai_attach_file_to_vector_store(
@@ -751,6 +797,7 @@ class OpenAIVectorStoreMixin(ABC):
         chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
         created_at = int(time.time())
         chunks: list[Chunk] = []
+        embedded_chunks: list[EmbeddedChunk] = []
         file_response: OpenAIFileObject | None = None
 
         vector_store_file_object = VectorStoreFileObject(
@@ -779,15 +826,22 @@ class OpenAIVectorStoreMixin(ABC):
             chunk_overlap_tokens = 400
 
         try:
-            file_response = await self.files_api.openai_retrieve_file(file_id)
+            file_response = await self.files_api.openai_retrieve_file(RetrieveFileRequest(file_id=file_id))
             mime_type, _ = mimetypes.guess_type(file_response.filename)
-            content_response = await self.files_api.openai_retrieve_file_content(file_id)
+            content_response = await self.files_api.openai_retrieve_file_content(
+                RetrieveFileContentRequest(file_id=file_id)
+            )
 
             content = content_from_data_and_mime_type(content_response.body, mime_type)
 
             chunk_attributes = attributes.copy()
             chunk_attributes["filename"] = file_response.filename
 
+            # Get embedding model info from vector store metadata
+            store_info = self.openai_vector_stores[vector_store_id]
+            embedding_model = store_info["metadata"].get("embedding_model")
+            embedding_dimension = store_info["metadata"].get("embedding_dimension")
+
             chunks = make_overlapped_chunks(
                 file_id,
                 content,
@@ -802,9 +856,42 @@ class OpenAIVectorStoreMixin(ABC):
                     message="No chunks were generated from the file",
                 )
             else:
+                # Validate embedding model and dimension are available
+                if not embedding_model:
+                    raise RuntimeError(f"Vector store {vector_store_id} is not properly configured for file processing")
+                if not embedding_dimension:
+                    raise RuntimeError(f"Vector store {vector_store_id} is not properly configured for file processing")
+
+                # Generate embeddings for all chunks before insertion
+
+                # Prepare embedding request for all chunks
+                params = OpenAIEmbeddingsRequestWithExtraBody(
+                    model=embedding_model,
+                    input=[interleaved_content_as_str(c.content) for c in chunks],
+                )
+                resp = await self.inference_api.openai_embeddings(params)
+
+                # Create EmbeddedChunk instances from chunks and their embeddings
+                for chunk, data in zip(chunks, resp.data, strict=False):
+                    # Ensure embedding is a list of floats
+                    embedding = data.embedding
+                    if isinstance(embedding, str):
+                        # Handle case where embedding might be returned as a string (shouldn't normally happen)
+                        raise ValueError(f"Received string embedding instead of list: {embedding}")
+                    embedded_chunk = EmbeddedChunk(
+                        content=chunk.content,
+                        chunk_id=chunk.chunk_id,
+                        metadata=chunk.metadata,
+                        chunk_metadata=chunk.chunk_metadata,
+                        embedding=embedding,
+                        embedding_model=embedding_model,
+                        embedding_dimension=len(embedding),
+                    )
+                    embedded_chunks.append(embedded_chunk)
+
                 await self.insert_chunks(
-                    vector_db_id=vector_store_id,
-                    chunks=chunks,
+                    vector_store_id=vector_store_id,
+                    chunks=embedded_chunks,
                 )
             vector_store_file_object.status = "completed"
         except Exception as e:
@@ -815,26 +902,27 @@ class OpenAIVectorStoreMixin(ABC):
                 message=str(e),
             )
 
-        # Create OpenAI vector store file metadata
+        # Save vector store file to persistent storage AFTER insert_chunks
+        # so that chunks include the embeddings that were generated
         file_info = vector_store_file_object.model_dump(exclude={"last_error"})
         file_info["filename"] = file_response.filename if file_response else ""
 
-        # Save vector store file to persistent storage (provider-specific)
-        dict_chunks = [c.model_dump() for c in chunks]
-        # This should be updated to include chunk_id
+        dict_chunks = [c.model_dump() for c in embedded_chunks]
         await self._save_openai_vector_store_file(vector_store_id, file_id, file_info, dict_chunks)
 
         # Update file_ids and file_counts in vector store metadata
-        store_info = self.openai_vector_stores[vector_store_id].copy()
-        store_info["file_ids"].append(file_id)
-        store_info["file_counts"]["total"] += 1
-        store_info["file_counts"][vector_store_file_object.status] += 1
-
-        # Save updated vector store to persistent storage
-        await self._save_openai_vector_store(vector_store_id, store_info)
-
-        # Update vector store in-memory cache
-        self.openai_vector_stores[vector_store_id] = store_info
+        # Use lock to prevent race condition when multiple files are attached concurrently
+        async with self._get_vector_store_lock(vector_store_id):
+            store_info = self.openai_vector_stores[vector_store_id].copy()
+            # Deep copy file_counts to avoid mutating shared dict
+            store_info["file_counts"] = store_info["file_counts"].copy()
+            store_info["file_ids"] = store_info["file_ids"].copy()
+            store_info["file_ids"].append(file_id)
+            store_info["file_counts"]["total"] += 1
+            store_info["file_counts"][vector_store_file_object.status] += 1
+
+            # Save updated vector store to persistent storage
+            await self._save_openai_vector_store(vector_store_id, store_info)
 
         return vector_store_file_object
 
@@ -886,8 +974,8 @@ class OpenAIVectorStoreMixin(ABC):
 
         # Determine pagination info
         has_more = len(file_objects) > limit
-        first_id = file_objects[0].id if file_objects else None
-        last_id = file_objects[-1].id if file_objects else None
+        first_id = limited_files[0].id if file_objects else None
+        last_id = limited_files[-1].id if file_objects else None
 
         return VectorStoreListFilesResponse(
             data=limited_files,
@@ -916,22 +1004,27 @@ class OpenAIVectorStoreMixin(ABC):
         self,
         vector_store_id: str,
         file_id: str,
-    ) -> VectorStoreFileContentsResponse:
+        include_embeddings: bool | None = False,
+        include_metadata: bool | None = False,
+    ) -> VectorStoreFileContentResponse:
         """Retrieves the contents of a vector store file."""
         if vector_store_id not in self.openai_vector_stores:
             raise VectorStoreNotFoundError(vector_store_id)
 
-        file_info = await self._load_openai_vector_store_file(vector_store_id, file_id)
+        # Parameters are already provided directly
+        # include_embeddings and include_metadata are now function parameters
+
         dict_chunks = await self._load_openai_vector_store_file_contents(vector_store_id, file_id)
-        chunks = [Chunk.model_validate(c) for c in dict_chunks]
+        chunks = [EmbeddedChunk.model_validate(c) for c in dict_chunks]
         content = []
         for chunk in chunks:
-            content.extend(self._chunk_to_vector_store_content(chunk))
-        return VectorStoreFileContentsResponse(
-            file_id=file_id,
-            filename=file_info.get("filename", ""),
-            attributes=file_info.get("attributes", {}),
-            content=content,
+            content.extend(
+                self._chunk_to_vector_store_content(
+                    chunk, include_embeddings=include_embeddings or False, include_metadata=include_metadata or False
+                )
+            )
+        return VectorStoreFileContentResponse(
+            data=content,
         )
 
     async def openai_update_vector_store_file(
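For callers, the file-contents endpoint changes shape: the response type is renamed to VectorStoreFileContentResponse, its payload moves to a data field, and two new flags opt into embeddings and metadata per content item. A hypothetical call against a provider built on this mixin (provider is illustrative):

    async def show_contents(provider) -> None:
        response = await provider.openai_retrieve_vector_store_file_contents(
            vector_store_id="vs_123",
            file_id="file_abc",
            include_embeddings=True,  # populate item.embedding on each content item
            include_metadata=True,    # populate item.metadata / item.chunk_metadata
        )
        for item in response.data:
            print(item.text, item.embedding is not None)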
@@ -1048,7 +1141,10 @@ class OpenAIVectorStoreMixin(ABC):
 
         # Run cleanup if needed (throttled to once every 1 day)
         current_time = int(time.time())
-        if current_time - self._last_file_batch_cleanup_time >= FILE_BATCH_CLEANUP_INTERVAL_SECONDS:
+        if (
+            current_time - self._last_file_batch_cleanup_time
+            >= self.vector_stores_config.file_batch_params.cleanup_interval_seconds
+        ):
             logger.info("Running throttled cleanup of expired file batches")
             asyncio.create_task(self._cleanup_expired_file_batches())
             self._last_file_batch_cleanup_time = current_time
@@ -1065,7 +1161,7 @@ class OpenAIVectorStoreMixin(ABC):
         batch_info: dict[str, Any],
     ) -> None:
         """Process files with controlled concurrency and chunking."""
-        semaphore = asyncio.Semaphore(MAX_CONCURRENT_FILES_PER_BATCH)
+        semaphore = asyncio.Semaphore(self.vector_stores_config.file_batch_params.max_concurrent_files_per_batch)
 
         async def process_single_file(file_id: str) -> tuple[str, bool]:
             """Process a single file with concurrency control."""
@@ -1084,12 +1180,13 @@ class OpenAIVectorStoreMixin(ABC):
 
         # Process files in chunks to avoid creating too many tasks at once
         total_files = len(file_ids)
-        for chunk_start in range(0, total_files, FILE_BATCH_CHUNK_SIZE):
-            chunk_end = min(chunk_start + FILE_BATCH_CHUNK_SIZE, total_files)
+        chunk_size = self.vector_stores_config.file_batch_params.file_batch_chunk_size
+        for chunk_start in range(0, total_files, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, total_files)
             chunk = file_ids[chunk_start:chunk_end]
 
-            chunk_num = chunk_start // FILE_BATCH_CHUNK_SIZE + 1
-            total_chunks = (total_files + FILE_BATCH_CHUNK_SIZE - 1) // FILE_BATCH_CHUNK_SIZE
+            chunk_num = chunk_start // chunk_size + 1
+            total_chunks = (total_files + chunk_size - 1) // chunk_size
             logger.info(
                 f"Processing chunk {chunk_num} of {total_chunks} ({len(chunk)} files, {chunk_start + 1}-{chunk_end} of {total_files} total files)"
             )
llama_stack/providers/utils/memory/vector_store.py

@@ -17,21 +17,25 @@ import numpy as np
 from numpy.typing import NDArray
 from pydantic import BaseModel
 
-from llama_stack.apis.common.content_types import (
-    URL,
-    InterleavedContent,
-)
-from llama_stack.apis.inference import OpenAIEmbeddingsRequestWithExtraBody
-from llama_stack.apis.tools import RAGDocument
-from llama_stack.apis.vector_io import Chunk, ChunkMetadata, QueryChunksResponse
-from llama_stack.apis.vector_stores import VectorStore
+from llama_stack.core.datatypes import VectorStoresConfig
 from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import Api
 from llama_stack.providers.utils.inference.prompt_adapter import (
     interleaved_content_as_str,
 )
 from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+from llama_stack_api import (
+    URL,
+    Api,
+    Chunk,
+    ChunkMetadata,
+    EmbeddedChunk,
+    InterleavedContent,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    QueryChunksResponse,
+    RAGDocument,
+    VectorStore,
+)
 
 log = get_logger(name=__name__, category="providers::utils")
 
@@ -155,7 +159,11 @@ async def content_from_doc(doc: RAGDocument) -> str:
 
 
 def make_overlapped_chunks(
-    document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
+    document_id: str,
+    text: str,
+    window_len: int,
+    overlap_len: int,
+    metadata: dict[str, Any],
 ) -> list[Chunk]:
     default_tokenizer = "DEFAULT_TIKTOKEN_TOKENIZER"
     tokenizer = Tokenizer.get_instance()
@@ -187,7 +195,6 @@ def make_overlapped_chunks(
             updated_timestamp=int(time.time()),
             chunk_window=chunk_window,
             chunk_tokenizer=default_tokenizer,
-            chunk_embedding_model=None,  # This will be set in `VectorStoreWithIndex.insert_chunks`
             content_token_count=len(toks),
             metadata_token_count=len(metadata_tokens),
         )
@@ -196,6 +203,7 @@ def make_overlapped_chunks(
         chunks.append(
             Chunk(
                 content=chunk,
+                chunk_id=chunk_id,
                 metadata=chunk_metadata,
                 chunk_metadata=backend_chunk_metadata,
             )
@@ -222,7 +230,7 @@ def _validate_embedding(embedding: NDArray, index: int, expected_dimension: int)
 
 class EmbeddingIndex(ABC):
     @abstractmethod
-    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
+    async def add_chunks(self, embedded_chunks: list[EmbeddedChunk]):
         raise NotImplementedError()
 
     @abstractmethod
@@ -259,38 +267,25 @@ class VectorStoreWithIndex:
     vector_store: VectorStore
     index: EmbeddingIndex
     inference_api: Api.inference
+    vector_stores_config: VectorStoresConfig | None = None
 
     async def insert_chunks(
         self,
-        chunks: list[Chunk],
+        chunks: list[EmbeddedChunk],
     ) -> None:
-        chunks_to_embed = []
-        for i, c in enumerate(chunks):
-            if c.embedding is None:
-                chunks_to_embed.append(c)
-                if c.chunk_metadata:
-                    c.chunk_metadata.chunk_embedding_model = self.vector_store.embedding_model
-                    c.chunk_metadata.chunk_embedding_dimension = self.vector_store.embedding_dimension
-            else:
-                _validate_embedding(c.embedding, i, self.vector_store.embedding_dimension)
-
-        if chunks_to_embed:
-            params = OpenAIEmbeddingsRequestWithExtraBody(
-                model=self.vector_store.embedding_model,
-                input=[c.content for c in chunks_to_embed],
-            )
-            resp = await self.inference_api.openai_embeddings(params)
-            for c, data in zip(chunks_to_embed, resp.data, strict=False):
-                c.embedding = data.embedding
+        # Validate embedding dimensions match the vector store
+        for i, embedded_chunk in enumerate(chunks):
+            _validate_embedding(embedded_chunk.embedding, i, self.vector_store.embedding_dimension)
 
-        embeddings = np.array([c.embedding for c in chunks], dtype=np.float32)
-        await self.index.add_chunks(chunks, embeddings)
+        await self.index.add_chunks(chunks)
 
     async def query_chunks(
         self,
         query: InterleavedContent,
         params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
+        config = self.vector_stores_config or VectorStoresConfig()
+
         if params is None:
             params = {}
         k = params.get("max_chunks", 3)
@@ -299,19 +294,25 @@ class VectorStoreWithIndex:
 
         ranker = params.get("ranker")
         if ranker is None:
-            reranker_type = RERANKER_TYPE_RRF
-            reranker_params = {"impact_factor": 60.0}
+            reranker_type = (
+                RERANKER_TYPE_RRF
+                if config.chunk_retrieval_params.default_reranker_strategy == "rrf"
+                else config.chunk_retrieval_params.default_reranker_strategy
+            )
+            reranker_params = {"impact_factor": config.chunk_retrieval_params.rrf_impact_factor}
         else:
-            strategy = ranker.get("strategy", "rrf")
+            strategy = ranker.get("strategy", config.chunk_retrieval_params.default_reranker_strategy)
             if strategy == "weighted":
                 weights = ranker.get("params", {}).get("weights", [0.5, 0.5])
                 reranker_type = RERANKER_TYPE_WEIGHTED
-                reranker_params = {"alpha": weights[0] if len(weights) > 0 else 0.5}
+                reranker_params = {
+                    "alpha": weights[0] if len(weights) > 0 else config.chunk_retrieval_params.weighted_search_alpha
+                }
             elif strategy == "normalized":
                 reranker_type = RERANKER_TYPE_NORMALIZED
             else:
                 reranker_type = RERANKER_TYPE_RRF
-                k_value = ranker.get("params", {}).get("k", 60.0)
+                k_value = ranker.get("params", {}).get("k", config.chunk_retrieval_params.rrf_impact_factor)
                 reranker_params = {"impact_factor": k_value}
 
         query_string = interleaved_content_as_str(query)
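With the hard-coded reranker defaults replaced by VectorStoresConfig.chunk_retrieval_params, per-query overrides still flow through params["ranker"]. A sketch of the two override shapes this branch accepts (values are illustrative, and store_with_index stands for any VectorStoreWithIndex instance):

    # Weighted interpolation between the two retrieval scores:
    weighted = {
        "max_chunks": 10,
        "mode": "hybrid",
        "ranker": {"strategy": "weighted", "params": {"weights": [0.7, 0.3]}},
    }

    # Any strategy other than "weighted" or "normalized" falls through to
    # reciprocal-rank fusion; "k" sets the impact factor:
    rrf = {
        "max_chunks": 10,
        "mode": "hybrid",
        "ranker": {"strategy": "rrf", "params": {"k": 60.0}},
    }


    async def demo(store_with_index) -> None:
        response = await store_with_index.query_chunks(query="how are chunks reranked?", params=weighted)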
llama_stack/providers/utils/pagination.py

@@ -6,7 +6,7 @@
 
 from typing import Any
 
-from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack_api import PaginatedResponse
 
 
 def paginate_records(