PyPI - sf-vector-sdk - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

sf-vector-sdk 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.3.dist-info}/METADATA +38 -4
{sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.3.dist-info}/RECORD +17 -17
vector_sdk/__init__.py +17 -1
vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py +2 -2
vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py +2 -2
vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py +2 -2
vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py +13 -7
vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi +23 -1
vector_sdk/hash/__init__.py +2 -0
vector_sdk/hash/hasher.py +28 -2
vector_sdk/hash/types.py +10 -1
vector_sdk/namespaces/embeddings.py +31 -57
vector_sdk/namespaces/search.py +38 -60
vector_sdk/structured/__init__.py +13 -0
vector_sdk/structured/structured_embeddings.py +785 -0
vector_sdk/structured/tool_config.py +23 -4
{sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.3.dist-info}/WHEEL +0 -0

vector_sdk/structured/structured_embeddings.py CHANGED Viewed

@@ -12,6 +12,7 @@ from ..hash import (
     AudioRecapSectionData,
     FlashCardData,
     ToolCollection,
+    TopicData,
     compute_content_hash,
     extract_tool_text,
 )
@@ -47,6 +48,29 @@ class ToolMetadata:
         return result
+@dataclass
+class TopicMetadata:
+    """
+    Metadata for topic embeddings.
+    Unlike ToolMetadata, all fields are optional since topics don't have a toolId.
+    """
+    user_id: Optional[str] = None
+    topic_id: Optional[str] = None
+    extra: Optional[dict[str, Any]] = None
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for document storage."""
+        result: dict[str, Any] = {}
+        if self.user_id:
+            result["userId"] = self.user_id
+        if self.topic_id:
+            result["topicId"] = self.topic_id
+        if self.extra:
+            result.update(self.extra)
+        return result
 @dataclass
 class TestQuestionInput:
     """Extended question data with question type."""
@@ -67,6 +91,46 @@ class TestQuestionInput:
         return result
+@dataclass
+class BatchItem:
+    """Batch item for embedding multiple items of the same type."""
+    data: dict[str, Any]
+    metadata: ToolMetadata
+@dataclass
+class FlashCardBatchItem:
+    """Batch item for FlashCard embeddings."""
+    data: FlashCardData
+    metadata: ToolMetadata
+@dataclass
+class TestQuestionBatchItem:
+    """Batch item for TestQuestion embeddings."""
+    data: TestQuestionInput
+    metadata: ToolMetadata
+@dataclass
+class AudioRecapBatchItem:
+    """Batch item for AudioRecap embeddings."""
+    data: AudioRecapSectionData
+    metadata: ToolMetadata
+@dataclass
+class TopicBatchItem:
+    """Batch item for Topic embeddings."""
+    data: TopicData
+    metadata: TopicMetadata
 # ============================================================================
 # StructuredEmbeddingsNamespace
 # ============================================================================
@@ -150,6 +214,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
         card_type = data.get("type")
         return self._embed_tool_and_wait("FlashCard", data, metadata, card_type, timeout)
+    def embed_flashcard_batch(
+        self,
+        items: list[FlashCardBatchItem],
+    ) -> str:
+        """
+        Embed a batch of flashcards and return the request ID.
+        All flashcards in the batch should have the same type for proper namespace routing.
+        Args:
+            items: List of FlashCardBatchItem objects
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "FlashCard",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": item.data.get("type") if isinstance(item.data, dict) else None,
+                }
+                for item in items
+            ],
+        )
+    def embed_flashcard_batch_and_wait(
+        self,
+        items: list[FlashCardBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of flashcards and wait for the result.
+        All flashcards in the batch should have the same type for proper namespace routing.
+        Args:
+            items: List of FlashCardBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "FlashCard",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": item.data.get("type") if isinstance(item.data, dict) else None,
+                }
+                for item in items
+            ],
+            timeout,
+        )
     # ==========================================================================
     # TestQuestion Methods
     # ==========================================================================
@@ -201,6 +320,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             timeout,
         )
+    def embed_test_question_batch(
+        self,
+        items: list[TestQuestionBatchItem],
+    ) -> str:
+        """
+        Embed a batch of test questions and return the request ID.
+        All questions in the batch should have the same question_type for proper namespace routing.
+        Args:
+            items: List of TestQuestionBatchItem objects
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "TestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+        )
+    def embed_test_question_batch_and_wait(
+        self,
+        items: list[TestQuestionBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of test questions and wait for the result.
+        All questions in the batch should have the same question_type for proper namespace routing.
+        Args:
+            items: List of TestQuestionBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "TestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+            timeout,
+        )
     # ==========================================================================
     # SpacedTestQuestion Methods
     # ==========================================================================
@@ -252,6 +426,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             timeout,
         )
+    def embed_spaced_test_question_batch(
+        self,
+        items: list[TestQuestionBatchItem],
+    ) -> str:
+        """
+        Embed a batch of spaced test questions and return the request ID.
+        All questions in the batch should have the same question_type for proper namespace routing.
+        Args:
+            items: List of TestQuestionBatchItem objects
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "SpacedTestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+        )
+    def embed_spaced_test_question_batch_and_wait(
+        self,
+        items: list[TestQuestionBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of spaced test questions and wait for the result.
+        All questions in the batch should have the same question_type for proper namespace routing.
+        Args:
+            items: List of TestQuestionBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "SpacedTestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+            timeout,
+        )
     # ==========================================================================
     # AudioRecap Methods
     # ==========================================================================
@@ -298,6 +527,401 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             timeout,
         )
+    def embed_audio_recap_batch(
+        self,
+        items: list[AudioRecapBatchItem],
+    ) -> str:
+        """
+        Embed a batch of audio recap sections and return the request ID.
+        Args:
+            items: List of AudioRecapBatchItem objects
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "AudioRecapV2Section",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": None,
+                }
+                for item in items
+            ],
+        )
+    def embed_audio_recap_batch_and_wait(
+        self,
+        items: list[AudioRecapBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of audio recap sections and wait for the result.
+        Args:
+            items: List of AudioRecapBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "AudioRecapV2Section",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": None,
+                }
+                for item in items
+            ],
+            timeout,
+        )
+    # ==========================================================================
+    # Topic Methods
+    # ==========================================================================
+    def embed_topic(
+        self,
+        data: TopicData,
+        metadata: TopicMetadata,
+    ) -> str:
+        """
+        Embed a topic and return the request ID.
+        Args:
+            data: Topic data (topic, description)
+            metadata: Topic metadata (all fields optional)
+        Returns:
+            The request ID
+        """
+        return self._embed_topic_internal("Topic", data, metadata, None)
+    def embed_topic_and_wait(
+        self,
+        data: TopicData,
+        metadata: TopicMetadata,
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a topic and wait for the result.
+        Args:
+            data: Topic data
+            metadata: Topic metadata (all fields optional)
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_topic_internal_and_wait("Topic", data, metadata, None, timeout)
+    def embed_topic_batch(
+        self,
+        items: list[TopicBatchItem],
+    ) -> str:
+        """
+        Embed a batch of topics and return the request ID.
+        Args:
+            items: List of TopicBatchItem objects
+        Returns:
+            The request ID
+        """
+        return self._embed_topic_batch_internal(items)
+    def embed_topic_batch_and_wait(
+        self,
+        items: list[TopicBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of topics and wait for the result.
+        Args:
+            items: List of TopicBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_topic_batch_internal_and_wait(items, timeout)
+    # ==========================================================================
+    # Internal Topic Methods (using TopicMetadata)
+    # ==========================================================================
+    def _embed_topic_internal(
+        self,
+        tool_collection: ToolCollection,
+        data: TopicData,
+        metadata: TopicMetadata,
+        sub_type: Optional[str],
+    ) -> str:
+        """Internal method to embed a topic with TopicMetadata."""
+        # 1. Extract text using the spec
+        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+        if not text:
+            raise ValueError(
+                f"Failed to extract text from {tool_collection} - empty content"
+            )
+        # 2. Compute content hash
+        content_hash = compute_content_hash(
+            {"toolCollection": tool_collection, "data": data}
+        )
+        if not content_hash:
+            raise ValueError(
+                f"Failed to compute content hash for {tool_collection} - empty content"
+            )
+        # 3. Get tool config
+        tool_config = get_tool_config(tool_collection)
+        # 4. Build document with metadata (TopicMetadata doesn't have toolId)
+        document = {
+            **metadata.to_dict(),
+            "toolCollection": tool_collection,
+            "contentHash": content_hash,
+        }
+        # 5. Build storage config using router
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=sub_type,
+            content_hash=content_hash,
+            document_fields=document,
+        )
+        # 6. Build text input
+        text_input = {
+            "id": content_hash,
+            "text": text,
+            "document": document,
+        }
+        # 7. Submit using embeddings namespace
+        return self._embeddings.create(
+            texts=[text_input],
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+    def _embed_topic_internal_and_wait(
+        self,
+        tool_collection: ToolCollection,
+        data: TopicData,
+        metadata: TopicMetadata,
+        sub_type: Optional[str],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """Internal method to embed a topic and wait for result."""
+        # 1. Extract text using the spec
+        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+        if not text:
+            raise ValueError(
+                f"Failed to extract text from {tool_collection} - empty content"
+            )
+        # 2. Compute content hash
+        content_hash = compute_content_hash(
+            {"toolCollection": tool_collection, "data": data}
+        )
+        if not content_hash:
+            raise ValueError(
+                f"Failed to compute content hash for {tool_collection} - empty content"
+            )
+        # 3. Get tool config
+        tool_config = get_tool_config(tool_collection)
+        # 4. Build document with metadata
+        document = {
+            **metadata.to_dict(),
+            "toolCollection": tool_collection,
+            "contentHash": content_hash,
+        }
+        # 5. Build storage config using router
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=sub_type,
+            content_hash=content_hash,
+            document_fields=document,
+        )
+        # 6. Build text input
+        text_input = {
+            "id": content_hash,
+            "text": text,
+            "document": document,
+        }
+        # 7. Submit and wait using embeddings namespace
+        return self._embeddings.create_and_wait(
+            texts=[text_input],
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )
+    def _embed_topic_batch_internal(
+        self,
+        items: list[TopicBatchItem],
+    ) -> str:
+        """Internal method to embed a batch of topics."""
+        if not items:
+            raise ValueError("Batch cannot be empty")
+        tool_collection: ToolCollection = "Topic"
+        tool_config = get_tool_config(tool_collection)
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item.data
+            metadata = item.metadata
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+            # Build document with metadata (TopicMetadata doesn't have toolId)
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+        # Build storage config using first item
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=None,
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+        # Submit batch to embeddings namespace
+        return self._embeddings.create(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+    def _embed_topic_batch_internal_and_wait(
+        self,
+        items: list[TopicBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """Internal method to embed a batch of topics and wait for result."""
+        if not items:
+            raise ValueError("Batch cannot be empty")
+        tool_collection: ToolCollection = "Topic"
+        tool_config = get_tool_config(tool_collection)
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item.data
+            metadata = item.metadata
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+            # Build document with metadata
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+        # Build storage config using first item
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=None,
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+        # Submit batch and wait
+        return self._embeddings.create_and_wait(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )
     # ==========================================================================
     # Internal Methods
     # ==========================================================================
@@ -429,3 +1053,164 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             embedding_dimensions=tool_config.dimensions,
             timeout=timeout,
         )
+    def _embed_tool_batch(
+        self,
+        tool_collection: ToolCollection,
+        items: list[dict[str, Any]],
+    ) -> str:
+        """
+        Internal method to embed a batch of items of the same tool type.
+        Args:
+            tool_collection: The tool collection type
+            items: List of dicts with 'data', 'metadata', and optional 'sub_type' keys
+        Returns:
+            The request ID
+        """
+        if not items:
+            raise ValueError("Batch cannot be empty")
+        # Get tool config (same for all items)
+        tool_config = get_tool_config(tool_collection)
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item["data"]
+            metadata = item["metadata"]
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+            # Build document with metadata
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+        # Build storage config using first item's sub_type
+        first_item = items[0]
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=first_item.get("sub_type"),
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+        # Submit batch to embeddings namespace
+        return self._embeddings.create(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+    def _embed_tool_batch_and_wait(
+        self,
+        tool_collection: ToolCollection,
+        items: list[dict[str, Any]],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Internal method to embed a batch of items and wait for result.
+        Args:
+            tool_collection: The tool collection type
+            items: List of dicts with 'data', 'metadata', and optional 'sub_type' keys
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        if not items:
+            raise ValueError("Batch cannot be empty")
+        # Get tool config (same for all items)
+        tool_config = get_tool_config(tool_collection)
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item["data"]
+            metadata = item["metadata"]
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+            # Build document with metadata
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+        # Build storage config using first item's sub_type
+        first_item = items[0]
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=first_item.get("sub_type"),
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+        # Submit batch and wait
+        return self._embeddings.create_and_wait(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )

sf-vector-sdk 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl

sf-vector-sdk 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl