PyPI - sf-vector-sdk - Versions diffs - 0.2.0__py3-none-any.whl - Mend

sf-vector-sdk 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

sf_vector_sdk-0.2.0.dist-info/METADATA +476 -0
sf_vector_sdk-0.2.0.dist-info/RECORD +27 -0
sf_vector_sdk-0.2.0.dist-info/WHEEL +4 -0
vector_sdk/__init__.py +262 -0
vector_sdk/client.py +538 -0
vector_sdk/content_types.py +233 -0
vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py +57 -0
vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.pyi +141 -0
vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py +58 -0
vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.pyi +145 -0
vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py +58 -0
vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.pyi +109 -0
vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py +39 -0
vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi +31 -0
vector_sdk/hash/__init__.py +31 -0
vector_sdk/hash/hasher.py +259 -0
vector_sdk/hash/types.py +67 -0
vector_sdk/namespaces/__init__.py +13 -0
vector_sdk/namespaces/base.py +45 -0
vector_sdk/namespaces/db.py +230 -0
vector_sdk/namespaces/embeddings.py +268 -0
vector_sdk/namespaces/search.py +258 -0
vector_sdk/structured/__init__.py +60 -0
vector_sdk/structured/router.py +190 -0
vector_sdk/structured/structured_embeddings.py +431 -0
vector_sdk/structured/tool_config.py +254 -0
vector_sdk/types.py +864 -0

vector_sdk/structured/structured_embeddings.py ADDED Viewed

@@ -0,0 +1,431 @@
+"""
+Structured Embeddings Namespace.
+Provides type-safe methods for embedding known tool types (FlashCard, TestQuestion, etc.)
+with automatic text extraction, content hash computation, and database routing.
+"""
+from dataclasses import dataclass
+from typing import Any, Optional
+from ..hash import (
+    AudioRecapSectionData,
+    FlashCardData,
+    ToolCollection,
+    compute_content_hash,
+    extract_tool_text,
+)
+from ..namespaces.base import BaseNamespace
+from ..namespaces.embeddings import EmbeddingsNamespace
+from ..types import EmbeddingResult
+from .router import build_storage_config, get_content_type
+from .tool_config import QuestionType, get_tool_config
+# ============================================================================
+# Types
+# ============================================================================
+@dataclass
+class ToolMetadata:
+    """Metadata to store alongside the embedding."""
+    tool_id: str
+    user_id: Optional[str] = None
+    topic_id: Optional[str] = None
+    extra: Optional[dict[str, Any]] = None
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for document storage."""
+        result: dict[str, Any] = {"toolId": self.tool_id}
+        if self.user_id:
+            result["userId"] = self.user_id
+        if self.topic_id:
+            result["topicId"] = self.topic_id
+        if self.extra:
+            result.update(self.extra)
+        return result
+@dataclass
+class TestQuestionInput:
+    """Extended question data with question type."""
+    question: str
+    answers: list[dict[str, Any]]
+    question_type: Optional[QuestionType] = None
+    explanation: Optional[str] = None
+    def to_question_data(self) -> dict[str, Any]:
+        """Convert to QuestionData-compatible dict."""
+        result: dict[str, Any] = {
+            "question": self.question,
+            "answers": self.answers,
+        }
+        if self.explanation:
+            result["explanation"] = self.explanation
+        return result
+# ============================================================================
+# StructuredEmbeddingsNamespace
+# ============================================================================
+class StructuredEmbeddingsNamespace(BaseNamespace):
+    """
+    Namespace for structured tool embeddings.
+    Provides type-safe methods for embedding known tool types with automatic:
+    - Text extraction per tool spec
+    - Content hash computation
+    - Namespace derivation based on tool type and sub-type
+    - Database routing based on environment configuration
+    Example:
+        ```python
+        client = VectorClient("redis://localhost:6379")
+        # Embed a flashcard
+        result = client.structured_embeddings.embed_flashcard_and_wait(
+            data={"type": "BASIC", "term": "Mitochondria", "definition": "..."},
+            metadata=ToolMetadata(tool_id="tool123", user_id="user456"),
+        )
+        # SDK automatically extracts text, computes hash, and routes to correct database
+        ```
+    """
+    def __init__(self, redis: Any, embeddings: EmbeddingsNamespace, http_url: Optional[str] = None):
+        """
+        Initialize the namespace.
+        Args:
+            redis: Redis client instance
+            embeddings: EmbeddingsNamespace instance for submitting requests
+            http_url: Optional HTTP URL for query-gateway
+        """
+        super().__init__(redis, http_url)
+        self._embeddings = embeddings
+    # ==========================================================================
+    # FlashCard Methods
+    # ==========================================================================
+    def embed_flashcard(
+        self,
+        data: FlashCardData,
+        metadata: ToolMetadata,
+    ) -> str:
+        """
+        Embed a flashcard and return the request ID.
+        Args:
+            data: FlashCard data (type, term, definition, multiple_choice_options)
+            metadata: Tool metadata (tool_id, user_id, topic_id, etc.)
+        Returns:
+            The request ID
+        """
+        card_type = data.get("type")
+        return self._embed_tool("FlashCard", data, metadata, card_type)
+    def embed_flashcard_and_wait(
+        self,
+        data: FlashCardData,
+        metadata: ToolMetadata,
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a flashcard and wait for the result.
+        Args:
+            data: FlashCard data
+            metadata: Tool metadata
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        card_type = data.get("type")
+        return self._embed_tool_and_wait("FlashCard", data, metadata, card_type, timeout)
+    # ==========================================================================
+    # TestQuestion Methods
+    # ==========================================================================
+    def embed_test_question(
+        self,
+        data: TestQuestionInput,
+        metadata: ToolMetadata,
+    ) -> str:
+        """
+        Embed a test question and return the request ID.
+        Args:
+            data: Question data (question, answers, explanation, question_type)
+            metadata: Tool metadata
+        Returns:
+            The request ID
+        """
+        return self._embed_tool(
+            "TestQuestion",
+            data.to_question_data(),
+            metadata,
+            data.question_type,
+        )
+    def embed_test_question_and_wait(
+        self,
+        data: TestQuestionInput,
+        metadata: ToolMetadata,
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a test question and wait for the result.
+        Args:
+            data: Question data
+            metadata: Tool metadata
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_and_wait(
+            "TestQuestion",
+            data.to_question_data(),
+            metadata,
+            data.question_type,
+            timeout,
+        )
+    # ==========================================================================
+    # SpacedTestQuestion Methods
+    # ==========================================================================
+    def embed_spaced_test_question(
+        self,
+        data: TestQuestionInput,
+        metadata: ToolMetadata,
+    ) -> str:
+        """
+        Embed a spaced test question and return the request ID.
+        Args:
+            data: Question data
+            metadata: Tool metadata
+        Returns:
+            The request ID
+        """
+        return self._embed_tool(
+            "SpacedTestQuestion",
+            data.to_question_data(),
+            metadata,
+            data.question_type,
+        )
+    def embed_spaced_test_question_and_wait(
+        self,
+        data: TestQuestionInput,
+        metadata: ToolMetadata,
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a spaced test question and wait for the result.
+        Args:
+            data: Question data
+            metadata: Tool metadata
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_and_wait(
+            "SpacedTestQuestion",
+            data.to_question_data(),
+            metadata,
+            data.question_type,
+            timeout,
+        )
+    # ==========================================================================
+    # AudioRecap Methods
+    # ==========================================================================
+    def embed_audio_recap(
+        self,
+        data: AudioRecapSectionData,
+        metadata: ToolMetadata,
+    ) -> str:
+        """
+        Embed an audio recap section and return the request ID.
+        Args:
+            data: Audio recap data (script)
+            metadata: Tool metadata
+        Returns:
+            The request ID
+        """
+        return self._embed_tool("AudioRecapV2Section", data, metadata, None)
+    def embed_audio_recap_and_wait(
+        self,
+        data: AudioRecapSectionData,
+        metadata: ToolMetadata,
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed an audio recap section and wait for the result.
+        Args:
+            data: Audio recap data
+            metadata: Tool metadata
+            timeout: Timeout in seconds (default: 60)
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_and_wait(
+            "AudioRecapV2Section",
+            data,
+            metadata,
+            None,
+            timeout,
+        )
+    # ==========================================================================
+    # Internal Methods
+    # ==========================================================================
+    def _embed_tool(
+        self,
+        tool_collection: ToolCollection,
+        data: dict[str, Any],
+        metadata: ToolMetadata,
+        sub_type: Optional[str],
+    ) -> str:
+        """Internal method to embed any tool type."""
+        # 1. Extract text using the spec
+        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+        if not text:
+            raise ValueError(
+                f"Failed to extract text from {tool_collection} - empty content"
+            )
+        # 2. Compute content hash
+        content_hash = compute_content_hash(
+            {"toolCollection": tool_collection, "data": data}
+        )
+        if not content_hash:
+            raise ValueError(
+                f"Failed to compute content hash for {tool_collection} - empty content"
+            )
+        # 3. Get tool config
+        tool_config = get_tool_config(tool_collection)
+        # 4. Build document with metadata
+        document = {
+            **metadata.to_dict(),
+            "toolCollection": tool_collection,
+            "contentHash": content_hash,
+        }
+        # 5. Build storage config using router
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=sub_type,
+            content_hash=content_hash,
+            document_fields=document,
+        )
+        # 6. Build text input
+        text_input = {
+            "id": content_hash,
+            "text": text,
+            "document": document,
+        }
+        # 7. Submit to embeddings namespace
+        return self._embeddings.create(
+            texts=[text_input],
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+    def _embed_tool_and_wait(
+        self,
+        tool_collection: ToolCollection,
+        data: dict[str, Any],
+        metadata: ToolMetadata,
+        sub_type: Optional[str],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """Internal method to embed any tool type and wait for result."""
+        # 1. Extract text using the spec
+        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+        if not text:
+            raise ValueError(
+                f"Failed to extract text from {tool_collection} - empty content"
+            )
+        # 2. Compute content hash
+        content_hash = compute_content_hash(
+            {"toolCollection": tool_collection, "data": data}
+        )
+        if not content_hash:
+            raise ValueError(
+                f"Failed to compute content hash for {tool_collection} - empty content"
+            )
+        # 3. Get tool config
+        tool_config = get_tool_config(tool_collection)
+        # 4. Build document with metadata
+        document = {
+            **metadata.to_dict(),
+            "toolCollection": tool_collection,
+            "contentHash": content_hash,
+        }
+        # 5. Build storage config using router
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=sub_type,
+            content_hash=content_hash,
+            document_fields=document,
+        )
+        # 6. Build text input
+        text_input = {
+            "id": content_hash,
+            "text": text,
+            "document": document,
+        }
+        # 7. Submit and wait using embeddings namespace
+        return self._embeddings.create_and_wait(
+            texts=[text_input],
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )

vector_sdk/structured/tool_config.py ADDED Viewed

@@ -0,0 +1,254 @@
+"""
+Tool Configuration for Structured Embeddings.
+Defines the configuration for each tool type including:
+- Embedding model and dimensions
+- TurboPuffer namespace patterns
+- Pinecone index and namespace patterns
+- Enabled/disabled status for each database
+"""
+from dataclasses import dataclass
+from typing import Literal, Optional
+from ..hash import FlashCardType, ToolCollection
+from ..types import PRIORITY_HIGH, PRIORITY_NORMAL
+# ============================================================================
+# Types
+# ============================================================================
+@dataclass(frozen=True)
+class ToolDatabaseConfig:
+    """
+    Database-specific configuration for a tool type.
+    Base class for the per-database configs; instances are immutable
+    (frozen dataclass).
+    """
+    # Whether this database is enabled for the tool type.
+    enabled: bool
+    # Name of a document field; every config in this module sets "contentHash".
+    # NOTE(review): presumably the field used as the record ID - confirm in router.
+    id_field: str
+    # Names of document fields.
+    # NOTE(review): presumably the fields stored as metadata - confirm in router.
+    metadata_fields: tuple[str, ...]
+@dataclass(frozen=True)
+class TurboPufferToolConfig(ToolDatabaseConfig):
+    """
+    TurboPuffer-specific configuration.
+    Adds the namespace pattern to the fields inherited from ToolDatabaseConfig.
+    """
+    # Namespace template; get_turbopuffer_namespace replaces a "{type}"
+    # placeholder in it with the sub-type suffix.
+    namespace_pattern: str
+@dataclass(frozen=True)
+class PineconeToolConfig(ToolDatabaseConfig):
+    """
+    Pinecone-specific configuration.
+    Adds the index name and namespace pattern to the fields inherited from
+    ToolDatabaseConfig.
+    """
+    # Name of the Pinecone index.
+    index_name: str
+    # Namespace template; get_pinecone_namespace replaces a "{type}"
+    # placeholder in it with the sub-type suffix.
+    namespace_pattern: str
+@dataclass(frozen=True)
+class ToolConfig:
+    """
+    Configuration for a tool collection type.
+    Bundles the embedding settings with one config per vector database.
+    Instances are immutable (frozen dataclass).
+    """
+    # The tool collection this configuration belongs to.
+    tool_collection: ToolCollection
+    # Embedding model name.
+    model: str
+    # Embedding dimensions requested for the model.
+    dimensions: int
+    # Priority used when none is chosen explicitly; the configs in this
+    # module use the PRIORITY_HIGH / PRIORITY_NORMAL constants.
+    default_priority: str
+    # TurboPuffer settings for this tool type.
+    turbopuffer: TurboPufferToolConfig
+    # Pinecone settings for this tool type.
+    pinecone: PineconeToolConfig
+# ============================================================================
+# Tool Configurations
+# ============================================================================
+_DEFAULT_METADATA_FIELDS = (
+    "toolId",
+    "toolCollection",
+    "topicId",
+    "userId",
+    "contentHash",
+)
+TOOL_CONFIGS: dict[ToolCollection, ToolConfig] = {
+    "FlashCard": ToolConfig(
+        tool_collection="FlashCard",
+        model="gemini-embedding-001",
+        dimensions=3072,
+        default_priority=PRIORITY_HIGH,
+        turbopuffer=TurboPufferToolConfig(
+            enabled=True,
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="flashcard_{type}_tool_embedding",
+        ),
+        pinecone=PineconeToolConfig(
+            enabled=False,
+            index_name="tool-vectors",
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="flashcard_{type}",
+        ),
+    ),
+    "TestQuestion": ToolConfig(
+        tool_collection="TestQuestion",
+        model="gemini-embedding-001",
+        dimensions=3072,
+        default_priority=PRIORITY_HIGH,
+        turbopuffer=TurboPufferToolConfig(
+            enabled=True,
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="testquestion_{type}_tool_embedding",
+        ),
+        pinecone=PineconeToolConfig(
+            enabled=False,
+            index_name="tool-vectors",
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="testquestion_{type}",
+        ),
+    ),
+    "SpacedTestQuestion": ToolConfig(
+        tool_collection="SpacedTestQuestion",
+        model="gemini-embedding-001",
+        dimensions=3072,
+        default_priority=PRIORITY_NORMAL,
+        turbopuffer=TurboPufferToolConfig(
+            enabled=True,
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="spacedtestquestion_{type}_tool_embedding",
+        ),
+        pinecone=PineconeToolConfig(
+            enabled=False,
+            index_name="tool-vectors",
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="spacedtestquestion_{type}",
+        ),
+    ),
+    "AudioRecapV2Section": ToolConfig(
+        tool_collection="AudioRecapV2Section",
+        model="gemini-embedding-001",
+        dimensions=3072,
+        default_priority=PRIORITY_NORMAL,
+        turbopuffer=TurboPufferToolConfig(
+            enabled=True,
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="audiorecapv2section_tool_embedding",
+        ),
+        pinecone=PineconeToolConfig(
+            enabled=False,
+            index_name="tool-vectors",
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="audiorecapv2section",
+        ),
+    ),
+}
+# ============================================================================
+# Sub-type Mappings
+# ============================================================================
+# Question sub-type identifiers. get_question_namespace_suffix returns the
+# value unchanged, so these strings appear verbatim in derived namespaces.
+QuestionType = Literal[
+    "multiplechoice",
+    "truefalse",
+    "shortanswer",
+    "fillinblank",
+    "frq",
+]
+def get_flashcard_namespace_suffix(card_type: Optional[FlashCardType]) -> str:
+    """Map FlashCardType to namespace suffix."""
+    mapping = {
+        "BASIC": "basic",
+        "CLOZE": "cloze",
+        "FILL_IN_THE_BLANK": "fillintheblank",
+        "MULTIPLE_CHOICE": "multiplechoice",
+    }
+    return mapping.get(card_type or "BASIC", "basic")
+def get_question_namespace_suffix(question_type: Optional[QuestionType]) -> str:
+    """Map question type string to namespace suffix."""
+    return question_type or "multiplechoice"
+# ============================================================================
+# Namespace Derivation
+# ============================================================================
+def get_turbopuffer_namespace(
+    tool_collection: ToolCollection,
+    sub_type: Optional[str] = None,
+) -> str:
+    """
+    Derive the TurboPuffer namespace for a tool.
+    Args:
+        tool_collection: The tool collection type
+        sub_type: The sub-type (FlashCardType or QuestionType)
+    Returns:
+        The derived namespace string
+    """
+    config = TOOL_CONFIGS[tool_collection]
+    pattern = config.turbopuffer.namespace_pattern
+    # AudioRecapV2Section doesn't have sub-types
+    if tool_collection == "AudioRecapV2Section":
+        return pattern
+    # Derive the type suffix
+    if tool_collection == "FlashCard":
+        type_suffix = get_flashcard_namespace_suffix(sub_type)  # type: ignore
+    else:
+        type_suffix = get_question_namespace_suffix(sub_type)  # type: ignore
+    return pattern.replace("{type}", type_suffix)
+def get_pinecone_namespace(
+    tool_collection: ToolCollection,
+    sub_type: Optional[str] = None,
+) -> str:
+    """
+    Derive the Pinecone namespace for a tool.
+    Args:
+        tool_collection: The tool collection type
+        sub_type: The sub-type (FlashCardType or QuestionType)
+    Returns:
+        The derived namespace string
+    """
+    config = TOOL_CONFIGS[tool_collection]
+    pattern = config.pinecone.namespace_pattern
+    # AudioRecapV2Section doesn't have sub-types
+    if tool_collection == "AudioRecapV2Section":
+        return pattern
+    # Derive the type suffix
+    if tool_collection == "FlashCard":
+        type_suffix = get_flashcard_namespace_suffix(sub_type)  # type: ignore
+    else:
+        type_suffix = get_question_namespace_suffix(sub_type)  # type: ignore
+    return pattern.replace("{type}", type_suffix)
+def get_tool_config(tool_collection: ToolCollection) -> ToolConfig:
+    """
+    Get the tool configuration for a tool collection.
+    Args:
+        tool_collection: The tool collection type
+    Returns:
+        The tool configuration
+    """
+    return TOOL_CONFIGS[tool_collection]