glean_indexing_sdk-0.0.3-py3-none-any.whl
This diff represents the content of a publicly available package version released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in the public registry.
- glean/indexing/__init__.py +56 -0
- glean/indexing/common/__init__.py +15 -0
- glean/indexing/common/batch_processor.py +31 -0
- glean/indexing/common/content_formatter.py +46 -0
- glean/indexing/common/glean_client.py +18 -0
- glean/indexing/common/metrics.py +54 -0
- glean/indexing/common/mocks.py +20 -0
- glean/indexing/connectors/__init__.py +21 -0
- glean/indexing/connectors/base_connector.py +60 -0
- glean/indexing/connectors/base_data_client.py +35 -0
- glean/indexing/connectors/base_datasource_connector.py +314 -0
- glean/indexing/connectors/base_people_connector.py +154 -0
- glean/indexing/connectors/base_streaming_data_client.py +39 -0
- glean/indexing/connectors/base_streaming_datasource_connector.py +184 -0
- glean/indexing/models.py +45 -0
- glean/indexing/observability/__init__.py +19 -0
- glean/indexing/observability/observability.py +262 -0
- glean/indexing/py.typed +1 -0
- glean/indexing/testing/__init__.py +13 -0
- glean/indexing/testing/connector_test_harness.py +53 -0
- glean/indexing/testing/mock_data_source.py +47 -0
- glean/indexing/testing/mock_glean_client.py +69 -0
- glean/indexing/testing/response_validator.py +52 -0
- glean_indexing_sdk-0.0.3.dist-info/METADATA +482 -0
- glean_indexing_sdk-0.0.3.dist-info/RECORD +27 -0
- glean_indexing_sdk-0.0.3.dist-info/WHEEL +4 -0
- glean_indexing_sdk-0.0.3.dist-info/licenses/LICENSE +21 -0

glean/indexing/__init__.py
@@ -0,0 +1,56 @@
+"""Glean Indexing SDK.
+
+A Python SDK for building custom Glean indexing solutions. This package provides
+the base classes and utilities to create custom connectors for Glean's indexing APIs.
+"""
+
+from importlib.metadata import version, PackageNotFoundError
+from glean.indexing.connectors import (
+    BaseConnector,
+    BaseDatasourceConnector,
+    BaseStreamingDatasourceConnector,
+    BasePeopleConnector,
+    BaseConnectorDataClient,
+    StreamingConnectorDataClient,
+)
+from glean.indexing.common import BatchProcessor, ContentFormatter, ConnectorMetrics, api_client, MockGleanClient
+from glean.indexing.observability.observability import ConnectorObservability
+from glean.indexing.testing import ConnectorTestHarness
+from glean.indexing.models import (
+    DatasourceIdentityDefinitions,
+    IndexingMode,
+    TSourceData,
+    TIndexableEntityDefinition,
+)
+from glean.indexing import models
+
+__all__ = [
+    "BaseConnector",
+    "BaseDatasourceConnector",
+    "BasePeopleConnector",
+    "BaseStreamingDatasourceConnector",
+
+    "BaseConnectorDataClient",
+    "StreamingConnectorDataClient",
+
+    "BatchProcessor",
+    "ContentFormatter",
+    "ConnectorMetrics",
+    "ConnectorObservability",
+    "ConnectorTestHarness",
+
+    "DatasourceIdentityDefinitions",
+    "IndexingMode",
+    "TSourceData",
+    "TIndexableEntityDefinition",
+
+    "MockGleanClient",
+    "api_client",
+
+    "models",
+]
+
+try:
+    __version__ = version("glean-indexing-sdk")
+except PackageNotFoundError:
+    __version__ = "0.0.3"
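
Everything above is re-exported at the package root, so connector code can import directly from `glean.indexing`. A minimal import sketch (names taken from the `__all__` list above):

from glean.indexing import BaseDatasourceConnector, BatchProcessor, IndexingMode

# IndexingMode.FULL is the default mode accepted by index_data() below.
mode = IndexingMode.FULL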

glean/indexing/common/__init__.py
@@ -0,0 +1,15 @@
+"""Common utilities and client implementations for Glean API integration."""
+
+from glean.indexing.common.glean_client import api_client
+from glean.indexing.common.mocks import MockGleanClient
+from glean.indexing.common.batch_processor import BatchProcessor
+from glean.indexing.common.content_formatter import ContentFormatter
+from glean.indexing.common.metrics import ConnectorMetrics
+
+__all__ = [
+    "api_client",
+    "MockGleanClient",
+    "BatchProcessor",
+    "ContentFormatter",
+    "ConnectorMetrics",
+]

glean/indexing/common/batch_processor.py
@@ -0,0 +1,31 @@
+"""Batch processing utility for efficient data handling."""
+
+import logging
+from typing import Generic, Iterator, Sequence, TypeVar
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+
+
+class BatchProcessor(Generic[T]):
+    """A utility for processing data in batches."""
+
+    def __init__(self, data: Sequence[T], batch_size: int = 100):
+        """Initialize the BatchProcessor.
+
+        Args:
+            data: The data to process in batches.
+            batch_size: The size of each batch.
+        """
+        self.data = data
+        self.batch_size = batch_size
+
+    def __iter__(self) -> Iterator[Sequence[T]]:
+        """Iterate over the data in batches.
+
+        Yields:
+            Sequences of items of size batch_size (except possibly the last batch).
+        """
+        for i in range(0, len(self.data), self.batch_size):
+            yield self.data[i : i + self.batch_size]
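
BatchProcessor is a thin iterable wrapper: iterating yields consecutive slices of the underlying sequence. A quick usage sketch:

from glean.indexing.common import BatchProcessor

items = list(range(250))
for batch in BatchProcessor(items, batch_size=100):
    # Yields slices of length 100, 100, and 50; only the final batch may be short.
    print(len(batch))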

glean/indexing/common/content_formatter.py
@@ -0,0 +1,46 @@
+"""Content formatting utility using Jinja2."""
+
+import logging
+from typing import Any, Dict
+
+from jinja2 import Environment
+
+logger = logging.getLogger(__name__)
+
+
+class ContentFormatter:
+    """A utility for formatting content using Jinja2 templates."""
+
+    def __init__(self, template_str: str):
+        """Initialize the ContentFormatter.
+
+        Args:
+            template_str: A Jinja2 template string.
+        """
+        self.env = Environment(autoescape=True)
+        self.template = self.env.from_string(template_str)
+
+    def render(self, context: Dict[str, Any]) -> str:
+        """Render the template with the given context.
+
+        Args:
+            context: A dictionary containing the context for rendering.
+
+        Returns:
+            The rendered template as a string.
+        """
+        return self.template.render(**context)
+
+    @classmethod
+    def from_file(cls, template_path: str) -> "ContentFormatter":
+        """Create a ContentFormatter from a template file.
+
+        Args:
+            template_path: Path to a Jinja2 template file.
+
+        Returns:
+            A ContentFormatter instance.
+        """
+        with open(template_path, "r", encoding="utf-8") as f:
+            template_str = f.read()
+        return cls(template_str)
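
The constructor takes a raw Jinja2 template string and enables autoescaping, so rendered values are HTML-escaped by default. A usage sketch (the template and field names are illustrative):

from glean.indexing.common import ContentFormatter

formatter = ContentFormatter("<h1>{{ title }}</h1><p>{{ body }}</p>")
html = formatter.render({"title": "Hello", "body": "a <b> tag"})
# autoescape=True escapes "<b>" to "&lt;b&gt;" in the rendered output.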

glean/indexing/common/glean_client.py
@@ -0,0 +1,18 @@
+"""Simple Glean API client helper for connectors."""
+
+import os
+
+from glean.api_client import Glean
+
+
+def api_client() -> Glean:
+    """Get the Glean API client."""
+    instance = os.getenv("GLEAN_INSTANCE")
+    api_token = os.getenv("GLEAN_INDEXING_API_TOKEN")
+
+    if not api_token or not instance:
+        raise ValueError(
+            "GLEAN_INDEXING_API_TOKEN and GLEAN_INSTANCE environment variables are required"
+        )
+
+    return Glean(api_token=api_token, instance=instance)
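
The helper pulls its configuration from the environment, so callers only need the two variables set before the first call; the values below are placeholders:

import os

from glean.indexing.common import api_client

os.environ["GLEAN_INSTANCE"] = "my-company"          # placeholder instance name
os.environ["GLEAN_INDEXING_API_TOKEN"] = "my-token"  # placeholder indexing token

# The Glean client supports context-manager use, which is how the connector code below calls it.
with api_client() as client:
    ...  # e.g. client.indexing.documents.bulk_index(...)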

glean/indexing/common/metrics.py
@@ -0,0 +1,54 @@
+"""Performance metrics tracking utility for connectors."""
+
+import logging
+import time
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ConnectorMetrics:
+    """A context manager for tracking connector metrics."""
+
+    def __init__(self, name: str, logger: Optional[logging.Logger] = None):
+        """Initialize the ConnectorMetrics.
+
+        Args:
+            name: The name of the operation being timed.
+            logger: An optional logger to use for metrics. If None, the default logger is used.
+        """
+        self.name = name
+        self.logger = logger or logging.getLogger(__name__)
+        self.start_time = 0
+        self.end_time = 0
+        self.stats: Dict[str, Any] = {}
+
+    def __enter__(self) -> "ConnectorMetrics":
+        """Enter the context manager, starting the timer.
+
+        Returns:
+            The ConnectorMetrics instance.
+        """
+        self.start_time = time.time()
+        self.logger.info(f"Starting {self.name}")
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager, stopping the timer and logging metrics."""
+        self.end_time = time.time()
+        duration = self.end_time - self.start_time
+        self.stats["duration"] = duration
+        self.logger.info(f"Completed {self.name} in {duration:.2f} seconds")
+
+        if self.stats:
+            self.logger.info(f"Metrics for {self.name}: {self.stats}")
+
+    def record(self, metric: str, value: Any) -> None:
+        """Record a metric.
+
+        Args:
+            metric: The name of the metric.
+            value: The value of the metric.
+        """
+        self.stats[metric] = value
+        self.logger.debug(f"Recorded metric {metric}={value} for {self.name}")
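
ConnectorMetrics is meant to wrap a unit of work as a context manager; the duration is captured automatically, and any extra values added via record() are logged on exit. A runnable sketch:

import time

from glean.indexing.common import ConnectorMetrics

with ConnectorMetrics("fetch_pages") as metrics:
    time.sleep(0.1)              # stand-in for real work
    metrics.record("pages", 42)  # arbitrary extra metric stored in stats
# On exit, stats["duration"] is set and all stats are logged at INFO level.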

glean/indexing/common/mocks.py
@@ -0,0 +1,20 @@
+"""Mock implementations for testing."""
+
+import logging
+from typing import List
+
+from glean.api_client.models import DocumentDefinition, EmployeeInfoDefinition
+
+logger = logging.getLogger(__name__)
+
+
+class MockGleanClient:
+    """Mock Glean API client for examples and testing."""
+
+    def batch_index_documents(self, datasource: str, documents: List[DocumentDefinition]) -> None:
+        """Mock method for indexing documents."""
+        logger.info(f"Mock indexing {len(documents)} documents to datasource '{datasource}'")
+
+    def bulk_index_employees(self, employees: List[EmployeeInfoDefinition]) -> None:
+        """Mock method for indexing employees."""
+        logger.info(f"Mock indexing {len(employees)} employees")
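
Because the mock only logs what it would have indexed, it is handy for dry runs and examples. A short sketch:

import logging

from glean.indexing.common import MockGleanClient

logging.basicConfig(level=logging.INFO)

client = MockGleanClient()
client.batch_index_documents("my_wiki", documents=[])  # logs the document count
client.bulk_index_employees(employees=[])              # logs the employee count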

glean/indexing/connectors/__init__.py
@@ -0,0 +1,21 @@
+"""Connector implementations for Glean indexing."""
+
+from glean.indexing.connectors.base_connector import BaseConnector
+from glean.indexing.connectors.base_data_client import BaseDataClient, BaseConnectorDataClient
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.connectors.base_people_connector import BasePeopleConnector
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient, StreamingConnectorDataClient
+from glean.indexing.connectors.base_streaming_datasource_connector import BaseStreamingDatasourceConnector
+from glean.indexing.testing.connector_test_harness import ConnectorTestHarness
+
+__all__ = [
+    "BaseConnector",
+    "BaseDataClient",
+    "BaseConnectorDataClient",  # Backward compatibility alias
+    "BaseDatasourceConnector",
+    "BasePeopleConnector",
+    "BaseStreamingDataClient",
+    "StreamingConnectorDataClient",  # Backward compatibility alias
+    "BaseStreamingDatasourceConnector",
+    "ConnectorTestHarness",
+]

glean/indexing/connectors/base_connector.py
@@ -0,0 +1,60 @@
+"""Base connector class for the Glean Connector SDK."""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Generic, Optional, Sequence
+
+from glean.indexing.models import IndexingMode, TIndexableEntityDefinition, TSourceData
+
+logger = logging.getLogger(__name__)
+
+
+class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
+    """
+    Abstract base class for all Glean connectors.
+
+    This class defines the core interface and lifecycle for all connector types (datasource, people, streaming, etc.).
+    Connector implementors should inherit from this class and provide concrete implementations for all abstract methods.
+
+    Type Parameters:
+        TSourceData: The type of raw data fetched from the external source (e.g., dict, TypedDict, or custom model).
+        TIndexableEntityDefinition: The type of Glean API entity definition produced by the connector (e.g., DocumentDefinition, EmployeeInfoDefinition).
+
+    Required Methods for Subclasses:
+        - get_data(since: Optional[str] = None) -> Sequence[TSourceData]:
+            Fetches source data from the external system. Should support incremental fetches if possible.
+        - transform(data: Sequence[TSourceData]) -> Sequence[TIndexableEntityDefinition]:
+            Transforms source data into Glean API entity definitions ready for indexing.
+        - index_data(mode: IndexingMode = IndexingMode.FULL) -> None:
+            Orchestrates the full indexing process (fetch, transform, upload).
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+
+    Example:
+        class MyConnector(BaseConnector[MyRawType, DocumentDefinition]):
+            ...
+    """
+
+    def __init__(self, name: str):
+        """Initialize the connector.
+
+        Args:
+            name: The name of the connector.
+        """
+        self.name = name
+
+    @abstractmethod
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """Get data from the data client or source system."""
+        pass
+
+    @abstractmethod
+    def transform(self, data: Sequence[TSourceData]) -> Sequence[TIndexableEntityDefinition]:
+        """Transform source data to Glean entity definitions."""
+        pass
+
+    @abstractmethod
+    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+        """Index data from the connector to Glean."""
+        pass
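
Fleshing out the docstring's stub, a hypothetical subclass supplies the three abstract methods; MyRawType, the hard-coded record, and the DocumentDefinition fields are illustrative stand-ins, not part of the SDK:

from typing import Optional, Sequence, TypedDict

from glean.api_client.models import DocumentDefinition
from glean.indexing import BaseConnector, IndexingMode


class MyRawType(TypedDict):
    id: str
    title: str


class MyConnector(BaseConnector[MyRawType, DocumentDefinition]):
    def get_data(self, since: Optional[str] = None) -> Sequence[MyRawType]:
        # A real connector would query its source system here, honoring `since`.
        return [{"id": "1", "title": "Hello"}]

    def transform(self, data: Sequence[MyRawType]) -> Sequence[DocumentDefinition]:
        return [DocumentDefinition(id=item["id"], title=item["title"]) for item in data]

    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
        documents = self.transform(self.get_data())
        ...  # upload via the Indexing API, as BaseDatasourceConnector does below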

glean/indexing/connectors/base_data_client.py
@@ -0,0 +1,35 @@
+"""Base data client interface for standard Glean connectors."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Generic, Sequence
+
+from glean.indexing.models import TSourceData
+
+
+class BaseDataClient(ABC, Generic[TSourceData]):
+    """
+    Base class for all connector data clients.
+
+    This interface defines how connectors fetch data from external sources.
+    All data clients should inherit from this class and implement get_source_data.
+
+    Type Parameters:
+        TSourceData: The type of data returned from the external source
+    """
+
+    @abstractmethod
+    def get_source_data(self, **kwargs: Any) -> Sequence[TSourceData]:
+        """
+        Fetch all data from the external source.
+
+        Args:
+            **kwargs: Additional parameters for data fetching (e.g., since timestamp)
+
+        Returns:
+            A sequence of data items from the source
+        """
+        pass
+
+
+# Alias for backward compatibility during transition
+BaseConnectorDataClient = BaseDataClient
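
A data client just wraps the source system's fetch call. A hypothetical sketch against a REST API (the endpoint, payload shape, and use of requests are assumptions, not part of the SDK):

from typing import Any, Dict, List, Sequence

import requests

from glean.indexing.connectors import BaseDataClient

WikiPage = Dict[str, Any]  # stand-in record type for the source system


class WikiDataClient(BaseDataClient[WikiPage]):
    def __init__(self, base_url: str):
        self.base_url = base_url

    def get_source_data(self, **kwargs: Any) -> Sequence[WikiPage]:
        # BaseDatasourceConnector.get_data() forwards since=... as a keyword argument.
        since = kwargs.get("since")
        params = {"modified_since": since} if since else {}
        response = requests.get(f"{self.base_url}/pages", params=params, timeout=30)
        response.raise_for_status()
        pages: List[WikiPage] = response.json()
        return pages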

glean/indexing/connectors/base_datasource_connector.py
@@ -0,0 +1,314 @@
+"""Base datasource connector for the Glean Connector SDK."""
+
+import logging
+import uuid
+from abc import ABC
+from typing import Optional, Sequence
+
+from glean.api_client.models import DocumentDefinition
+from glean.indexing.common import BatchProcessor, api_client
+from glean.indexing.connectors.base_connector import BaseConnector
+from glean.indexing.connectors.base_data_client import BaseConnectorDataClient
+from glean.indexing.models import (
+    CustomDatasourceConfig,
+    DatasourceIdentityDefinitions,
+    IndexingMode,
+    TSourceData,
+)
+from glean.indexing.observability.observability import ConnectorObservability
+
+logger = logging.getLogger(__name__)
+
+
+class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
+    """
+    Base class for all Glean datasource connectors.
+
+    This class provides the core logic for indexing document/content data from external systems into Glean.
+    Subclasses must define a `configuration` attribute of type `CustomDatasourceConfig` describing the datasource.
+
+    To implement a custom connector, inherit from this class and implement:
+    - configuration: CustomDatasourceConfig (class or instance attribute)
+    - get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]
+    - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+        configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
+        batch_size (int): The batch size for uploads (default: 1000).
+        data_client (BaseConnectorDataClient): The data client for fetching source data.
+        observability (ConnectorObservability): Observability and metrics for this connector.
+
+    Example:
+        class MyWikiConnector(BaseDatasourceConnector[WikiPageData]):
+            configuration = CustomDatasourceConfig(...)
+            ...
+    """
+
+    configuration: CustomDatasourceConfig
+
+    def __init__(self, name: str, data_client: BaseConnectorDataClient[TSourceData]):
+        """
+        Initialize the datasource connector.
+
+        Args:
+            name: The name of the connector
+            data_client: The data client for fetching source data
+        """
+        super().__init__(name)
+        self.data_client = data_client
+        self._observability = ConnectorObservability(name)
+        self.batch_size = 1000
+
+    @property
+    def display_name(self) -> str:
+        """Get the display name for this datasource."""
+        return self.name.replace("_", " ").title()
+
+    @property
+    def observability(self) -> ConnectorObservability:
+        """The observability instance for this connector."""
+        return self._observability
+
+    def get_identities(self) -> DatasourceIdentityDefinitions:
+        """
+        Gets all identities for this datasource (users, groups & memberships).
+
+        Returns:
+            A DatasourceIdentityDefinitions object containing all identities for this datasource.
+        """
+        return DatasourceIdentityDefinitions(users=[])
+
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """Get data from the datasource via the data client.
+
+        Args:
+            since: If provided, only get data modified since this timestamp.
+
+        Returns:
+            A sequence of source data items from the external system.
+        """
+        return self.data_client.get_source_data(since=since)
+
+    def configure_datasource(self, is_test: bool = False) -> None:
+        """
+        Configure the datasource in Glean using the datasources.add() API.
+
+        Args:
+            is_test: Whether this is a test datasource
+        """
+        config = self.configuration
+
+        if not config.name:
+            raise ValueError("Missing required field: name in Configuration")
+
+        if not config.display_name:
+            raise ValueError("Missing required field: display_name in Configuration")
+
+        logger.info(f"Configuring datasource: {config.name}")
+
+        if is_test:
+            config.is_test_datasource = True
+
+        with api_client() as client:
+            client.indexing.datasources.add(**config.dict(exclude_unset=True))
+            logger.info(f"Successfully configured datasource: {config.name}")
+
+    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+        """
+        Index data from the datasource to Glean with identity crawl followed by content crawl.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+        """
+        self._observability.start_execution()
+
+        try:
+            logger.info(f"Starting {mode.name.lower()} indexing for datasource '{self.name}'")
+
+            logger.info("Starting identity crawl")
+            identities = self.get_identities()
+
+            users = identities.get("users")
+            if users:
+                logger.info(f"Indexing {len(users)} users")
+                self._batch_index_users(users)
+
+            groups = identities.get("groups")
+            if groups:
+                logger.info(f"Indexing {len(groups)} groups")
+                self._batch_index_groups(groups)
+
+                memberships = identities.get("memberships")
+                if not memberships:
+                    raise ValueError("Groups were provided, but no memberships were provided.")
+
+                logger.info(f"Indexing {len(memberships)} memberships")
+                self._batch_index_memberships(memberships)
+
+            since = None
+            if mode == IndexingMode.INCREMENTAL:
+                since = self._get_last_crawl_timestamp()
+                logger.info(f"Incremental crawl since: {since}")
+
+            logger.info("Starting content crawl")
+            self._observability.start_timer("data_fetch")
+            data = self.get_data(since=since)
+            self._observability.end_timer("data_fetch")
+
+            logger.info(f"Retrieved {len(data)} items from datasource")
+            self._observability.record_metric("items_fetched", len(data))
+
+            self._observability.start_timer("data_transform")
+            documents = self.transform(data)
+            self._observability.end_timer("data_transform")
+
+            logger.info(f"Transformed {len(documents)} documents")
+            self._observability.record_metric("documents_transformed", len(documents))
+
+            self._observability.start_timer("data_upload")
+            if documents:
+                logger.info(f"Indexing {len(documents)} documents")
+                self._batch_index_documents(documents)
+            self._observability.end_timer("data_upload")
+
+            logger.info(f"Successfully indexed {len(documents)} documents to Glean")
+            self._observability.record_metric("documents_indexed", len(documents))
+
+        except Exception as e:
+            logger.exception(f"Error during indexing: {e}")
+            self._observability.increment_counter("indexing_errors")
+            raise
+        finally:
+            self._observability.end_execution()
+
+    def _batch_index_users(self, users) -> None:
+        """Index users in batches with proper page signaling."""
+        if not users:
+            return
+
+        batches = list(BatchProcessor(users, batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(users)} users in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.permissions.bulk_index_users(
+                        datasource=self.name,
+                        users=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"User batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload user batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _batch_index_groups(self, groups) -> None:
+        """Index groups in batches with proper page signaling."""
+        if not groups:
+            return
+
+        batches = list(BatchProcessor(groups, batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(groups)} groups in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.permissions.bulk_index_groups(
+                        datasource=self.name,
+                        groups=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"Group batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload group batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _batch_index_memberships(self, memberships) -> None:
+        """Index memberships in batches with proper page signaling."""
+        if not memberships:
+            return
+
+        batches = list(BatchProcessor(memberships, batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(memberships)} memberships in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.permissions.bulk_index_memberships(
+                        datasource=self.name,
+                        memberships=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"Membership batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload membership batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
+        """Index documents in batches with proper page signaling."""
+        if not documents:
+            return
+
+        batches = list(BatchProcessor(list(documents), batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(documents)} documents in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.documents.bulk_index(
+                        datasource=self.name,
+                        documents=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload document batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _get_last_crawl_timestamp(self) -> Optional[str]:
+        """
+        Get the timestamp of the last successful crawl for incremental indexing.
+
+        Subclasses should override this to implement proper timestamp tracking.
+
+        Returns:
+            ISO timestamp string or None for full crawl
+        """
+        return None
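
Putting the pieces together: define the datasource configuration, wire in a data client, register the datasource once, then crawl. This hypothetical sketch reuses the WikiPage and WikiDataClient stand-ins from the base_data_client.py section above; the CustomDatasourceConfig and DocumentDefinition fields shown are illustrative:

from typing import Sequence

from glean.api_client.models import DocumentDefinition
from glean.indexing import BaseDatasourceConnector, IndexingMode
from glean.indexing.models import CustomDatasourceConfig


class MyWikiConnector(BaseDatasourceConnector[WikiPage]):
    # Illustrative configuration; see CustomDatasourceConfig for the full field set.
    configuration = CustomDatasourceConfig(
        name="my_wiki",
        display_name="My Wiki",
    )

    def transform(self, data: Sequence[WikiPage]) -> Sequence[DocumentDefinition]:
        return [DocumentDefinition(id=page["id"], title=page["title"]) for page in data]


connector = MyWikiConnector("my_wiki", data_client=WikiDataClient("https://wiki.example.com/api"))
connector.configure_datasource(is_test=True)  # one-time datasource registration
connector.index_data(mode=IndexingMode.FULL)  # identity crawl, then content crawl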