glean-indexing-sdk 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
+ """Base people connector for the Glean Connector SDK."""
+
+ import logging
+ import uuid
+ from abc import ABC
+ from typing import Optional, Sequence
+
+ from glean.api_client.models import EmployeeInfoDefinition
+ from glean.indexing.common import BatchProcessor, api_client
+ from glean.indexing.connectors.base_connector import BaseConnector
+ from glean.indexing.connectors.base_data_client import BaseConnectorDataClient
+ from glean.indexing.models import IndexingMode, TSourceData
+ from glean.indexing.observability.observability import ConnectorObservability
+
+ logger = logging.getLogger(__name__)
+
+
+ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
+     """
+     Base class for all Glean people connectors.
+
+     This class provides the core logic for indexing people/identity data (users, groups, memberships) from external systems into Glean.
+     Subclasses must define a `configuration` attribute of type `CustomDatasourceConfig` describing the people source.
+
+     To implement a custom people connector, inherit from this class and implement:
+         - configuration: CustomDatasourceConfig (class or instance attribute)
+         - get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]
+         - transform(self, data: Sequence[TSourceData]) -> Sequence[EmployeeInfoDefinition]
+
+     Attributes:
+         name (str): The unique name of the connector (should be snake_case).
+         configuration (CustomDatasourceConfig): The people source configuration for Glean registration.
+         batch_size (int): The batch size for uploads (default: 1000).
+         data_client (BaseConnectorDataClient): The data client for fetching source data.
+         observability (ConnectorObservability): Observability and metrics for this connector.
+
+     Example:
+         class MyPeopleConnector(BasePeopleConnector[MyEmployeeData]):
+             configuration = CustomDatasourceConfig(...)
+             ...
+     """
+
+     def __init__(self, name: str, data_client: BaseConnectorDataClient[TSourceData]):
+         """
+         Initialize the people connector.
+
+         Args:
+             name: The name of the connector
+             data_client: The data client for fetching source data
+         """
+         super().__init__(name)
+         self.data_client = data_client
+         self._observability = ConnectorObservability(name)
+         self.batch_size = 1000
+
+     @property
+     def observability(self) -> ConnectorObservability:
+         """The observability instance for this connector."""
+         return self._observability
+
+     def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+         """Index people data to Glean.
+
+         Args:
+             mode: The indexing mode to use (FULL or INCREMENTAL).
+         """
+         self._observability.start_execution()
+
+         try:
+             logger.info(f"Starting {mode.name.lower()} people indexing for '{self.name}'")
+
+             since = None
+             if mode == IndexingMode.INCREMENTAL:
+                 since = self._get_last_crawl_timestamp()
+                 logger.info(f"Incremental crawl since: {since}")
+
+             self._observability.start_timer("data_fetch")
+             data = self.get_data(since=since)
+             self._observability.end_timer("data_fetch")
+
+             logger.info(f"Retrieved {len(data)} people from source")
+             self._observability.record_metric("people_fetched", len(data))
+
+             self._observability.start_timer("data_transform")
+             employees = self.transform(data)
+             self._observability.end_timer("data_transform")
+
+             logger.info(f"Transformed {len(employees)} employees")
+             self._observability.record_metric("employees_transformed", len(employees))
+
+             self._observability.start_timer("data_upload")
+             self._batch_index_employees(employees)
+             self._observability.end_timer("data_upload")
+
+             logger.info(f"Successfully indexed {len(employees)} employees to Glean")
+             self._observability.record_metric("employees_indexed", len(employees))
+
+         except Exception as e:
+             logger.exception(f"Error during people indexing: {e}")
+             self._observability.increment_counter("indexing_errors")
+             raise
+         finally:
+             self._observability.end_execution()
+
+     def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+         """Get data from the data client.
+
+         Args:
+             since: If provided, only get data modified since this timestamp.
+
+         Returns:
+             A sequence of source data items from the external system.
+         """
+         return self.data_client.get_source_data(since=since)
+
+     def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
+         """Index employees to Glean in batches."""
+         if not employees:
+             return
+
+         batches = list(BatchProcessor(list(employees), batch_size=self.batch_size))
+         total_batches = len(batches)
+
+         logger.info(f"Uploading {len(employees)} employees in {total_batches} batches")
+
+         upload_id = str(uuid.uuid4())
+         for i, batch in enumerate(batches):
+             try:
+                 with api_client() as client:
+                     client.indexing.people.bulk_index(
+                         employees=list(batch),
+                         upload_id=upload_id,
+                         is_first_page=(i == 0),
+                         is_last_page=(i == total_batches - 1),
+                     )
+
+                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
+                 self._observability.increment_counter("batches_uploaded")
+
+             except Exception as e:
+                 logger.error(f"Failed to upload employee batch {i + 1}/{total_batches}: {e}")
+                 self._observability.increment_counter("batch_upload_errors")
+                 raise
+
+     def _get_last_crawl_timestamp(self) -> Optional[str]:
+         """
+         Get the timestamp of the last successful crawl for incremental indexing.
+
+         Subclasses should override this to implement proper timestamp tracking.
+
+         Returns:
+             ISO timestamp string or None for full crawl
+         """
+         return None
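For orientation, here is a minimal sketch of what a subclass of the people connector above might look like, following the docstring's list of members to implement. The record shape (MyEmployeeData), the data client, the EmployeeInfoDefinition field names, the CustomDatasourceConfig kwargs, and the import path for BasePeopleConnector are assumptions for illustration, not guaranteed by this package.

from typing import Optional, Sequence, TypedDict

from glean.api_client.models import CustomDatasourceConfig, EmployeeInfoDefinition
from glean.indexing.connectors import BasePeopleConnector  # import path assumed
from glean.indexing.connectors.base_data_client import BaseConnectorDataClient


class MyEmployeeData(TypedDict):
    # Hypothetical raw record shape returned by the source HR system.
    email: str
    first_name: str
    last_name: str


class MyPeopleDataClient(BaseConnectorDataClient[MyEmployeeData]):
    def get_source_data(self, since: Optional[str] = None) -> Sequence[MyEmployeeData]:
        # Fetch from the source system here; `since` supports incremental crawls.
        return [{"email": "ada@example.com", "first_name": "Ada", "last_name": "Lovelace"}]


class MyPeopleConnector(BasePeopleConnector[MyEmployeeData]):
    configuration = CustomDatasourceConfig(name="my_people_source")  # kwargs illustrative

    def transform(self, data: Sequence[MyEmployeeData]) -> Sequence[EmployeeInfoDefinition]:
        # EmployeeInfoDefinition field names are assumed; check the model for the real schema.
        return [
            EmployeeInfoDefinition(
                email=item["email"],
                first_name=item["first_name"],
                last_name=item["last_name"],
            )
            for item in data
        ]


connector = MyPeopleConnector("my_people_source", MyPeopleDataClient())
connector.index_data()  # defaults to IndexingMode.FULL

Note that get_data is already provided by the base class (it simply delegates to the data client), so a subclass typically only has to supply transform and a configuration.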
@@ -0,0 +1,39 @@
+ """Base streaming data client interface for Glean connectors."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Generator, Generic
+
+ from glean.indexing.models import TSourceData
+
+
+ class BaseStreamingDataClient(ABC, Generic[TSourceData]):
+     """
+     Base class for streaming data clients that fetch data in chunks.
+
+     Use this for large datasets to minimize memory usage.
+     This class provides an iterable interface for data retrieval, allowing
+     for efficient processing of large datasets without loading all data into memory at once.
+
+     Type Parameters:
+         TSourceData: The type of data yielded from the external source
+     """
+
+     @abstractmethod
+     def get_source_data(self, **kwargs: Any) -> Generator[TSourceData, None, None]:
+         """
+         Retrieves source data as a generator.
+
+         This method should be implemented to return a generator
+         that yields data items one at a time or in small batches.
+
+         Args:
+             **kwargs: Additional keyword arguments for customizing data retrieval.
+
+         Returns:
+             A generator of data items.
+         """
+         pass
+
+
+ # Alias for backward compatibility during transition
+ StreamingConnectorDataClient = BaseStreamingDataClient
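As a rough illustration of this streaming contract, a concrete client could page through a source API and yield records one at a time, so the full dataset is never held in memory. The endpoint, page size, record type, and use of the requests library below are hypothetical; only the StreamingConnectorDataClient alias and the get_source_data signature come from the file above.

from typing import Any, Dict, Generator

import requests  # assumed third-party dependency, for this example only

from glean.indexing.connectors import StreamingConnectorDataClient  # alias defined above


class MyStreamingDataClient(StreamingConnectorDataClient[Dict[str, Any]]):
    def get_source_data(self, **kwargs: Any) -> Generator[Dict[str, Any], None, None]:
        page = 0
        while True:
            # Hypothetical paginated endpoint; substitute the real source API.
            response = requests.get(
                "https://api.example.com/records",
                params={"page": page, "page_size": 100, "since": kwargs.get("since")},
                timeout=30,
            )
            response.raise_for_status()
            items = response.json().get("items", [])
            if not items:
                return
            yield from items
            page += 1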
@@ -0,0 +1,184 @@
+ """Base streaming datasource connector for memory-efficient processing of large datasets."""
+
+ import logging
+ import uuid
+ from abc import ABC
+ from typing import Generator, List, Optional, Sequence
+
+ from glean.indexing.common import api_client
+ from glean.indexing.connectors import BaseDatasourceConnector, StreamingConnectorDataClient
+ from glean.indexing.models import IndexingMode, TSourceData
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
+     """
+     Base class for all Glean streaming datasource connectors.
+
+     This class provides the core logic for memory-efficient, incremental indexing of large document/content datasets from external systems into Glean.
+     Subclasses must define a `configuration` attribute of type `CustomDatasourceConfig` describing the datasource.
+
+     To implement a custom streaming connector, inherit from this class and implement:
+         - configuration: CustomDatasourceConfig (class or instance attribute)
+         - get_data(self, since: Optional[str] = None) -> Generator[TSourceData, None, None]
+         - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+     Attributes:
+         name (str): The unique name of the connector (should be snake_case).
+         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
+         batch_size (int): The batch size for uploads (default: 1000).
+         data_client (StreamingConnectorDataClient): The streaming data client for fetching source data.
+         observability (ConnectorObservability): Observability and metrics for this connector.
+
+     Notes:
+         - Use this class for very large datasets, paginated APIs, or memory-constrained environments.
+         - The data client should yield data incrementally (e.g., via a generator).
+
+     Example:
+         class MyStreamingConnector(BaseStreamingDatasourceConnector[MyDocData]):
+             configuration = CustomDatasourceConfig(...)
+             ...
+     """
+
+     def __init__(self, name: str, data_client: StreamingConnectorDataClient[TSourceData]):
+         # Note: We pass the streaming client as-is since it's a specialized version
+         # The type checker may warn about this, but it's intentional for streaming
+         super().__init__(name, data_client)  # type: ignore[arg-type]
+         self.batch_size = 1000
+         self._upload_id: Optional[str] = None
+
+     def generate_upload_id(self) -> str:
+         """Generate a unique upload ID for batch tracking."""
+         if not self._upload_id:
+             self._upload_id = str(uuid.uuid4())
+         return self._upload_id
+
+     def get_data(self, since: Optional[str] = None) -> Generator[TSourceData, None, None]:
+         """
+         Get data from the streaming data client.
+
+         Args:
+             since: If provided, only get data modified since this timestamp.
+
+         Yields:
+             Individual data items from the source
+         """
+         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
+         yield from self.data_client.get_source_data(since=since)
+
+     def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+         """
+         Index data from the datasource to Glean using streaming.
+
+         Args:
+             mode: The indexing mode to use (FULL or INCREMENTAL).
+         """
+         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
+
+         since = None
+         if mode == IndexingMode.INCREMENTAL:
+             since = "2023-01-01T00:00:00Z"
+
+         upload_id = self.generate_upload_id()
+         data_iterator = self.get_data(since=since)
+         is_first_batch = True
+         batch: List[TSourceData] = []
+         batch_count = 0
+
+         try:
+             for item in data_iterator:
+                 batch.append(item)
+
+                 if len(batch) == self.batch_size:
+                     try:
+                         next_item = next(data_iterator)
+
+                         self._process_batch(
+                             batch=batch,
+                             upload_id=upload_id,
+                             is_first_batch=is_first_batch,
+                             is_last_batch=False,
+                             batch_number=batch_count,
+                         )
+
+                         batch_count += 1
+                         batch = [next_item]
+                         is_first_batch = False
+
+                     except StopIteration:
+                         break
+
+             if batch:
+                 self._process_batch(
+                     batch=batch,
+                     upload_id=upload_id,
+                     is_first_batch=is_first_batch,
+                     is_last_batch=True,
+                     batch_number=batch_count,
+                 )
+
+             logger.info(
+                 f"Streaming indexing completed successfully. Processed {batch_count + 1} batches."
+             )
+
+         except Exception as e:
+             logger.exception(f"Error during streaming indexing: {e}")
+             raise
+
+     def _process_batch(
+         self,
+         batch: List[TSourceData],
+         upload_id: str,
+         is_first_batch: bool,
+         is_last_batch: bool,
+         batch_number: int,
+     ) -> None:
+         """
+         Process a single batch of data.
+
+         Args:
+             batch: The batch of raw data to process
+             upload_id: The upload ID for this indexing session
+             is_first_batch: Whether this is the first batch
+             is_last_batch: Whether this is the last batch
+             batch_number: The sequence number of this batch
+         """
+         logger.info(f"Processing batch {batch_number} with {len(batch)} items")
+
+         try:
+             transformed_batch = self.transform(batch)
+             logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
+
+             with api_client() as client:
+                 client.indexing.documents.bulk_index(
+                     datasource=self.name,
+                     documents=list(transformed_batch),
+                     upload_id=upload_id,
+                     is_first_page=is_first_batch,
+                     is_last_page=is_last_batch,
+                 )
+
+             logger.info(f"Batch {batch_number} indexed successfully")
+
+         except Exception as e:
+             logger.error(f"Failed to process batch {batch_number}: {e}")
+             raise
+
+     def get_data_non_streaming(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+         """
+         Get all data at once (non-streaming mode).
+
+         This method is required by the base class but shouldn't be used
+         for streaming connectors as it defeats the purpose of streaming.
+
+         Args:
+             since: If provided, only get data modified since this timestamp.
+
+         Returns:
+             A sequence of source data items from the external system.
+         """
+         logger.warning(
+             "get_data_non_streaming called on streaming connector - this may cause memory issues"
+         )
+         return list(self.get_data(since=since))
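Putting the streaming pieces together: a subclass typically supplies only a configuration and a transform, since get_data and the look-ahead batching in index_data above are inherited. In the sketch below, the record shape, the DocumentDefinition/ContentDefinition field names, the CustomDatasourceConfig kwargs, and the import path are assumptions; the data client is the hypothetical MyStreamingDataClient sketched after the previous file.

from typing import Any, Dict, Sequence

from glean.api_client.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition
from glean.indexing.connectors import BaseStreamingDatasourceConnector  # import path assumed
from glean.indexing.models import IndexingMode


class MyStreamingConnector(BaseStreamingDatasourceConnector[Dict[str, Any]]):
    configuration = CustomDatasourceConfig(name="my_docs_source")  # kwargs illustrative

    def transform(self, data: Sequence[Dict[str, Any]]) -> Sequence[DocumentDefinition]:
        # Field names on DocumentDefinition/ContentDefinition are assumed; verify against the models.
        return [
            DocumentDefinition(
                id=item["id"],
                title=item.get("title", ""),
                view_url=item.get("url", ""),
                body=ContentDefinition(mime_type="text/plain", text_content=item.get("text", "")),
            )
            for item in data
        ]


connector = MyStreamingConnector("my_docs_source", MyStreamingDataClient())
connector.index_data(IndexingMode.FULL)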
@@ -0,0 +1,45 @@
+ from enum import Enum
+ from typing import Any, Sequence, TypedDict, TypeVar
+
+ from glean.api_client.models import (
+     ContentDefinition,
+     CustomDatasourceConfig,
+     DocumentDefinition,
+     EmployeeInfoDefinition,
+     UserReferenceDefinition,
+ )
+
+
+ class IndexingMode(str, Enum):
+     """Specifies the indexing strategy for a datasource: full or incremental."""
+
+     FULL = "full"
+     INCREMENTAL = "incremental"
+
+
+ TSourceData = TypeVar("TSourceData")
+ """Type variable for the raw source data type used in indexing pipelines."""
+
+ TIndexableEntityDefinition = TypeVar("TIndexableEntityDefinition")
+ """Type variable for the Glean API entity definition produced by the connector (e.g., DocumentDefinition, EmployeeInfoDefinition)."""
+
+
+ class DatasourceIdentityDefinitions(TypedDict, total=False):
+     """Defines user, group, and membership identity data for a datasource."""
+
+     users: Sequence[Any]
+     groups: Sequence[Any]
+     memberships: Sequence[Any]
+
+
+ __all__ = [
+     "CustomDatasourceConfig",
+     "DocumentDefinition",
+     "EmployeeInfoDefinition",
+     "ContentDefinition",
+     "UserReferenceDefinition",
+     "IndexingMode",
+     "DatasourceIdentityDefinitions",
+     "TSourceData",
+     "TIndexableEntityDefinition",
+ ]
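A quick usage note on the helpers above: IndexingMode is a str-valued Enum, and DatasourceIdentityDefinitions is declared with total=False, so any subset of its keys may be supplied. A brief sketch, with placeholder user/group payloads since their concrete types depend on the datasource:

from glean.indexing.models import DatasourceIdentityDefinitions, IndexingMode

identities: DatasourceIdentityDefinitions = {
    "users": [{"email": "ada@example.com"}],  # placeholder payloads
    "groups": [{"name": "engineering"}],
    # "memberships" omitted: allowed because the TypedDict is total=False
}

assert IndexingMode.FULL == "full"  # str-valued Enum compares equal to its value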
@@ -0,0 +1,19 @@
+ """Observability and monitoring tools for Glean indexing."""
+
+ from glean.indexing.observability.observability import (
+     ConnectorObservability,
+     with_observability,
+     track_crawl_progress,
+     PerformanceTracker,
+     ProgressCallback,
+     setup_connector_logging,
+ )
+
+ __all__ = [
+     "ConnectorObservability",
+     "with_observability",
+     "track_crawl_progress",
+     "PerformanceTracker",
+     "ProgressCallback",
+     "setup_connector_logging",
+ ]
+ ]