glean-indexing-sdk 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ """Observability infrastructure for Glean connectors."""
2
+
3
+ import functools
4
+ import logging
5
+ import time
6
+ from collections import defaultdict
7
+ from typing import Any, Callable, Dict, List, Optional, TypeVar
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # Type variable for decorated classes
12
+ T = TypeVar("T")
13
+
14
+
15
class ConnectorObservability:
    """
    Centralized observability for connector operations.

    Tracks metrics, per-operation timings, and overall execution time for a
    single named connector, and emits structured log lines for each event.
    """

    def __init__(self, connector_name: str):
        """Initialize observability state for *connector_name*."""
        self.connector_name = connector_name
        # defaultdict(int) lets increment_counter() work without any
        # pre-registration of counter keys.
        self.metrics: Dict[str, Any] = defaultdict(int)
        # Maps operation name -> wall-clock start time (see start_timer).
        self.timers: Dict[str, float] = {}
        self.start_time: Optional[float] = None

    def start_execution(self) -> None:
        """Mark the start of connector execution."""
        self.start_time = time.time()
        logger.info(f"[{self.connector_name}] Execution started")

    def end_execution(self) -> None:
        """Mark the end of connector execution and record the total duration.

        A no-op (nothing logged or recorded) if start_execution() was never
        called.
        """
        # Compare against None rather than relying on truthiness: a start
        # timestamp of 0.0 is a valid float and must not read as "unset".
        if self.start_time is not None:
            duration = time.time() - self.start_time
            self.metrics["total_execution_time"] = duration
            logger.info(f"[{self.connector_name}] Execution completed in {duration:.2f}s")

    def record_metric(self, key: str, value: Any) -> None:
        """Record a custom metric, overwriting any previous value for *key*."""
        self.metrics[key] = value
        logger.debug(f"[{self.connector_name}] Metric recorded: {key}={value}")

    def increment_counter(self, key: str, value: int = 1) -> None:
        """Increment the counter metric *key* by *value* (default 1)."""
        self.metrics[key] += value

    def start_timer(self, operation: str) -> None:
        """Start timing *operation*; pair with end_timer()."""
        self.timers[operation] = time.time()

    def end_timer(self, operation: str) -> Optional[float]:
        """End timing *operation* and record an "<operation>_duration" metric.

        Returns:
            The measured duration in seconds, or None when no matching
            start_timer() call is pending for *operation*.
        """
        if operation in self.timers:
            duration = time.time() - self.timers[operation]
            self.record_metric(f"{operation}_duration", duration)
            del self.timers[operation]
            return duration
        return None

    def get_metrics_summary(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot of all collected metrics."""
        return dict(self.metrics)
65
+
66
+
67
def with_observability(
    exclude_methods: Optional[List[str]] = None,
    include_args: bool = False,
    include_return: bool = False,
) -> Callable[[type], type]:
    """
    Class decorator that adds comprehensive logging to all public instance methods.

    Each wrapped method logs start and completion (with duration), optionally
    its arguments and return value, and — when the decorated instance carries
    an ``_observability`` attribute — records a per-method duration metric on
    success and an error counter on failure. Exceptions are re-raised.

    Args:
        exclude_methods: List of method names to exclude from logging.
            Defaults to ["__init__", "__str__", "__repr__"] (dunders are
            additionally skipped by the public-name filter below).
        include_args: Whether to log method arguments
        include_return: Whether to log return values

    Returns:
        Decorated class with enhanced logging
    """
    if exclude_methods is None:
        exclude_methods = ["__init__", "__str__", "__repr__"]

    def decorator(cls: type) -> type:
        def wrap_method(method: Callable[..., Any]) -> Callable[..., Any]:
            if method.__name__ in exclude_methods:
                return method

            @functools.wraps(method)
            def wrapped_method(self, *args: Any, **kwargs: Any) -> Any:
                method_name = method.__name__
                class_name = self.__class__.__name__

                # Log method start
                if include_args:
                    logger.info(
                        f"[{class_name}] {method_name} started with args={args}, kwargs={kwargs}"
                    )
                else:
                    logger.info(f"[{class_name}] {method_name} started")

                start_time = time.time()

                try:
                    result = method(self, *args, **kwargs)
                    duration = time.time() - start_time

                    # Log successful completion
                    if include_return:
                        logger.info(
                            f"[{class_name}] {method_name} completed in {duration:.3f}s with result={result}"
                        )
                    else:
                        logger.info(f"[{class_name}] {method_name} completed in {duration:.3f}s")

                    # Record timing metric if observability is available
                    if hasattr(self, "_observability"):
                        self._observability.record_metric(f"{method_name}_duration", duration)

                    return result

                except Exception as e:
                    duration = time.time() - start_time
                    logger.error(f"[{class_name}] {method_name} failed after {duration:.3f}s: {e}")

                    # Record error metric if observability is available
                    if hasattr(self, "_observability"):
                        self._observability.increment_counter(f"{method_name}_errors")

                    raise

            return wrapped_method

        # Apply the wrapper to all public instance methods. Snapshot the
        # items first: setattr() mutates cls.__dict__, which we would
        # otherwise be iterating over.
        for attr_name, attr_value in list(cls.__dict__.items()):
            if attr_name.startswith("_"):
                continue
            # staticmethod/classmethod descriptors must be skipped: the
            # wrapper assumes the first positional argument is `self`, which
            # would mis-bind for them. On Python 3.10+ staticmethod objects
            # are callable, so the callable() test alone does not filter
            # them out.
            if isinstance(attr_value, (staticmethod, classmethod)):
                continue
            if callable(attr_value):
                setattr(cls, attr_name, wrap_method(attr_value))

        return cls

    return decorator
144
+
145
+
146
def track_crawl_progress(method: Callable[..., Any]) -> Callable[..., Any]:
    """
    Decorator that records how many items a crawl method produced.

    When the wrapped method returns a sized object (anything supporting
    len()), the item count is logged and — if the instance carries an
    ``_observability`` object — added to its "items_processed" and
    "total_items_crawled" counters. Non-sized results pass through untouched.
    """

    @functools.wraps(method)
    def wrapper(self, *args: Any, **kwargs: Any) -> Any:
        outcome = method(self, *args, **kwargs)

        # Only sized results are counted; other return types are ignored.
        if hasattr(outcome, "__len__"):
            count = len(outcome)
            if hasattr(self, "_observability"):
                self._observability.increment_counter("items_processed", count)
                self._observability.increment_counter("total_items_crawled", count)
            logger.info(f"Processed {count} items in {method.__name__}")

        return outcome

    return wrapper
168
+
169
+
170
class PerformanceTracker:
    """
    Context manager that times an operation and reports the outcome.

    On exit it logs success or failure with the elapsed time and, when a
    ConnectorObservability instance was supplied, records a duration metric
    (plus an error counter on failure). Exceptions are never suppressed.
    """

    def __init__(self, operation_name: str, observability: Optional[ConnectorObservability] = None):
        self.operation_name = operation_name
        self.observability = observability
        self.start_time: Optional[float] = None

    def __enter__(self):
        self.start_time = time.time()
        logger.info(f"Starting operation: {self.operation_name}")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Guard clause: nothing to report if __enter__ never ran.
        if not self.start_time:
            return
        duration = time.time() - self.start_time

        succeeded = exc_type is None
        if succeeded:
            logger.info(f"Operation '{self.operation_name}' completed in {duration:.3f}s")
        else:
            logger.error(
                f"Operation '{self.operation_name}' failed after {duration:.3f}s: {exc_val}"
            )

        if self.observability:
            self.observability.record_metric(f"{self.operation_name}_duration", duration)
            if not succeeded:
                self.observability.increment_counter(f"{self.operation_name}_errors")
200
+
201
+
202
class ProgressCallback:
    """
    Callback interface for tracking connector progress.

    Counts processed items and logs throughput; when a total is known,
    progress is also reported as a percentage.

    Args:
        total_items: Expected total item count, if known in advance.
    """

    def __init__(self, total_items: Optional[int] = None):
        self.total_items = total_items
        self.processed_items = 0
        self.start_time = time.time()

    def _rate(self) -> float:
        """Items/sec since construction; 0.0 when no time has elapsed.

        Guards against ZeroDivisionError on platforms with coarse clocks,
        where update()/complete() can run within the same clock tick as
        __init__.
        """
        elapsed = time.time() - self.start_time
        return self.processed_items / elapsed if elapsed > 0 else 0.0

    def update(self, items_processed: int) -> None:
        """Add *items_processed* to the running total and log progress."""
        self.processed_items += items_processed
        rate = self._rate()

        if self.total_items:
            progress_pct = (self.processed_items / self.total_items) * 100
            logger.info(
                f"Progress: {self.processed_items}/{self.total_items} ({progress_pct:.1f}%) - "
                f"Rate: {rate:.1f} items/sec"
            )
        else:
            logger.info(
                f"Progress: {self.processed_items} items processed - "
                f"Rate: {rate:.1f} items/sec"
            )

    def complete(self) -> None:
        """Log the final summary: total count, elapsed time, average rate."""
        elapsed = time.time() - self.start_time
        avg_rate = self.processed_items / elapsed if elapsed > 0 else 0.0
        logger.info(
            f"Processing complete: {self.processed_items} items in {elapsed:.2f}s "
            f"(avg rate: {avg_rate:.1f} items/sec)"
        )
236
+
237
+
238
def setup_connector_logging(
    connector_name: str, log_level: str = "INFO", log_format: Optional[str] = None
):
    """
    Configure root logging for a connector via logging.basicConfig.

    Args:
        connector_name: Name of the connector, embedded in each log record
        log_level: Level name such as DEBUG, INFO, WARNING, ERROR; an
            unrecognized name raises AttributeError
        log_format: Custom log format string; when omitted, a default
            including the connector name is used
    """
    fmt = (
        log_format
        if log_format is not None
        else f"%(asctime)s - {connector_name} - %(name)s - %(levelname)s - %(message)s"
    )

    handlers = [
        logging.StreamHandler(),
        # Add file handler if needed
        # logging.FileHandler(f"{connector_name}.log")
    ]
    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format=fmt,
        handlers=handlers,
    )

    logger.info(f"Logging configured for connector: {connector_name}")
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,13 @@
1
+ """Testing utilities for Glean connectors."""
2
+
3
+ from glean.indexing.testing.connector_test_harness import ConnectorTestHarness
4
+ from glean.indexing.testing.mock_data_source import MockDataSource
5
+ from glean.indexing.testing.mock_glean_client import MockGleanClient
6
+ from glean.indexing.testing.response_validator import ResponseValidator
7
+
8
+ __all__ = [
9
+ "ConnectorTestHarness",
10
+ "MockDataSource",
11
+ "MockGleanClient",
12
+ "ResponseValidator",
13
+ ]
@@ -0,0 +1,53 @@
1
+ """Test harness for running and validating connectors."""
2
+
3
+ import logging
4
+ from unittest.mock import patch
5
+
6
+ from glean.indexing.connectors import BaseConnector, BaseDatasourceConnector, BasePeopleConnector
7
+ from glean.indexing.testing.mock_glean_client import MockGleanClient
8
+ from glean.indexing.testing.response_validator import ResponseValidator
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class ConnectorTestHarness:
    """Test harness for connectors that works with the new dependency injection pattern."""

    def __init__(self, connector: BaseConnector):
        """Initialize the ConnectorTestHarness.

        Args:
            connector: The connector to test.
        """
        self.connector = connector
        self.validator = ResponseValidator()
        self.mock_client = MockGleanClient(self.validator)

    def run(self) -> None:
        """Execute the connector against the mock client, recording output."""
        logger.info(f"Running test harness for connector '{self.connector.name}'")

        # Start from a clean slate so repeated runs don't accumulate items.
        self.validator.reset()

        # Swap the real api_client factories for ones yielding the mock
        # client, for both supported connector base classes.
        datasource_patch = patch(
            "glean.indexing.connectors.base_datasource_connector.api_client"
        )
        people_patch = patch(
            "glean.indexing.connectors.base_people_connector.api_client"
        )
        with datasource_patch as mock_api_client:
            with people_patch as mock_people_api_client:
                mock_api_client.return_value.__enter__.return_value = self.mock_client
                mock_people_api_client.return_value.__enter__.return_value = self.mock_client

                # Run the connector for any supported type
                if isinstance(self.connector, (BaseDatasourceConnector, BasePeopleConnector)):
                    self.connector.index_data()
                else:
                    raise ValueError(f"Unsupported connector type: {type(self.connector)}")

    def get_validator(self) -> ResponseValidator:
        """Get the response validator for checking results."""
        return self.validator
@@ -0,0 +1,47 @@
1
+ """Mock data source for testing connectors."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class MockDataSource:
    """Canned in-memory data source for exercising connectors in tests."""

    def __init__(
        self,
        all_items: Optional[List[Dict[str, Any]]] = None,
        modified_items: Optional[List[Dict[str, Any]]] = None,
    ):
        """Initialize the MockDataSource.

        Args:
            all_items: Items to return for get_all_items.
            modified_items: Items to return for get_modified_items.
        """
        # Fall back to fresh lists so instances never share mutable state.
        self.all_items = all_items if all_items else []
        self.modified_items = modified_items if modified_items else []

    def get_all_items(self) -> List[Dict[str, Any]]:
        """Return every canned item.

        Returns:
            A list of all items.
        """
        logger.info(f"MockDataSource.get_all_items() returning {len(self.all_items)} items")
        return self.all_items

    def get_modified_items(self, since: str) -> List[Dict[str, Any]]:
        """Return the canned "modified" items; *since* only affects logging.

        Args:
            since: Timestamp to filter by.

        Returns:
            A list of modified items.
        """
        logger.info(
            f"MockDataSource.get_modified_items(since={since}) returning {len(self.modified_items)} items"
        )
        return self.modified_items
@@ -0,0 +1,69 @@
1
+ """Mock Glean API client for testing."""
2
+
3
+ import logging
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from glean.api_client.models import DocumentDefinition, EmployeeInfoDefinition
7
+ from glean.indexing.testing.response_validator import ResponseValidator
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class MockGleanClient:
    """Stand-in for the Glean API client that records everything indexed."""

    def __init__(self, validator: ResponseValidator):
        """Initialize the MockGleanClient.

        Args:
            validator: Validator to record posted items.
        """
        self.validator = validator

    def index_documents(
        self,
        datasource: str,
        documents: List[DocumentDefinition],
        upload_id: Optional[str] = None,
        is_first_page: bool = True,
        is_last_page: bool = True,
        **kwargs,
    ) -> Dict[str, Any]:
        """Record *documents* on the validator and return a canned response.

        Args:
            datasource: The datasource name.
            documents: The documents to index.
            upload_id: Optional upload ID for batch tracking
            is_first_page: Whether this is the first page of a multi-page upload
            is_last_page: Whether this is the last page of a multi-page upload
            **kwargs: Additional parameters

        Returns:
            Mock API response
        """
        count = len(documents)
        logger.info(f"Mock indexing {count} documents to datasource '{datasource}'")
        self.validator.documents_posted.extend(documents)
        return {"status": "success", "indexed": count}

    def index_employees(self, employees: List[EmployeeInfoDefinition], **kwargs) -> Dict[str, Any]:
        """Record *employees* on the validator and return a canned response.

        Args:
            employees: The employees to index.
            **kwargs: Additional parameters

        Returns:
            Mock API response
        """
        count = len(employees)
        logger.info(f"Mock indexing {count} employees")
        self.validator.employees_posted.extend(employees)
        return {"status": "success", "indexed": count}

    def batch_index_documents(self, datasource: str, documents: List[DocumentDefinition]) -> None:
        """Legacy alias that delegates to index_documents."""
        self.index_documents(datasource=datasource, documents=documents)

    def bulk_index_employees(self, employees: List[EmployeeInfoDefinition]) -> None:
        """Legacy alias that delegates to index_employees."""
        self.index_employees(employees=employees)
@@ -0,0 +1,52 @@
1
+ """Response validator for testing connector outputs."""
2
+
3
+ import logging
4
+ from typing import List, Optional
5
+
6
+ from glean.api_client.models import DocumentDefinition, EmployeeInfoDefinition
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class ResponseValidator:
    """Records and validates what a connector posted during a test run."""

    def __init__(self):
        """Initialize the ResponseValidator."""
        self.documents_posted: List[DocumentDefinition] = []
        self.employees_posted: List[EmployeeInfoDefinition] = []

    def assert_documents_posted(self, count: Optional[int] = None) -> None:
        """Assert that documents were posted.

        Args:
            count: Optional expected count of documents.
        """
        actual = len(self.documents_posted)
        if count is None:
            assert actual > 0, "No documents were posted"
        else:
            assert actual == count, (
                f"Expected {count} documents to be posted, but got {actual}"
            )
        logger.info(f"Validated {actual} documents posted")

    def assert_employees_posted(self, count: Optional[int] = None) -> None:
        """Assert that employees were posted.

        Args:
            count: Optional expected count of employees.
        """
        actual = len(self.employees_posted)
        if count is None:
            assert actual > 0, "No employees were posted"
        else:
            assert actual == count, (
                f"Expected {count} employees to be posted, but got {actual}"
            )
        logger.info(f"Validated {actual} employees posted")

    def reset(self) -> None:
        """Reset the validator state."""
        self.documents_posted.clear()
        self.employees_posted.clear()