glean-indexing-sdk 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glean/indexing/__init__.py +18 -18
- glean/indexing/connectors/__init__.py +7 -5
- glean/indexing/connectors/base_async_streaming_data_client.py +42 -0
- glean/indexing/connectors/base_async_streaming_datasource_connector.py +233 -0
- glean/indexing/connectors/base_connector.py +9 -2
- glean/indexing/connectors/base_data_client.py +0 -4
- glean/indexing/connectors/base_datasource_connector.py +33 -14
- glean/indexing/connectors/base_people_connector.py +32 -13
- glean/indexing/connectors/base_streaming_data_client.py +0 -4
- glean/indexing/connectors/base_streaming_datasource_connector.py +26 -12
- glean/indexing/testing/mock_glean_client.py +1 -0
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/METADATA +14 -1
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/RECORD +15 -13
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/WHEEL +1 -1
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/licenses/LICENSE +0 -0
glean/indexing/__init__.py
CHANGED
```diff
@@ -1,56 +1,56 @@
 """Glean Indexing SDK.
 
-A Python SDK for building custom Glean indexing solutions. This package provides
+A Python SDK for building custom Glean indexing solutions. This package provides
 the base classes and utilities to create custom connectors for Glean's indexing APIs.
 """
 
-from importlib.metadata import
+from importlib.metadata import PackageNotFoundError, version
+
+from glean.indexing import models
+from glean.indexing.common import BatchProcessor, ConnectorMetrics, ContentFormatter, MockGleanClient, api_client
 from glean.indexing.connectors import (
+    BaseAsyncStreamingDataClient,
+    BaseAsyncStreamingDatasourceConnector,
     BaseConnector,
+    BaseDataClient,
     BaseDatasourceConnector,
-    BaseStreamingDatasourceConnector,
     BasePeopleConnector,
-
-
+    BaseStreamingDataClient,
+    BaseStreamingDatasourceConnector,
 )
-from glean.indexing.common import BatchProcessor, ContentFormatter, ConnectorMetrics, api_client, MockGleanClient
-from glean.indexing.observability.observability import ConnectorObservability
-from glean.indexing.testing import ConnectorTestHarness
 from glean.indexing.models import (
     DatasourceIdentityDefinitions,
     IndexingMode,
-    TSourceData,
     TIndexableEntityDefinition,
+    TSourceData,
 )
-from glean.indexing import
+from glean.indexing.observability.observability import ConnectorObservability
+from glean.indexing.testing import ConnectorTestHarness
 
 __all__ = [
     "BaseConnector",
+    "BaseDataClient",
     "BaseDatasourceConnector",
     "BasePeopleConnector",
+    "BaseStreamingDataClient",
     "BaseStreamingDatasourceConnector",
-
-    "
-    "StreamingConnectorDataClient",
-
+    "BaseAsyncStreamingDataClient",
+    "BaseAsyncStreamingDatasourceConnector",
     "BatchProcessor",
     "ContentFormatter",
     "ConnectorMetrics",
     "ConnectorObservability",
     "ConnectorTestHarness",
-
     "DatasourceIdentityDefinitions",
     "IndexingMode",
     "TSourceData",
     "TIndexableEntityDefinition",
-
     "MockGleanClient",
     "api_client",
-
     "models",
 ]
 
 try:
     __version__ = version("glean-indexing-sdk")
 except PackageNotFoundError:
-    __version__ = "0.
+    __version__ = "0.3.0"
```
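For downstream users, the practical effect of this re-export shuffle is that the 0.1.0 backward-compatibility aliases are gone and the data-client base classes are now exported from the package root. A migration sketch (the removed names are taken from the deleted `__all__` entries; the exact 0.1.0 import forms are not fully visible in this diff):

```python
# 0.1.0 (removed): backward-compatibility aliases such as
# from glean.indexing import StreamingConnectorDataClient

# 0.3.0: import the Base* names directly from the package root
from glean.indexing import (
    BaseAsyncStreamingDataClient,
    BaseDataClient,
    BaseStreamingDataClient,
)
```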

glean/indexing/connectors/__init__.py
CHANGED
```diff
@@ -1,21 +1,23 @@
 """Connector implementations for Glean indexing."""
 
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import BaseDataClient
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
 from glean.indexing.connectors.base_people_connector import BasePeopleConnector
-from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
 from glean.indexing.connectors.base_streaming_datasource_connector import BaseStreamingDatasourceConnector
+from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+from glean.indexing.connectors.base_async_streaming_datasource_connector import BaseAsyncStreamingDatasourceConnector
 from glean.indexing.testing.connector_test_harness import ConnectorTestHarness
 
 __all__ = [
     "BaseConnector",
     "BaseDataClient",
-    "BaseConnectorDataClient", # Backward compatibility alias
     "BaseDatasourceConnector",
     "BasePeopleConnector",
-    "BaseStreamingDataClient",
-    "StreamingConnectorDataClient", # Backward compatibility alias
+    "BaseStreamingDataClient",
     "BaseStreamingDatasourceConnector",
+    "BaseAsyncStreamingDataClient",
+    "BaseAsyncStreamingDatasourceConnector",
     "ConnectorTestHarness",
 ]
```

glean/indexing/connectors/base_async_streaming_data_client.py
ADDED
```diff
@@ -0,0 +1,42 @@
+"""Base async streaming data client for fetching data in chunks."""
+
+from abc import ABC, abstractmethod
+from typing import Any, AsyncGenerator, Generic
+
+from glean.indexing.models import TSourceData
+
+
+class BaseAsyncStreamingDataClient(ABC, Generic[TSourceData]):
+    """
+    Base class for async streaming data clients that fetch data in chunks.
+
+    Use this for large datasets with async APIs to minimize memory usage
+    and maximize I/O throughput.
+
+    Type Parameters:
+        TSourceData: The type of data yielded from the external source
+
+    Example:
+        class MyAsyncDataClient(BaseAsyncStreamingDataClient[MyDocData]):
+            async def get_source_data(self, **kwargs) -> AsyncGenerator[MyDocData, None]:
+                async for page in self.fetch_pages():
+                    for item in page:
+                        yield item
+    """
+
+    @abstractmethod
+    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[TSourceData, None]:
+        """
+        Retrieves source data as an async generator.
+
+        This method should be implemented to return an async generator
+        that yields data items one at a time or in small batches.
+
+        Args:
+            **kwargs: Additional keyword arguments for customizing data retrieval.
+
+        Yields:
+            Individual data items from the external source.
+        """
+        if False:
+            yield  # type: ignore[misc]
```
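To make the contract concrete, here is a minimal hypothetical subclass; the paginated endpoint, the `httpx` dependency, and the `dict` payload type are illustrative assumptions, not part of the SDK:

```python
from typing import Any, AsyncGenerator

import httpx  # assumption: any async HTTP client would do

from glean.indexing.connectors import BaseAsyncStreamingDataClient


class WikiAsyncDataClient(BaseAsyncStreamingDataClient[dict]):
    """Hypothetical client paging through a wiki's REST API."""

    def __init__(self, base_url: str) -> None:
        self.base_url = base_url

    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[dict, None]:
        since = kwargs.get("since")  # forwarded by the connector for incremental crawls
        params: dict = {"page": 0}
        if since:
            params["modified_since"] = since  # hypothetical query parameter
        async with httpx.AsyncClient() as http:
            while True:
                resp = await http.get(f"{self.base_url}/articles", params=params)
                resp.raise_for_status()
                items = resp.json()
                if not items:
                    return  # ends the async generator
                for item in items:
                    yield item  # one item at a time keeps memory flat
                params["page"] += 1
```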

glean/indexing/connectors/base_async_streaming_datasource_connector.py
ADDED
```diff
@@ -0,0 +1,233 @@
+"""Base async streaming datasource connector for memory-efficient processing of large datasets."""
+
+import asyncio
+import logging
+import uuid
+from abc import ABC
+from typing import AsyncGenerator, List, Optional, Sequence
+
+from glean.indexing.common import api_client
+from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.models import IndexingMode, TSourceData
+
+logger = logging.getLogger(__name__)
+
+
+class BaseAsyncStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
+    """
+    Base class for async streaming datasource connectors.
+
+    This class provides async-native streaming for memory-efficient processing
+    of large datasets. Use this when your data source provides async APIs
+    (e.g., aiohttp, httpx async, etc.).
+
+    To implement a custom async streaming connector, inherit from this class and implement:
+    - configuration: CustomDatasourceConfig (class or instance attribute)
+    - async_data_client: BaseAsyncStreamingDataClient (set in __init__)
+    - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+        configuration (CustomDatasourceConfig): The datasource configuration.
+        batch_size (int): The batch size for uploads (default: 1000).
+        async_data_client (BaseAsyncStreamingDataClient): The async streaming data client.
+
+    Example:
+        class MyAsyncConnector(BaseAsyncStreamingDatasourceConnector[MyDocData]):
+            configuration = CustomDatasourceConfig(...)
+
+            def __init__(self, name: str):
+                async_client = MyAsyncDataClient()
+                super().__init__(name, async_client)
+
+            def transform(self, data: Sequence[MyDocData]) -> Sequence[DocumentDefinition]:
+                return [self._transform_doc(d) for d in data]
+    """
+
+    def __init__(
+        self,
+        name: str,
+        async_data_client: BaseAsyncStreamingDataClient[TSourceData],
+    ):
+        super().__init__(name, None)  # type: ignore[arg-type]
+        self.async_data_client = async_data_client
+        self.batch_size = 1000
+        self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
+
+    def generate_upload_id(self) -> str:
+        """Generate a unique upload ID for batch tracking."""
+        if not self._upload_id:
+            self._upload_id = str(uuid.uuid4())
+        return self._upload_id
+
+    async def get_data_async(
+        self, since: Optional[str] = None
+    ) -> AsyncGenerator[TSourceData, None]:
+        """
+        Get data from the async streaming data client.
+
+        Args:
+            since: If provided, only get data modified since this timestamp.
+
+        Yields:
+            Individual data items from the source
+        """
+        logger.info(
+            f"Fetching async streaming data from source{' since ' + since if since else ''}"
+        )
+        async for item in self.async_data_client.get_source_data(since=since):
+            yield item
+
+    async def index_data_async(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """
+        Index data from the datasource to Glean using async streaming.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload.
+        """
+        logger.info(
+            f"Starting {mode.name.lower()} async streaming indexing for datasource '{self.name}'"
+        )
+
+        since = None
+        if mode == IndexingMode.INCREMENTAL:
+            since = self._get_last_crawl_timestamp()
+            logger.info(f"Incremental crawl since: {since}")
+
+        upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
+        is_first_batch = True
+        batch: List[TSourceData] = []
+        batch_count = 0
+
+        try:
+            data_iterator = self.get_data_async(since=since).__aiter__()
+            exhausted = False
+
+            while not exhausted:
+                try:
+                    item = await data_iterator.__anext__()
+                    batch.append(item)
+
+                    if len(batch) == self.batch_size:
+                        try:
+                            next_item = await data_iterator.__anext__()
+
+                            await self._process_batch_async(
+                                batch=batch,
+                                upload_id=upload_id,
+                                is_first_batch=is_first_batch,
+                                is_last_batch=False,
+                                batch_number=batch_count,
+                            )
+
+                            batch_count += 1
+                            batch = [next_item]
+                            is_first_batch = False
+
+                        except StopAsyncIteration:
+                            exhausted = True
+
+                except StopAsyncIteration:
+                    exhausted = True
+
+            if batch:
+                await self._process_batch_async(
+                    batch=batch,
+                    upload_id=upload_id,
+                    is_first_batch=is_first_batch,
+                    is_last_batch=True,
+                    batch_number=batch_count,
+                )
+                batch_count += 1
+
+            logger.info(
+                f"Async streaming indexing completed successfully. Processed {batch_count} batches."
+            )
+
+        except Exception as e:
+            logger.exception(f"Error during async streaming indexing: {e}")
+            raise
+
+    async def _process_batch_async(
+        self,
+        batch: List[TSourceData],
+        upload_id: str,
+        is_first_batch: bool,
+        is_last_batch: bool,
+        batch_number: int,
+    ) -> None:
+        """
+        Process a single batch of data.
+
+        Args:
+            batch: The batch of raw data to process
+            upload_id: The upload ID for this indexing session
+            is_first_batch: Whether this is the first batch
+            is_last_batch: Whether this is the last batch
+            batch_number: The sequence number of this batch
+        """
+        logger.info(f"Processing batch {batch_number} with {len(batch)} items")
+
+        try:
+            transformed_batch = self.transform(batch)
+            logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
+
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
+            with api_client() as client:
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
+
+            logger.info(f"Batch {batch_number} indexed successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to process batch {batch_number}: {e}")
+            raise
+
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """
+        Sync fallback - collects all data into memory.
+
+        Warning: This defeats the purpose of streaming. Use get_data_async() instead.
+        """
+
+        async def collect() -> List[TSourceData]:
+            result: List[TSourceData] = []
+            async for item in self.get_data_async(since=since):
+                result.append(item)
+            return result
+
+        logger.warning(
+            "Sync get_data() called on async connector - using asyncio.run(). "
+            "Consider using get_data_async() for better performance."
+        )
+        return asyncio.run(collect())
+
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """
+        Sync fallback for index_data.
+
+        Warning: This blocks the current thread. Use index_data_async() instead.
+        """
+        logger.warning(
+            "Sync index_data() called on async connector - using asyncio.run(). "
+            "Consider using index_data_async() for better performance."
+        )
+        asyncio.run(self.index_data_async(mode=mode, force_restart=force_restart))
```
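Worth noting in `index_data_async`: when a batch fills up, the loop pulls one extra item before flushing, which is how it can mark the flush `is_last_page` correctly without buffering the whole stream. Driving the connector end to end is then short; a sketch assuming a `MyAsyncConnector` subclass like the one in the class docstring above (a hypothetical name, not an SDK class):

```python
import asyncio

from glean.indexing.models import IndexingMode

connector = MyAsyncConnector("my_wiki")  # hypothetical subclass from the docstring
connector.configure_datasource()  # register the datasource first

# Full crawl; force_restart=True discards any half-finished upload from a prior run.
asyncio.run(connector.index_data_async(mode=IndexingMode.FULL, force_restart=True))
```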

glean/indexing/connectors/base_connector.py
CHANGED
```diff
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
         pass
 
     @abstractmethod
-    def index_data(
-
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """Index data from the connector to Glean.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+        """
         pass
```

glean/indexing/connectors/base_datasource_connector.py
CHANGED
```diff
@@ -6,9 +6,10 @@ from abc import ABC
 from typing import Optional, Sequence
 
 from glean.api_client.models import DocumentDefinition
+
 from glean.indexing.common import BatchProcessor, api_client
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.models import (
     CustomDatasourceConfig,
     DatasourceIdentityDefinitions,
@@ -36,7 +37,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseDataClient): The data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Example:
@@ -47,7 +48,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
 
     configuration: CustomDatasourceConfig
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
         """
         Initialize the datasource connector.
 
@@ -114,12 +115,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
             client.indexing.datasources.add(**config.dict(exclude_unset=True))
             logger.info(f"Successfully configured datasource: {config.name}")
 
-    def index_data(
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean with identity crawl followed by content crawl.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -169,7 +174,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         self._observability.start_timer("data_upload")
         if documents:
             logger.info(f"Indexing {len(documents)} documents")
-            self._batch_index_documents(documents)
+            self._batch_index_documents(documents, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +277,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
             self._observability.increment_counter("batch_upload_errors")
             raise
 
-    def _batch_index_documents(
-
+    def _batch_index_documents(
+        self, documents: Sequence[DocumentDefinition], force_restart: bool = False
+    ) -> None:
+        """Index documents in batches with proper page signaling.
+
+        Args:
+            documents: The documents to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not documents:
             return
 
@@ -285,14 +297,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "datasource": self.name,
+                    "documents": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.documents.bulk_index(
-                        datasource=self.name,
-                        documents=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```
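The batched upload now assembles its `bulk_index` arguments as a kwargs dict so that `forceRestartUpload` rides only on the first page; `BasePeopleConnector` below applies the same pattern to `client.indexing.people.bulk_index`. The paging contract is small enough to state on its own; a standalone sketch of the flag logic (plain Python, the names are mine, not SDK API):

```python
from typing import Any, Dict, List


def page_flags(batches: List[List[Any]], force_restart: bool) -> List[Dict[str, Any]]:
    """Reproduce the first/last-page signaling used by _batch_index_documents."""
    calls = []
    total = len(batches)
    for i, batch in enumerate(batches):
        kwargs: Dict[str, Any] = {
            "documents": batch,
            "is_first_page": i == 0,
            "is_last_page": i == total - 1,
        }
        if force_restart and i == 0:
            kwargs["forceRestartUpload"] = True  # first page only
        calls.append(kwargs)
    return calls


assert page_flags([[1], [2], [3]], force_restart=True)[0]["forceRestartUpload"] is True
assert "forceRestartUpload" not in page_flags([[1], [2]], force_restart=True)[1]
```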

glean/indexing/connectors/base_people_connector.py
CHANGED
```diff
@@ -6,9 +6,10 @@ from abc import ABC
 from typing import Optional, Sequence
 
 from glean.api_client.models import EmployeeInfoDefinition
+
 from glean.indexing.common import BatchProcessor, api_client
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.models import IndexingMode, TSourceData
 from glean.indexing.observability.observability import ConnectorObservability
 
@@ -31,7 +32,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The people source configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseDataClient): The data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Example:
@@ -40,7 +41,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         ...
     """
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
         """
         Initialize the people connector.
 
@@ -58,11 +59,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         """The observability instance for this connector."""
         return self._observability
 
-    def index_data(
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """Index people data to Glean.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -89,7 +94,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         self._observability.record_metric("employees_transformed", len(employees))
 
         self._observability.start_timer("data_upload")
-        self._batch_index_employees(employees)
+        self._batch_index_employees(employees, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +118,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         """
         return self.data_client.get_source_data(since=since)
 
-    def _batch_index_employees(
-
+    def _batch_index_employees(
+        self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
+    ) -> None:
+        """Index employees to Glean in batches.
+
+        Args:
+            employees: The employees to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not employees:
             return
 
@@ -126,13 +138,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "employees": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.people.bulk_index(
-                        employees=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.people.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```

glean/indexing/connectors/base_streaming_datasource_connector.py
CHANGED
```diff
@@ -6,7 +6,8 @@ from abc import ABC
 from typing import Generator, List, Optional, Sequence
 
 from glean.indexing.common import api_client
-from glean.indexing.connectors import BaseDatasourceConnector
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
 from glean.indexing.models import IndexingMode, TSourceData
 
 logger = logging.getLogger(__name__)
@@ -28,7 +29,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseStreamingDataClient): The streaming data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Notes:
@@ -41,12 +42,13 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         ...
     """
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseStreamingDataClient[TSourceData]):
         # Note: We pass the streaming client as-is since it's a specialized version
         # The type checker may warn about this, but it's intentional for streaming
         super().__init__(name, data_client)  # type: ignore[arg-type]
         self.batch_size = 1000
         self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
 
     def generate_upload_id(self) -> str:
         """Generate a unique upload ID for batch tracking."""
@@ -67,20 +69,26 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
         yield from self.data_client.get_source_data(since=since)
 
-    def index_data(
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean using streaming.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
 
         since = None
         if mode == IndexingMode.INCREMENTAL:
-            since =
+            since = self._get_last_crawl_timestamp()
+            logger.info(f"Incremental crawl since: {since}")
 
         upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
         data_iterator = self.get_data(since=since)
         is_first_batch = True
         batch: List[TSourceData] = []
@@ -150,14 +158,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
             transformed_batch = self.transform(batch)
             logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
 
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
             with api_client() as client:
-                client.indexing.documents.bulk_index(
-                    datasource=self.name,
-                    documents=list(transformed_batch),
-                    upload_id=upload_id,
-                    is_first_page=is_first_batch,
-                    is_last_page=is_last_batch,
-                )
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
             logger.info(f"Batch {batch_number} indexed successfully")
 
```
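The sync streaming connector labels its batches the same way as the async variant: it cannot know a batch is the last page until it has peeked past it. A minimal sketch of that look-ahead idea, written independently of the SDK's actual loop:

```python
from itertools import islice
from typing import Iterable, Iterator, List, Tuple, TypeVar

T = TypeVar("T")


def lookahead_batches(items: Iterable[T], size: int) -> Iterator[Tuple[List[T], bool]]:
    """Yield (batch, is_last) pairs without materializing the whole stream."""
    it = iter(items)
    batch = list(islice(it, size))
    while batch:
        next_batch = list(islice(it, size))  # peek one batch ahead
        yield batch, not next_batch  # is_last when nothing follows
        batch = next_batch


for batch, is_last in lookahead_batches(range(10), 4):
    print(batch, "last" if is_last else "more to come")
# [0..3] more, [4..7] more, [8, 9] last
```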

{glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glean-indexing-sdk
-Version: 0.
+Version: 0.3.0
 Summary: SDK for building custom Glean indexing integrations
 Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
 Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
@@ -423,6 +435,7 @@ class LargeKnowledgeBaseClient(StreamingConnectorDataClient[ArticleData]):
 from typing import List, Sequence
 
 from glean.api_client.models.userreferencedefinition import UserReferenceDefinition
+
 from glean.indexing.connectors import BaseStreamingDatasourceConnector
 from glean.indexing.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition
 
````
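In code, the behavior the README text above describes is opt-in via one keyword argument; a sketch reusing the README's `connector` object:

```python
from glean.indexing.models import IndexingMode

# A new upload_id is generated and forceRestartUpload=True is sent with the
# first batch only; later batches proceed normally.
connector.index_data(mode=IndexingMode.FULL, force_restart=True)
```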

{glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/RECORD
CHANGED
```diff
@@ -1,4 +1,4 @@
-glean/indexing/__init__.py,sha256=
+glean/indexing/__init__.py,sha256=APnkKfvATYeZF1NCePp7V2OAa5mwWTf7D_aCKaYV9Gw,1629
 glean/indexing/models.py,sha256=UuaEDCx0ygvU4u0lRbSn4YXXZVo7D_pyD_whQtjORm8,1223
 glean/indexing/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
 glean/indexing/common/__init__.py,sha256=6COS3jP66xJ7VcNGI8I95tkF5zpqHy9QPVn82CB4m4I,513
@@ -8,21 +8,23 @@ glean/indexing/common/glean_client.py,sha256=tKRWK_C1Nja0gVy2FLnj9SmUbpIdOA3WKmp
 glean/indexing/common/metrics.py,sha256=SWCWCYnNOkN4cnwCxyWyEF8iHVwQ4HZqhewi2lqyS84,1771
 glean/indexing/common/mocks.py,sha256=-TbLzpZ7yUstQW58AICixiIQM2CV5_OPRXejjI_brhE,726
 glean/indexing/common/property_definition_builder.py,sha256=NZFhSqsSZlhI0Ia76sn0meYr82msBMCKMd78zMKLWAM,3724
-glean/indexing/connectors/__init__.py,sha256=
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
+glean/indexing/connectors/__init__.py,sha256=d9U2-elD7DewkuvY02UQJ1_khhdYVwyQCkADzg8jVjw,1147
+glean/indexing/connectors/base_async_streaming_data_client.py,sha256=JaKa1kfK1R1FKI7151g0zsbCutS7TmpZLabQi0LetA4,1419
+glean/indexing/connectors/base_async_streaming_datasource_connector.py,sha256=l6BuIbz_OGFxSZv5BsJ1uOFJLlwrf9BgJugtSXmuayE,8627
+glean/indexing/connectors/base_connector.py,sha256=m_zKbg-MMc1bjG5m2SsIarSeiPhFJKzfBQzgnlqTKF8,2640
+glean/indexing/connectors/base_data_client.py,sha256=0_QSdcjr1VK1COnpbzJFzowDVpODIRAPHgsjMNRh4As,908
+glean/indexing/connectors/base_datasource_connector.py,sha256=8_FQcQsc5gX9g_N6nw_8jj0ppccaBtGMjID2bBq9VcU,13271
+glean/indexing/connectors/base_people_connector.py,sha256=7aD_B8mVUWKinV4kfzWVw0y3RRIbKZ-AbONywQf2Gxc,7071
+glean/indexing/connectors/base_streaming_data_client.py,sha256=0p_OPLv7eKKCER3tuvsOuvzakiQhAG-ztyKUs9bSIl0,1131
+glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=96gehVYoxrzgHLr2U-EzO9kuKMdy_GsZ56QR2m3qls8,7872
 glean/indexing/observability/__init__.py,sha256=SuWJ7pHs5WFq5vL036B3RIsJSbjDsy6SI705u83874I,455
 glean/indexing/observability/observability.py,sha256=cHlo-tbrmGie6YeWXqEUap0YE6JRtFvOKTnxWD-7yac,9222
 glean/indexing/testing/__init__.py,sha256=h9mK0QjRZD5f470ePTeg635jZNwPBAd2S7g1DQO4LuE,448
 glean/indexing/testing/connector_test_harness.py,sha256=CMQZmn0cOIrj_GdIHb3OwRN9jTaZrn3pYkHHz50rqK8,1988
 glean/indexing/testing/mock_data_source.py,sha256=ICYbbHQZe9RVTzvrlwcxp_suxm9yXgjEAGiNCU-SkS4,1325
-glean/indexing/testing/mock_glean_client.py,sha256
+glean/indexing/testing/mock_glean_client.py,sha256=-0-ppfD1DmLbmtc5T_vFOfZB_ACx2RL6MAoVUqxl_Us,2529
 glean/indexing/testing/response_validator.py,sha256=jehEtXlW0AQcOVck-_VPoDFtQM_vkHJQ10SUN1ftr1Q,1800
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
+glean_indexing_sdk-0.3.0.dist-info/METADATA,sha256=lpXuoNquAdBGHTGhm1XNYzvAhYKPudKdCPBQ41q95v0,16225
+glean_indexing_sdk-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+glean_indexing_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
+glean_indexing_sdk-0.3.0.dist-info/RECORD,,
```

{glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/licenses/LICENSE
File without changes