glean-indexing-sdk 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.cz.toml +1 -1
- glean_indexing_sdk-0.2.0/CHANGELOG.md +10 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/PKG-INFO +13 -1
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/README.md +12 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/pyproject.toml +1 -1
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/__init__.py +1 -1
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_connector.py +9 -2
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_datasource_connector.py +29 -11
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_people_connector.py +28 -10
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_datasource_connector.py +20 -8
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_datasource_connector.py +69 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_streaming_datasource_connector.py +54 -2
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/uv.lock +645 -645
- glean_indexing_sdk-0.1.0/CHANGELOG.md +0 -5
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.env.template +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/CODEOWNERS +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/workflows/ci.yml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/workflows/publish.yml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.gitignore +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.markdown-coderc.json +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.python-version +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.ruff.toml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.vscode/settings.json +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/CONTRIBUTING.md +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/LICENSE +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/RELEASE.md +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/env.template +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/mise.toml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/complete.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/run_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_page_data.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/run_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/batch_processor.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/content_formatter.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/glean_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/metrics.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/mocks.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/property_definition_builder.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/models.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/observability.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/py.typed +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/connector_test_harness.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_data_source.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_glean_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/response_validator.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/taskfile.yml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/integration_tests/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/mock_clients.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_batch_processor.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_content_formatter.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_metrics.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_property_definition_builder.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_people_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_custom_connector_integration.py +0 -0
**PKG-INFO**

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glean-indexing-sdk
-Version: 0.1.0
+Version: 0.2.0
 Summary: SDK for building custom Glean indexing integrations
 Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
 Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
````
**README.md**

````diff
@@ -202,6 +202,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
````
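The documented behavior boils down to one new argument at the call site. Below is a minimal usage sketch; `WikiConnector` and `WikiDataClient` are hypothetical stand-ins modeled on the repository's `snippets/non_streaming/` examples, and the `IndexingMode` import path is an assumption based on `src/glean/indexing/models.py`:

```python
# Hedged usage sketch: WikiConnector / WikiDataClient are hypothetical
# user-defined classes; the import path below is an assumption.
from glean.indexing.models import IndexingMode

connector = WikiConnector(name="wiki", data_client=WikiDataClient())
connector.configure_datasource()

# Normal crawl, as in the existing README example.
connector.index_data(mode=IndexingMode.FULL)

# Recovering from a failed or interrupted upload: per the docs above, this
# generates a new upload_id and sends forceRestartUpload=True on the first
# batch only, discarding any partial upload state.
connector.index_data(mode=IndexingMode.FULL, force_restart=True)
```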
**pyproject.toml**

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "glean-indexing-sdk"
-version = "0.1.0"
+version = "0.2.0"
 description = "SDK for building custom Glean indexing integrations"
 authors = [{ name = "Steve Calvert", email = "steve.calvert@glean.com" }]
 readme = "README.md"
```
**src/glean/indexing/connectors/base_connector.py**

```diff
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
         pass
 
     @abstractmethod
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
-        """Index data from the connector to Glean."""
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """Index data from the connector to Glean.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+        """
         pass
```
**src/glean/indexing/connectors/base_datasource_connector.py**

```diff
@@ -114,12 +114,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
         client.indexing.datasources.add(**config.dict(exclude_unset=True))
         logger.info(f"Successfully configured datasource: {config.name}")
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean with identity crawl followed by content crawl.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -169,7 +173,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
         self._observability.start_timer("data_upload")
         if documents:
             logger.info(f"Indexing {len(documents)} documents")
-            self._batch_index_documents(documents)
+            self._batch_index_documents(documents, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +276,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
             self._observability.increment_counter("batch_upload_errors")
             raise
 
-    def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
-        """Index documents in batches with proper page signaling."""
+    def _batch_index_documents(
+        self, documents: Sequence[DocumentDefinition], force_restart: bool = False
+    ) -> None:
+        """Index documents in batches with proper page signaling.
+
+        Args:
+            documents: The documents to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not documents:
             return
 
@@ -285,14 +296,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "datasource": self.name,
+                    "documents": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.documents.bulk_index(
-                        datasource=self.name,
-                        documents=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```
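The essential pattern in `_batch_index_documents` is that the restart signal rides only on the first page of an otherwise unchanged paginated upload. A distilled, self-contained sketch of that kwargs-building logic (illustrative only, not the SDK's internal API; `build_batch_kwargs` is a name introduced here):

```python
import uuid
from typing import Any, Dict, List, Sequence


def build_batch_kwargs(
    batches: Sequence[Sequence[dict]], datasource: str, force_restart: bool
) -> List[Dict[str, Any]]:
    """Illustrative restatement of the batching logic in the diff above."""
    # A fresh upload_id cleanly separates this upload from any earlier attempt.
    upload_id = str(uuid.uuid4())
    calls: List[Dict[str, Any]] = []
    for i, batch in enumerate(batches):
        kwargs: Dict[str, Any] = {
            "datasource": datasource,
            "documents": list(batch),
            "upload_id": upload_id,
            "is_first_page": i == 0,
            "is_last_page": i == len(batches) - 1,
        }
        if force_restart and i == 0:
            # The restart flag is attached to the first page only; later
            # pages continue the same upload_id exactly as a normal upload.
            kwargs["forceRestartUpload"] = True
        calls.append(kwargs)
    return calls
```

Building the kwargs dict before the API call keeps the conditional flag out of a fixed argument list, which is why the diff replaces the keyword-argument call with `bulk_index(**bulk_index_kwargs)`.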
**src/glean/indexing/connectors/base_people_connector.py**

```diff
@@ -58,11 +58,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         """The observability instance for this connector."""
         return self._observability
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """Index people data to Glean.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -89,7 +93,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         self._observability.record_metric("employees_transformed", len(employees))
 
         self._observability.start_timer("data_upload")
-        self._batch_index_employees(employees)
+        self._batch_index_employees(employees, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +117,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         """
         return self.data_client.get_source_data(since=since)
 
-    def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
-        """Index employees to Glean in batches."""
+    def _batch_index_employees(
+        self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
+    ) -> None:
+        """Index employees to Glean in batches.
+
+        Args:
+            employees: The employees to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not employees:
             return
 
@@ -126,13 +137,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "employees": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.people.bulk_index(
-                        employees=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.people.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```
**src/glean/indexing/connectors/base_streaming_datasource_connector.py**

```diff
@@ -47,6 +47,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
         super().__init__(name, data_client)  # type: ignore[arg-type]
         self.batch_size = 1000
         self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
 
     def generate_upload_id(self) -> str:
         """Generate a unique upload ID for batch tracking."""
@@ -67,12 +68,16 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
         yield from self.data_client.get_source_data(since=since)
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean using streaming.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
 
@@ -81,6 +86,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
             since = "2023-01-01T00:00:00Z"
 
         upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
         data_iterator = self.get_data(since=since)
         is_first_batch = True
         batch: List[TSourceData] = []
@@ -150,14 +156,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
             transformed_batch = self.transform(batch)
             logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
 
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
             with api_client() as client:
-                client.indexing.documents.bulk_index(
-                    datasource=self.name,
-                    documents=list(transformed_batch),
-                    upload_id=upload_id,
-                    is_first_page=is_first_batch,
-                    is_last_page=is_last_batch,
-                )
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
             logger.info(f"Batch {batch_number} indexed successfully")
 
```
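Unlike the list-based connectors, the streaming connector cannot know the total batch count up front, so `index_data` stashes the flag on `self._force_restart` and the flush path consumes it when the first accumulated batch goes out. A self-contained sketch of that shape, using one-batch lookahead to decide `is_last_page` (illustrative; `stream_batch_kwargs` is a name introduced here, while the SDK's actual loop accumulates into `batch` and tracks `is_first_batch` as shown above):

```python
from itertools import islice
from typing import Any, Dict, Iterable, Iterator


def stream_batch_kwargs(
    items: Iterable[dict], batch_size: int, force_restart: bool
) -> Iterator[Dict[str, Any]]:
    """Illustrative stand-in for the streaming flush logic (not SDK API)."""
    it = iter(items)
    batch = list(islice(it, batch_size))
    is_first = True
    while batch:
        # One-batch lookahead: the stream is exhausted iff the next slice is
        # empty, which lets is_last_page be set without a total count.
        next_batch = list(islice(it, batch_size))
        kwargs: Dict[str, Any] = {
            "documents": batch,
            "is_first_page": is_first,
            "is_last_page": not next_batch,
        }
        if force_restart and is_first:
            # Consumed exactly once, on the first flushed batch.
            kwargs["forceRestartUpload"] = True
        yield kwargs
        is_first = False
        batch = next_batch
```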
**tests/unit_tests/test_base_datasource_connector.py**

```diff
@@ -128,3 +128,72 @@ class TestBaseDatasourceConnector:
 
         timestamp = connector._get_last_crawl_timestamp()
         assert timestamp is None
+
+    @patch("glean.indexing.connectors.base_datasource_connector.api_client")
+    def test_force_restart_upload(self, mock_api_client):
+        """Test that force_restart parameter sets forceRestartUpload on first batch."""
+        mock_client = Mock()
+        mock_api_client.return_value.__enter__.return_value = mock_client
+
+        test_data = [
+            {
+                "id": "1",
+                "title": "Test Doc 1",
+                "content": "Content 1",
+                "url": "https://test.example.com/1",
+            },
+            {
+                "id": "2",
+                "title": "Test Doc 2",
+                "content": "Content 2",
+                "url": "https://test.example.com/2",
+            },
+        ]
+        data_client = MockDataClient(test_data)
+        connector = TestDatasourceConnector(name="test_connector", data_client=data_client)
+        connector.batch_size = 1
+
+        connector.index_data(force_restart=True)
+
+        # Should be called twice (one batch per document)
+        assert mock_client.indexing.documents.bulk_index.call_count == 2
+
+        # First call should have forceRestartUpload=True
+        first_call_kwargs = mock_client.indexing.documents.bulk_index.call_args_list[0][1]
+        assert first_call_kwargs["forceRestartUpload"] is True
+        assert first_call_kwargs["is_first_page"] is True
+        assert first_call_kwargs["is_last_page"] is False
+
+        # Second call should NOT have forceRestartUpload
+        second_call_kwargs = mock_client.indexing.documents.bulk_index.call_args_list[1][1]
+        assert "forceRestartUpload" not in second_call_kwargs
+        assert second_call_kwargs["is_first_page"] is False
+        assert second_call_kwargs["is_last_page"] is True
+
+    @patch("glean.indexing.connectors.base_datasource_connector.api_client")
+    def test_normal_upload_no_force_restart(self, mock_api_client):
+        """Test that normal upload does not include forceRestartUpload parameter."""
+        mock_client = Mock()
+        mock_api_client.return_value.__enter__.return_value = mock_client
+
+        test_data = [
+            {
+                "id": "1",
+                "title": "Test Doc",
+                "content": "Content",
+                "url": "https://test.example.com/1",
+            }
+        ]
+        data_client = MockDataClient(test_data)
+        connector = TestDatasourceConnector(name="test_connector", data_client=data_client)
+
+        connector.index_data(force_restart=False)
+
+        # Should be called once
+        assert mock_client.indexing.documents.bulk_index.call_count == 1
+
+        # Should NOT have forceRestartUpload parameter
+        call_kwargs = mock_client.indexing.documents.bulk_index.call_args[1]
+        assert "forceRestartUpload" not in call_kwargs
+        assert call_kwargs["is_first_page"] is True
+        assert call_kwargs["is_last_page"] is True
```
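A note on the mock plumbing these tests rely on: a `Mock` records each invocation, and `call_args_list[n][1]` (or `call_args[1]` for the most recent call) is the kwargs dict of that invocation, which is what makes the `"forceRestartUpload" not in ...` assertions work. A tiny standalone demonstration:

```python
from unittest.mock import Mock

m = Mock()
m("positional", is_first_page=True, forceRestartUpload=True)
m("positional", is_first_page=False)

# Index [0] of a recorded call is the positional-args tuple, [1] the kwargs dict.
assert m.call_args_list[0][1]["forceRestartUpload"] is True
assert "forceRestartUpload" not in m.call_args_list[1][1]
```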
**tests/unit_tests/test_base_streaming_datasource_connector.py**

```diff
@@ -82,8 +82,7 @@ def test_index_data_batches_and_uploads():
 def test_index_data_empty():
     class EmptyClient(BaseStreamingDataClient[dict]):
         def get_source_data(self, **kwargs):
-            return
-            yield
+            yield from []
 
     connector = DummyStreamingConnector("test_stream", EmptyClient())
     with patch(
@@ -104,3 +103,56 @@ def test_index_data_error_handling():
         bulk_index.side_effect = Exception("upload failed")
         with pytest.raises(Exception):
             connector.index_data()
+
+
+def test_force_restart_upload():
+    """Test that force_restart parameter sets forceRestartUpload on first batch."""
+    client = DummyStreamingDataClient()
+    connector = DummyStreamingConnector("test_stream", client)
+    connector.batch_size = 2
+
+    with patch(
+        "glean.indexing.connectors.base_streaming_datasource_connector.api_client"
+    ) as api_client:
+        bulk_index = api_client().__enter__().indexing.documents.bulk_index
+        connector.index_data(force_restart=True)
+
+        assert bulk_index.call_count == 3
+
+        # First call should have forceRestartUpload=True
+        first_call_kwargs = bulk_index.call_args_list[0][1]
+        assert first_call_kwargs["forceRestartUpload"] is True
+        assert first_call_kwargs["is_first_page"] is True
+        assert first_call_kwargs["is_last_page"] is False
+
+        # Subsequent calls should NOT have forceRestartUpload
+        second_call_kwargs = bulk_index.call_args_list[1][1]
+        assert "forceRestartUpload" not in second_call_kwargs
+        assert second_call_kwargs["is_first_page"] is False
+        assert second_call_kwargs["is_last_page"] is False
+
+        third_call_kwargs = bulk_index.call_args_list[2][1]
+        assert "forceRestartUpload" not in third_call_kwargs
+        assert third_call_kwargs["is_first_page"] is False
+        assert third_call_kwargs["is_last_page"] is True
+
+
+def test_normal_upload_no_force_restart():
+    """Test that normal upload does not include forceRestartUpload parameter."""
+    client = DummyStreamingDataClient()
+    connector = DummyStreamingConnector("test_stream", client)
+    connector.batch_size = 5
+
+    with patch(
+        "glean.indexing.connectors.base_streaming_datasource_connector.api_client"
+    ) as api_client:
+        bulk_index = api_client().__enter__().indexing.documents.bulk_index
+        connector.index_data(force_restart=False)
+
+        assert bulk_index.call_count == 1
+
+        # Should NOT have forceRestartUpload parameter
+        call_kwargs = bulk_index.call_args[1]
+        assert "forceRestartUpload" not in call_kwargs
+        assert call_kwargs["is_first_page"] is True
+        assert call_kwargs["is_last_page"] is True
```