glean-indexing-sdk 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
+"""Glean Indexing SDK.
+
+A Python SDK for building custom Glean indexing solutions. This package provides
+the base classes and utilities to create custom connectors for Glean's indexing APIs.
+"""
+
+from importlib.metadata import version, PackageNotFoundError
+from glean.indexing.connectors import (
+    BaseConnector,
+    BaseDatasourceConnector,
+    BaseStreamingDatasourceConnector,
+    BasePeopleConnector,
+    BaseConnectorDataClient,
+    StreamingConnectorDataClient,
+)
+from glean.indexing.common import BatchProcessor, ContentFormatter, ConnectorMetrics, api_client, MockGleanClient
+from glean.indexing.observability.observability import ConnectorObservability
+from glean.indexing.testing import ConnectorTestHarness
+from glean.indexing.models import (
+    DatasourceIdentityDefinitions,
+    IndexingMode,
+    TSourceData,
+    TIndexableEntityDefinition,
+)
+from glean.indexing import models
+
+__all__ = [
+    "BaseConnector",
+    "BaseDatasourceConnector",
+    "BasePeopleConnector",
+    "BaseStreamingDatasourceConnector",
+
+    "BaseConnectorDataClient",
+    "StreamingConnectorDataClient",
+
+    "BatchProcessor",
+    "ContentFormatter",
+    "ConnectorMetrics",
+    "ConnectorObservability",
+    "ConnectorTestHarness",
+
+    "DatasourceIdentityDefinitions",
+    "IndexingMode",
+    "TSourceData",
+    "TIndexableEntityDefinition",
+
+    "MockGleanClient",
+    "api_client",
+
+    "models",
+]
+
+try:
+    __version__ = version("glean-indexing-sdk")
+except PackageNotFoundError:
+    __version__ = "0.0.3"
@@ -0,0 +1,15 @@
+"""Common utilities and client implementations for Glean API integration."""
+
+from glean.indexing.common.glean_client import api_client
+from glean.indexing.common.mocks import MockGleanClient
+from glean.indexing.common.batch_processor import BatchProcessor
+from glean.indexing.common.content_formatter import ContentFormatter
+from glean.indexing.common.metrics import ConnectorMetrics
+
+__all__ = [
+    "api_client",
+    "MockGleanClient",
+    "BatchProcessor",
+    "ContentFormatter",
+    "ConnectorMetrics",
+]
@@ -0,0 +1,31 @@
+"""Batch processing utility for efficient data handling."""
+
+import logging
+from typing import Generic, Iterator, Sequence, TypeVar
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+
+
+class BatchProcessor(Generic[T]):
+    """A utility for processing data in batches."""
+
+    def __init__(self, data: Sequence[T], batch_size: int = 100):
+        """Initialize the BatchProcessor.
+
+        Args:
+            data: The data to process in batches.
+            batch_size: The size of each batch.
+        """
+        self.data = data
+        self.batch_size = batch_size
+
+    def __iter__(self) -> Iterator[Sequence[T]]:
+        """Iterate over the data in batches.
+
+        Yields:
+            Sequences of items of size batch_size (except possibly the last batch).
+        """
+        for i in range(0, len(self.data), self.batch_size):
+            yield self.data[i : i + self.batch_size]
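Illustration only (not part of the package files above): a minimal sketch of iterating a BatchProcessor; the sample data and batch size are arbitrary.

from glean.indexing.common import BatchProcessor

# Hypothetical sample data; any Sequence works.
items = list(range(250))

# Yields slices of up to 100 items: 100, 100, then 50.
for batch in BatchProcessor(items, batch_size=100):
    print(len(batch))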
@@ -0,0 +1,46 @@
+"""Content formatting utility using Jinja2."""
+
+import logging
+from typing import Any, Dict
+
+from jinja2 import Environment
+
+logger = logging.getLogger(__name__)
+
+
+class ContentFormatter:
+    """A utility for formatting content using Jinja2 templates."""
+
+    def __init__(self, template_str: str):
+        """Initialize the ContentFormatter.
+
+        Args:
+            template_str: A Jinja2 template string.
+        """
+        self.env = Environment(autoescape=True)
+        self.template = self.env.from_string(template_str)
+
+    def render(self, context: Dict[str, Any]) -> str:
+        """Render the template with the given context.
+
+        Args:
+            context: A dictionary containing the context for rendering.
+
+        Returns:
+            The rendered template as a string.
+        """
+        return self.template.render(**context)
+
+    @classmethod
+    def from_file(cls, template_path: str) -> "ContentFormatter":
+        """Create a ContentFormatter from a template file.
+
+        Args:
+            template_path: Path to a Jinja2 template file.
+
+        Returns:
+            A ContentFormatter instance.
+        """
+        with open(template_path, "r", encoding="utf-8") as f:
+            template_str = f.read()
+        return cls(template_str)
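Illustration only (not part of the package files above): a minimal ContentFormatter sketch; the template and context values are made up.

from glean.indexing.common import ContentFormatter

# Hypothetical template and context for illustration.
formatter = ContentFormatter("<h1>{{ title }}</h1><p>{{ body }}</p>")
html = formatter.render({"title": "Welcome", "body": "Hello & goodbye"})
# Because the Environment is created with autoescape=True, "&" renders as "&amp;".
print(html)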
@@ -0,0 +1,18 @@
+"""Simple Glean API client helper for connectors."""
+
+import os
+
+from glean.api_client import Glean
+
+
+def api_client() -> Glean:
+    """Get the Glean API client."""
+    instance = os.getenv("GLEAN_INSTANCE")
+    api_token = os.getenv("GLEAN_INDEXING_API_TOKEN")
+
+    if not api_token or not instance:
+        raise ValueError(
+            "GLEAN_INDEXING_API_TOKEN and GLEAN_INSTANCE environment variables are required"
+        )
+
+    return Glean(api_token=api_token, instance=instance)
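Illustration only (not part of the package files above): a sketch of calling api_client(). The environment variable names come from the helper itself; the values shown are placeholders.

import os

# Placeholder values; substitute your own Glean instance name and indexing token.
os.environ["GLEAN_INSTANCE"] = "my-company"
os.environ["GLEAN_INDEXING_API_TOKEN"] = "glean-indexing-token"

from glean.indexing.common import api_client

# api_client() raises ValueError if either variable is missing.
with api_client() as client:
    ...  # call client.indexing endpoints here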
@@ -0,0 +1,54 @@
+"""Performance metrics tracking utility for connectors."""
+
+import logging
+import time
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ConnectorMetrics:
+    """A context manager for tracking connector metrics."""
+
+    def __init__(self, name: str, logger: Optional[logging.Logger] = None):
+        """Initialize the ConnectorMetrics.
+
+        Args:
+            name: The name of the operation being timed.
+            logger: An optional logger to use for metrics. If None, the default logger is used.
+        """
+        self.name = name
+        self.logger = logger or logging.getLogger(__name__)
+        self.start_time = 0
+        self.end_time = 0
+        self.stats: Dict[str, Any] = {}
+
+    def __enter__(self) -> "ConnectorMetrics":
+        """Enter the context manager, starting the timer.
+
+        Returns:
+            The ConnectorMetrics instance.
+        """
+        self.start_time = time.time()
+        self.logger.info(f"Starting {self.name}")
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager, stopping the timer and logging metrics."""
+        self.end_time = time.time()
+        duration = self.end_time - self.start_time
+        self.stats["duration"] = duration
+        self.logger.info(f"Completed {self.name} in {duration:.2f} seconds")
+
+        if self.stats:
+            self.logger.info(f"Metrics for {self.name}: {self.stats}")
+
+    def record(self, metric: str, value: Any) -> None:
+        """Record a metric.
+
+        Args:
+            metric: The name of the metric.
+            value: The value of the metric.
+        """
+        self.stats[metric] = value
+        self.logger.debug(f"Recorded metric {metric}={value} for {self.name}")
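Illustration only (not part of the package files above): a sketch of timing an operation with ConnectorMetrics; the operation name and recorded metric are arbitrary.

from glean.indexing.common import ConnectorMetrics

# Hypothetical operation name; any string works.
with ConnectorMetrics("wiki_crawl") as metrics:
    pages = ["page-1", "page-2"]  # stand-in for real fetch work
    metrics.record("pages_fetched", len(pages))
# On exit, the duration and any recorded stats are logged at INFO level.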
@@ -0,0 +1,20 @@
+"""Mock implementations for testing."""
+
+import logging
+from typing import List
+
+from glean.api_client.models import DocumentDefinition, EmployeeInfoDefinition
+
+logger = logging.getLogger(__name__)
+
+
+class MockGleanClient:
+    """Mock Glean API client for examples and testing."""
+
+    def batch_index_documents(self, datasource: str, documents: List[DocumentDefinition]) -> None:
+        """Mock method for indexing documents."""
+        logger.info(f"Mock indexing {len(documents)} documents to datasource '{datasource}'")
+
+    def bulk_index_employees(self, employees: List[EmployeeInfoDefinition]) -> None:
+        """Mock method for indexing employees."""
+        logger.info(f"Mock indexing {len(employees)} employees")
@@ -0,0 +1,21 @@
+"""Connector implementations for Glean indexing."""
+
+from glean.indexing.connectors.base_connector import BaseConnector
+from glean.indexing.connectors.base_data_client import BaseDataClient, BaseConnectorDataClient
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.connectors.base_people_connector import BasePeopleConnector
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient, StreamingConnectorDataClient
+from glean.indexing.connectors.base_streaming_datasource_connector import BaseStreamingDatasourceConnector
+from glean.indexing.testing.connector_test_harness import ConnectorTestHarness
+
+__all__ = [
+    "BaseConnector",
+    "BaseDataClient",
+    "BaseConnectorDataClient",  # Backward compatibility alias
+    "BaseDatasourceConnector",
+    "BasePeopleConnector",
+    "BaseStreamingDataClient",
+    "StreamingConnectorDataClient",  # Backward compatibility alias
+    "BaseStreamingDatasourceConnector",
+    "ConnectorTestHarness",
+]
@@ -0,0 +1,60 @@
+"""Base connector class for the Glean Connector SDK."""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Generic, Optional, Sequence
+
+from glean.indexing.models import IndexingMode, TIndexableEntityDefinition, TSourceData
+
+logger = logging.getLogger(__name__)
+
+
+class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
+    """
+    Abstract base class for all Glean connectors.
+
+    This class defines the core interface and lifecycle for all connector types (datasource, people, streaming, etc.).
+    Connector implementors should inherit from this class and provide concrete implementations for all abstract methods.
+
+    Type Parameters:
+        TSourceData: The type of raw data fetched from the external source (e.g., dict, TypedDict, or custom model).
+        TIndexableEntityDefinition: The type of Glean API entity definition produced by the connector (e.g., DocumentDefinition, EmployeeInfoDefinition).
+
+    Required Methods for Subclasses:
+        - get_data(since: Optional[str] = None) -> Sequence[TSourceData]:
+            Fetches source data from the external system. Should support incremental fetches if possible.
+        - transform(data: Sequence[TSourceData]) -> List[TIndexableEntityDefinition]:
+            Transforms source data into Glean API entity definitions ready for indexing.
+        - index_data(mode: IndexingMode = IndexingMode.FULL) -> None:
+            Orchestrates the full indexing process (fetch, transform, upload).
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+
+    Example:
+        class MyConnector(BaseConnector[MyRawType, DocumentDefinition]):
+            ...
+    """
+
+    def __init__(self, name: str):
+        """Initialize the connector.
+
+        Args:
+            name: The name of the connector.
+        """
+        self.name = name
+
+    @abstractmethod
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """Get data from the data client or source system."""
+        pass
+
+    @abstractmethod
+    def transform(self, data: Sequence[TSourceData]) -> Sequence[TIndexableEntityDefinition]:
+        """Transform source data to Glean entity definitions."""
+        pass
+
+    @abstractmethod
+    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+        """Index data from the connector to Glean."""
+        pass
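Illustration only (not part of the package files above): a minimal BaseConnector subclass sketch. MyRawPage, the sample data, and the DocumentDefinition field names are assumptions made for this example.

from typing import List, Optional, Sequence, TypedDict

from glean.api_client.models import DocumentDefinition
from glean.indexing import BaseConnector, IndexingMode


class MyRawPage(TypedDict):
    id: str
    title: str


class MyConnector(BaseConnector[MyRawPage, DocumentDefinition]):
    def get_data(self, since: Optional[str] = None) -> Sequence[MyRawPage]:
        # Stand-in for a real fetch from the source system.
        return [{"id": "1", "title": "Hello"}]

    def transform(self, data: Sequence[MyRawPage]) -> List[DocumentDefinition]:
        # DocumentDefinition field names here are assumptions for illustration.
        return [DocumentDefinition(datasource="mywiki", id=p["id"], title=p["title"]) for p in data]

    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
        documents = self.transform(self.get_data())
        ...  # upload documents via the Glean indexing API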
@@ -0,0 +1,35 @@
+"""Base data client interface for standard Glean connectors."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Generic, Sequence
+
+from glean.indexing.models import TSourceData
+
+
+class BaseDataClient(ABC, Generic[TSourceData]):
+    """
+    Base class for all connector data clients.
+
+    This interface defines how connectors fetch data from external sources.
+    All data clients should inherit from this class and implement get_source_data.
+
+    Type Parameters:
+        TSourceData: The type of data returned from the external source
+    """
+
+    @abstractmethod
+    def get_source_data(self, **kwargs: Any) -> Sequence[TSourceData]:
+        """
+        Fetch all data from the external source.
+
+        Args:
+            **kwargs: Additional parameters for data fetching (e.g., since timestamp)
+
+        Returns:
+            A sequence of data items from the source
+        """
+        pass
+
+
+# Alias for backward compatibility during transition
+BaseConnectorDataClient = BaseDataClient
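Illustration only (not part of the package files above): a sketch of a data client that serves in-memory dicts; the record shape and the "updated_at" key are assumptions.

from typing import Any, Dict, List, Sequence

from glean.indexing import BaseConnectorDataClient


class InMemoryDataClient(BaseConnectorDataClient[Dict[str, Any]]):
    def __init__(self, records: List[Dict[str, Any]]):
        self.records = records

    def get_source_data(self, **kwargs: Any) -> Sequence[Dict[str, Any]]:
        # The connector passes "since" through for incremental crawls.
        since = kwargs.get("since")
        return [r for r in self.records if since is None or r.get("updated_at", "") > since]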
@@ -0,0 +1,314 @@
+"""Base datasource connector for the Glean Connector SDK."""
+
+import logging
+import uuid
+from abc import ABC
+from typing import Optional, Sequence
+
+from glean.api_client.models import DocumentDefinition
+from glean.indexing.common import BatchProcessor, api_client
+from glean.indexing.connectors.base_connector import BaseConnector
+from glean.indexing.connectors.base_data_client import BaseConnectorDataClient
+from glean.indexing.models import (
+    CustomDatasourceConfig,
+    DatasourceIdentityDefinitions,
+    IndexingMode,
+    TSourceData,
+)
+from glean.indexing.observability.observability import ConnectorObservability
+
+logger = logging.getLogger(__name__)
+
+
+class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
+    """
+    Base class for all Glean datasource connectors.
+
+    This class provides the core logic for indexing document/content data from external systems into Glean.
+    Subclasses must define a `configuration` attribute of type `CustomDatasourceConfig` describing the datasource.
+
+    To implement a custom connector, inherit from this class and implement:
+        - configuration: CustomDatasourceConfig (class or instance attribute)
+        - get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]
+        - transform(self, data: Sequence[TSourceData]) -> List[DocumentDefinition]
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+        configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
+        batch_size (int): The batch size for uploads (default: 1000).
+        data_client (BaseConnectorDataClient): The data client for fetching source data.
+        observability (ConnectorObservability): Observability and metrics for this connector.
+
+    Example:
+        class MyWikiConnector(BaseDatasourceConnector[WikiPageData]):
+            configuration = CustomDatasourceConfig(...)
+            ...
+    """
+
+    configuration: CustomDatasourceConfig
+
+    def __init__(self, name: str, data_client: BaseConnectorDataClient[TSourceData]):
+        """
+        Initialize the datasource connector.
+
+        Args:
+            name: The name of the connector
+            data_client: The data client for fetching source data
+        """
+        super().__init__(name)
+        self.data_client = data_client
+        self._observability = ConnectorObservability(name)
+        self.batch_size = 1000
+
+    @property
+    def display_name(self) -> str:
+        """Get the display name for this datasource."""
+        return self.name.replace("_", " ").title()
+
+    @property
+    def observability(self) -> ConnectorObservability:
+        """The observability instance for this connector."""
+        return self._observability
+
+    def get_identities(self) -> DatasourceIdentityDefinitions:
+        """
+        Gets all identities for this datasource (users, groups & memberships).
+
+        Returns:
+            A DatasourceIdentityDefinitions object containing all identities for this datasource.
+        """
+        return DatasourceIdentityDefinitions(users=[])
+
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """Get data from the datasource via the data client.
+
+        Args:
+            since: If provided, only get data modified since this timestamp.
+
+        Returns:
+            A sequence of source data items from the external system.
+        """
+        return self.data_client.get_source_data(since=since)
+
+    def configure_datasource(self, is_test: bool = False) -> None:
+        """
+        Configure the datasource in Glean using the datasources.add() API.
+
+        Args:
+            is_test: Whether this is a test datasource
+        """
+        config = self.configuration
+
+        if not config.name:
+            raise ValueError("Missing required field: name in Configuration")
+
+        if not config.display_name:
+            raise ValueError("Missing required field: display_name in Configuration")
+
+        logger.info(f"Configuring datasource: {config.name}")
+
+        if is_test:
+            config.is_test_datasource = True
+
+        with api_client() as client:
+            client.indexing.datasources.add(**config.dict(exclude_unset=True))
+            logger.info(f"Successfully configured datasource: {config.name}")
+
+    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+        """
+        Index data from the datasource to Glean with identity crawl followed by content crawl.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+        """
+        self._observability.start_execution()
+
+        try:
+            logger.info(f"Starting {mode.name.lower()} indexing for datasource '{self.name}'")
+
+            logger.info("Starting identity crawl")
+            identities = self.get_identities()
+
+            users = identities.get("users")
+            if users:
+                logger.info(f"Indexing {len(users)} users")
+                self._batch_index_users(users)
+
+            groups = identities.get("groups")
+            if groups:
+                logger.info(f"Indexing {len(groups)} groups")
+                self._batch_index_groups(groups)
+
+                memberships = identities.get("memberships")
+                if not memberships:
+                    raise ValueError("Groups were provided, but no memberships were provided.")
+
+                logger.info(f"Indexing {len(memberships)} memberships")
+                self._batch_index_memberships(memberships)
+
+            since = None
+            if mode == IndexingMode.INCREMENTAL:
+                since = self._get_last_crawl_timestamp()
+                logger.info(f"Incremental crawl since: {since}")
+
+            logger.info("Starting content crawl")
+            self._observability.start_timer("data_fetch")
+            data = self.get_data(since=since)
+            self._observability.end_timer("data_fetch")
+
+            logger.info(f"Retrieved {len(data)} items from datasource")
+            self._observability.record_metric("items_fetched", len(data))
+
+            self._observability.start_timer("data_transform")
+            documents = self.transform(data)
+            self._observability.end_timer("data_transform")
+
+            logger.info(f"Transformed {len(documents)} documents")
+            self._observability.record_metric("documents_transformed", len(documents))
+
+            self._observability.start_timer("data_upload")
+            if documents:
+                logger.info(f"Indexing {len(documents)} documents")
+                self._batch_index_documents(documents)
+            self._observability.end_timer("data_upload")
+
+            logger.info(f"Successfully indexed {len(documents)} documents to Glean")
+            self._observability.record_metric("documents_indexed", len(documents))
+
+        except Exception as e:
+            logger.exception(f"Error during indexing: {e}")
+            self._observability.increment_counter("indexing_errors")
+            raise
+        finally:
+            self._observability.end_execution()
+
+    def _batch_index_users(self, users) -> None:
+        """Index users in batches with proper page signaling."""
+        if not users:
+            return
+
+        batches = list(BatchProcessor(users, batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(users)} users in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.permissions.bulk_index_users(
+                        datasource=self.name,
+                        users=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"User batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload user batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _batch_index_groups(self, groups) -> None:
+        """Index groups in batches with proper page signaling."""
+        if not groups:
+            return
+
+        batches = list(BatchProcessor(groups, batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(groups)} groups in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.permissions.bulk_index_groups(
+                        datasource=self.name,
+                        groups=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"Group batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload group batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _batch_index_memberships(self, memberships) -> None:
+        """Index memberships in batches with proper page signaling."""
+        if not memberships:
+            return
+
+        batches = list(BatchProcessor(memberships, batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(memberships)} memberships in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.permissions.bulk_index_memberships(
+                        datasource=self.name,
+                        memberships=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"Membership batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload membership batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
+        """Index documents in batches with proper page signaling."""
+        if not documents:
+            return
+
+        batches = list(BatchProcessor(list(documents), batch_size=self.batch_size))
+        total_batches = len(batches)
+
+        logger.info(f"Uploading {len(documents)} documents in {total_batches} batches")
+
+        upload_id = str(uuid.uuid4())
+        for i, batch in enumerate(batches):
+            try:
+                with api_client() as client:
+                    client.indexing.documents.bulk_index(
+                        datasource=self.name,
+                        documents=list(batch),
+                        upload_id=upload_id,
+                        is_first_page=(i == 0),
+                        is_last_page=(i == total_batches - 1),
+                    )
+
+                logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
+                self._observability.increment_counter("batches_uploaded")
+
+            except Exception as e:
+                logger.error(f"Failed to upload document batch {i + 1}/{total_batches}: {e}")
+                self._observability.increment_counter("batch_upload_errors")
+                raise
+
+    def _get_last_crawl_timestamp(self) -> Optional[str]:
+        """
+        Get the timestamp of the last successful crawl for incremental indexing.
+
+        Subclasses should override this to implement proper timestamp tracking.
+
+        Returns:
+            ISO timestamp string or None for full crawl
+        """
+        return None
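Illustration only (not part of the package files above): a sketch of wiring a data client into BaseDatasourceConnector. The datasource name, the DocumentDefinition field names, and the reuse of the InMemoryDataClient sketched after the data client interface are assumptions.

from typing import Any, Dict, List, Sequence

from glean.api_client.models import DocumentDefinition
from glean.indexing import BaseDatasourceConnector, IndexingMode
from glean.indexing.models import CustomDatasourceConfig


class WikiConnector(BaseDatasourceConnector[Dict[str, Any]]):
    # name and display_name are the fields validated by configure_datasource(); other fields omitted here.
    configuration = CustomDatasourceConfig(name="mywiki", display_name="My Wiki")

    def transform(self, data: Sequence[Dict[str, Any]]) -> List[DocumentDefinition]:
        # DocumentDefinition field names are assumptions for illustration.
        return [
            DocumentDefinition(datasource="mywiki", id=page["id"], title=page["title"])
            for page in data
        ]


# data_client = InMemoryDataClient([{"id": "1", "title": "Home", "updated_at": "2024-01-01"}])
# connector = WikiConnector("mywiki", data_client)
# connector.configure_datasource(is_test=True)  # registers the datasource in Glean
# connector.index_data(IndexingMode.FULL)       # identity crawl, then content crawl and batched upload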