glean-indexing-sdk 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the published contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,56 +1,56 @@
  """Glean Indexing SDK.

- A Python SDK for building custom Glean indexing solutions. This package provides
+ A Python SDK for building custom Glean indexing solutions. This package provides
  the base classes and utilities to create custom connectors for Glean's indexing APIs.
  """

- from importlib.metadata import version, PackageNotFoundError
+ from importlib.metadata import PackageNotFoundError, version
+
+ from glean.indexing import models
+ from glean.indexing.common import BatchProcessor, ConnectorMetrics, ContentFormatter, MockGleanClient, api_client
  from glean.indexing.connectors import (
+ BaseAsyncStreamingDataClient,
+ BaseAsyncStreamingDatasourceConnector,
  BaseConnector,
+ BaseDataClient,
  BaseDatasourceConnector,
- BaseStreamingDatasourceConnector,
  BasePeopleConnector,
- BaseConnectorDataClient,
- StreamingConnectorDataClient,
+ BaseStreamingDataClient,
+ BaseStreamingDatasourceConnector,
  )
- from glean.indexing.common import BatchProcessor, ContentFormatter, ConnectorMetrics, api_client, MockGleanClient
- from glean.indexing.observability.observability import ConnectorObservability
- from glean.indexing.testing import ConnectorTestHarness
  from glean.indexing.models import (
  DatasourceIdentityDefinitions,
  IndexingMode,
- TSourceData,
  TIndexableEntityDefinition,
+ TSourceData,
  )
- from glean.indexing import models
+ from glean.indexing.observability.observability import ConnectorObservability
+ from glean.indexing.testing import ConnectorTestHarness

  __all__ = [
  "BaseConnector",
+ "BaseDataClient",
  "BaseDatasourceConnector",
  "BasePeopleConnector",
+ "BaseStreamingDataClient",
  "BaseStreamingDatasourceConnector",
-
- "BaseConnectorDataClient",
- "StreamingConnectorDataClient",
-
+ "BaseAsyncStreamingDataClient",
+ "BaseAsyncStreamingDatasourceConnector",
  "BatchProcessor",
  "ContentFormatter",
  "ConnectorMetrics",
  "ConnectorObservability",
  "ConnectorTestHarness",
-
  "DatasourceIdentityDefinitions",
  "IndexingMode",
  "TSourceData",
  "TIndexableEntityDefinition",
-
  "MockGleanClient",
  "api_client",
-
  "models",
  ]

  try:
  __version__ = version("glean-indexing-sdk")
  except PackageNotFoundError:
- __version__ = "0.1.0"
+ __version__ = "0.3.0"
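Note: the re-exported names above drop the 0.1.0 backward-compatibility aliases, so downstream imports need a one-line change. A minimal migration sketch, using only the names shown in the diff:

```python
# 0.1.0-era imports (aliases removed in 0.3.0):
# from glean.indexing import BaseConnectorDataClient, StreamingConnectorDataClient

# 0.3.0 equivalents, exported from the package root:
from glean.indexing import BaseDataClient, BaseStreamingDataClient
```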
@@ -1,21 +1,23 @@
  """Connector implementations for Glean indexing."""

  from glean.indexing.connectors.base_connector import BaseConnector
- from glean.indexing.connectors.base_data_client import BaseDataClient, BaseConnectorDataClient
+ from glean.indexing.connectors.base_data_client import BaseDataClient
  from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
  from glean.indexing.connectors.base_people_connector import BasePeopleConnector
- from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient, StreamingConnectorDataClient
+ from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
  from glean.indexing.connectors.base_streaming_datasource_connector import BaseStreamingDatasourceConnector
+ from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+ from glean.indexing.connectors.base_async_streaming_datasource_connector import BaseAsyncStreamingDatasourceConnector
  from glean.indexing.testing.connector_test_harness import ConnectorTestHarness

  __all__ = [
  "BaseConnector",
  "BaseDataClient",
- "BaseConnectorDataClient", # Backward compatibility alias
  "BaseDatasourceConnector",
  "BasePeopleConnector",
- "BaseStreamingDataClient",
- "StreamingConnectorDataClient", # Backward compatibility alias
+ "BaseStreamingDataClient",
  "BaseStreamingDatasourceConnector",
+ "BaseAsyncStreamingDataClient",
+ "BaseAsyncStreamingDatasourceConnector",
  "ConnectorTestHarness",
  ]
@@ -0,0 +1,42 @@
+ """Base async streaming data client for fetching data in chunks."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, AsyncGenerator, Generic
+
+ from glean.indexing.models import TSourceData
+
+
+ class BaseAsyncStreamingDataClient(ABC, Generic[TSourceData]):
+ """
+ Base class for async streaming data clients that fetch data in chunks.
+
+ Use this for large datasets with async APIs to minimize memory usage
+ and maximize I/O throughput.
+
+ Type Parameters:
+ TSourceData: The type of data yielded from the external source
+
+ Example:
+ class MyAsyncDataClient(BaseAsyncStreamingDataClient[MyDocData]):
+ async def get_source_data(self, **kwargs) -> AsyncGenerator[MyDocData, None]:
+ async for page in self.fetch_pages():
+ for item in page:
+ yield item
+ """
+
+ @abstractmethod
+ async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[TSourceData, None]:
+ """
+ Retrieves source data as an async generator.
+
+ This method should be implemented to return an async generator
+ that yields data items one at a time or in small batches.
+
+ Args:
+ **kwargs: Additional keyword arguments for customizing data retrieval.
+
+ Yields:
+ Individual data items from the external source.
+ """
+ if False:
+ yield # type: ignore[misc]
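For orientation, a minimal sketch of a concrete subclass of the new client. The `ArticleData` record, the example API URL, and the use of `httpx` are illustrative assumptions, not part of the SDK; only the `get_source_data` signature comes from the class above.

```python
from dataclasses import dataclass
from typing import Any, AsyncGenerator

import httpx  # assumed async HTTP library; any async client works

from glean.indexing.connectors import BaseAsyncStreamingDataClient


@dataclass
class ArticleData:
    """Hypothetical source record."""
    id: str
    title: str
    body: str


class ArticleAsyncDataClient(BaseAsyncStreamingDataClient[ArticleData]):
    """Yields articles page by page from a hypothetical paginated JSON API."""

    def __init__(self, base_url: str):
        self.base_url = base_url

    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[ArticleData, None]:
        page = 1
        async with httpx.AsyncClient() as http:
            while True:
                resp = await http.get(f"{self.base_url}/articles", params={"page": page})
                resp.raise_for_status()
                items = resp.json()
                if not items:
                    break
                for item in items:
                    # Yield one record at a time so the connector can batch them.
                    yield ArticleData(id=item["id"], title=item["title"], body=item["body"])
                page += 1
```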
@@ -0,0 +1,233 @@
+ """Base async streaming datasource connector for memory-efficient processing of large datasets."""
+
+ import asyncio
+ import logging
+ import uuid
+ from abc import ABC
+ from typing import AsyncGenerator, List, Optional, Sequence
+
+ from glean.indexing.common import api_client
+ from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+ from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+ from glean.indexing.models import IndexingMode, TSourceData
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseAsyncStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
+ """
+ Base class for async streaming datasource connectors.
+
+ This class provides async-native streaming for memory-efficient processing
+ of large datasets. Use this when your data source provides async APIs
+ (e.g., aiohttp, httpx async, etc.).
+
+ To implement a custom async streaming connector, inherit from this class and implement:
+ - configuration: CustomDatasourceConfig (class or instance attribute)
+ - async_data_client: BaseAsyncStreamingDataClient (set in __init__)
+ - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+ Attributes:
+ name (str): The unique name of the connector (should be snake_case).
+ configuration (CustomDatasourceConfig): The datasource configuration.
+ batch_size (int): The batch size for uploads (default: 1000).
+ async_data_client (BaseAsyncStreamingDataClient): The async streaming data client.
+
+ Example:
+ class MyAsyncConnector(BaseAsyncStreamingDatasourceConnector[MyDocData]):
+ configuration = CustomDatasourceConfig(...)
+
+ def __init__(self, name: str):
+ async_client = MyAsyncDataClient()
+ super().__init__(name, async_client)
+
+ def transform(self, data: Sequence[MyDocData]) -> Sequence[DocumentDefinition]:
+ return [self._transform_doc(d) for d in data]
+ """
+
+ def __init__(
+ self,
+ name: str,
+ async_data_client: BaseAsyncStreamingDataClient[TSourceData],
+ ):
+ super().__init__(name, None) # type: ignore[arg-type]
+ self.async_data_client = async_data_client
+ self.batch_size = 1000
+ self._upload_id: Optional[str] = None
+ self._force_restart: bool = False
+
+ def generate_upload_id(self) -> str:
+ """Generate a unique upload ID for batch tracking."""
+ if not self._upload_id:
+ self._upload_id = str(uuid.uuid4())
+ return self._upload_id
+
+ async def get_data_async(
+ self, since: Optional[str] = None
+ ) -> AsyncGenerator[TSourceData, None]:
+ """
+ Get data from the async streaming data client.
+
+ Args:
+ since: If provided, only get data modified since this timestamp.
+
+ Yields:
+ Individual data items from the source
+ """
+ logger.info(
+ f"Fetching async streaming data from source{' since ' + since if since else ''}"
+ )
+ async for item in self.async_data_client.get_source_data(since=since):
+ yield item
+
+ async def index_data_async(
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+ ) -> None:
+ """
+ Index data from the datasource to Glean using async streaming.
+
+ Args:
+ mode: The indexing mode to use (FULL or INCREMENTAL).
+ force_restart: If True, forces a restart of the upload.
+ """
+ logger.info(
+ f"Starting {mode.name.lower()} async streaming indexing for datasource '{self.name}'"
+ )
+
+ since = None
+ if mode == IndexingMode.INCREMENTAL:
+ since = self._get_last_crawl_timestamp()
+ logger.info(f"Incremental crawl since: {since}")
+
+ upload_id = self.generate_upload_id()
+ self._force_restart = force_restart
+ is_first_batch = True
+ batch: List[TSourceData] = []
+ batch_count = 0
+
+ try:
+ data_iterator = self.get_data_async(since=since).__aiter__()
+ exhausted = False
+
+ while not exhausted:
+ try:
+ item = await data_iterator.__anext__()
+ batch.append(item)
+
+ if len(batch) == self.batch_size:
+ try:
+ next_item = await data_iterator.__anext__()
+
+ await self._process_batch_async(
+ batch=batch,
+ upload_id=upload_id,
+ is_first_batch=is_first_batch,
+ is_last_batch=False,
+ batch_number=batch_count,
+ )
+
+ batch_count += 1
+ batch = [next_item]
+ is_first_batch = False
+
+ except StopAsyncIteration:
+ exhausted = True
+
+ except StopAsyncIteration:
+ exhausted = True
+
+ if batch:
+ await self._process_batch_async(
+ batch=batch,
+ upload_id=upload_id,
+ is_first_batch=is_first_batch,
+ is_last_batch=True,
+ batch_number=batch_count,
+ )
+ batch_count += 1
+
+ logger.info(
+ f"Async streaming indexing completed successfully. Processed {batch_count} batches."
+ )
+
+ except Exception as e:
+ logger.exception(f"Error during async streaming indexing: {e}")
+ raise
+
+ async def _process_batch_async(
+ self,
+ batch: List[TSourceData],
+ upload_id: str,
+ is_first_batch: bool,
+ is_last_batch: bool,
+ batch_number: int,
+ ) -> None:
+ """
+ Process a single batch of data.
+
+ Args:
+ batch: The batch of raw data to process
+ upload_id: The upload ID for this indexing session
+ is_first_batch: Whether this is the first batch
+ is_last_batch: Whether this is the last batch
+ batch_number: The sequence number of this batch
+ """
+ logger.info(f"Processing batch {batch_number} with {len(batch)} items")
+
+ try:
+ transformed_batch = self.transform(batch)
+ logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
+
+ bulk_index_kwargs = {
+ "datasource": self.name,
+ "documents": list(transformed_batch),
+ "upload_id": upload_id,
+ "is_first_page": is_first_batch,
+ "is_last_page": is_last_batch,
+ }
+
+ if self._force_restart and is_first_batch:
+ bulk_index_kwargs["forceRestartUpload"] = True
+ logger.info("Force restarting upload - discarding any previous upload progress")
+
+ with api_client() as client:
+ client.indexing.documents.bulk_index(**bulk_index_kwargs)
+
+ logger.info(f"Batch {batch_number} indexed successfully")
+
+ except Exception as e:
+ logger.error(f"Failed to process batch {batch_number}: {e}")
+ raise
+
+ def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+ """
+ Sync fallback - collects all data into memory.
+
+ Warning: This defeats the purpose of streaming. Use get_data_async() instead.
+ """
+
+ async def collect() -> List[TSourceData]:
+ result: List[TSourceData] = []
+ async for item in self.get_data_async(since=since):
+ result.append(item)
+ return result
+
+ logger.warning(
+ "Sync get_data() called on async connector - using asyncio.run(). "
+ "Consider using get_data_async() for better performance."
+ )
+ return asyncio.run(collect())
+
+ def index_data(
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+ ) -> None:
+ """
+ Sync fallback for index_data.
+
+ Warning: This blocks the current thread. Use index_data_async() instead.
+ """
+ logger.warning(
+ "Sync index_data() called on async connector - using asyncio.run(). "
+ "Consider using index_data_async() for better performance."
+ )
+ asyncio.run(self.index_data_async(mode=mode, force_restart=force_restart))
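A sketch of how the two new classes compose end to end, reusing the hypothetical `ArticleData`/`ArticleAsyncDataClient` from the earlier sketch. The config value and the `DocumentDefinition` field names are illustrative assumptions drawn from the README examples quoted later in this diff; adjust to your schema.

```python
import asyncio
from typing import Sequence

from glean.indexing.connectors import BaseAsyncStreamingDatasourceConnector
from glean.indexing.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition


class ArticleAsyncConnector(BaseAsyncStreamingDatasourceConnector[ArticleData]):
    # Minimal config; additional CustomDatasourceConfig fields omitted for brevity.
    configuration = CustomDatasourceConfig(name="articles")

    def transform(self, data: Sequence[ArticleData]) -> Sequence[DocumentDefinition]:
        # Field names below are assumptions about the Glean document model,
        # not something this diff defines.
        return [
            DocumentDefinition(
                id=article.id,
                title=article.title,
                datasource=self.name,
                body=ContentDefinition(mime_type="text/plain", text_content=article.body),
            )
            for article in data
        ]


async def main() -> None:
    client = ArticleAsyncDataClient("https://kb.example.com/api")
    connector = ArticleAsyncConnector("articles", client)
    connector.configure_datasource()   # register the datasource, as in the README example
    await connector.index_data_async() # streams and uploads in batches of 1000 by default


asyncio.run(main())
```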
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
  pass

  @abstractmethod
- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
- """Index data from the connector to Glean."""
+ def index_data(
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+ ) -> None:
+ """Index data from the connector to Glean.
+
+ Args:
+ mode: The indexing mode to use (FULL or INCREMENTAL).
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+ """
  pass
@@ -29,7 +29,3 @@ class BaseDataClient(ABC, Generic[TSourceData]):
  A sequence of data items from the source
  """
  pass
-
-
- # Alias for backward compatibility during transition
- BaseConnectorDataClient = BaseDataClient
@@ -6,9 +6,10 @@ from abc import ABC
  from typing import Optional, Sequence

  from glean.api_client.models import DocumentDefinition
+
  from glean.indexing.common import BatchProcessor, api_client
  from glean.indexing.connectors.base_connector import BaseConnector
- from glean.indexing.connectors.base_data_client import BaseConnectorDataClient
+ from glean.indexing.connectors.base_data_client import BaseDataClient
  from glean.indexing.models import (
  CustomDatasourceConfig,
  DatasourceIdentityDefinitions,
@@ -36,7 +37,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
  name (str): The unique name of the connector (should be snake_case).
  configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
  batch_size (int): The batch size for uploads (default: 1000).
- data_client (BaseConnectorDataClient): The data client for fetching source data.
+ data_client (BaseDataClient): The data client for fetching source data.
  observability (ConnectorObservability): Observability and metrics for this connector.

  Example:
@@ -47,7 +48,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB

  configuration: CustomDatasourceConfig

- def __init__(self, name: str, data_client: BaseConnectorDataClient[TSourceData]):
+ def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
  """
  Initialize the datasource connector.

@@ -114,12 +115,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
  client.indexing.datasources.add(**config.dict(exclude_unset=True))
  logger.info(f"Successfully configured datasource: {config.name}")

- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+ def index_data(
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+ ) -> None:
  """
  Index data from the datasource to Glean with identity crawl followed by content crawl.

  Args:
  mode: The indexing mode to use (FULL or INCREMENTAL).
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+ This sets forceRestartUpload=True on the first batch and generates a new upload ID.
  """
  self._observability.start_execution()

@@ -169,7 +174,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
  self._observability.start_timer("data_upload")
  if documents:
  logger.info(f"Indexing {len(documents)} documents")
- self._batch_index_documents(documents)
+ self._batch_index_documents(documents, force_restart=force_restart)
  self._observability.end_timer("data_upload")

  logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +277,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
  self._observability.increment_counter("batch_upload_errors")
  raise

- def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
- """Index documents in batches with proper page signaling."""
+ def _batch_index_documents(
+ self, documents: Sequence[DocumentDefinition], force_restart: bool = False
+ ) -> None:
+ """Index documents in batches with proper page signaling.
+
+ Args:
+ documents: The documents to index
+ force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+ """
  if not documents:
  return

@@ -285,14 +297,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
  upload_id = str(uuid.uuid4())
  for i, batch in enumerate(batches):
  try:
+ is_first_page = i == 0
+ bulk_index_kwargs = {
+ "datasource": self.name,
+ "documents": list(batch),
+ "upload_id": upload_id,
+ "is_first_page": is_first_page,
+ "is_last_page": (i == total_batches - 1),
+ }
+
+ if force_restart and is_first_page:
+ bulk_index_kwargs["forceRestartUpload"] = True
+ logger.info("Force restarting upload - discarding any previous upload progress")
+
  with api_client() as client:
- client.indexing.documents.bulk_index(
- datasource=self.name,
- documents=list(batch),
- upload_id=upload_id,
- is_first_page=(i == 0),
- is_last_page=(i == total_batches - 1),
- )
+ client.indexing.documents.bulk_index(**bulk_index_kwargs)

  logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
  self._observability.increment_counter("batches_uploaded")
@@ -6,9 +6,10 @@ from abc import ABC
  from typing import Optional, Sequence

  from glean.api_client.models import EmployeeInfoDefinition
+
  from glean.indexing.common import BatchProcessor, api_client
  from glean.indexing.connectors.base_connector import BaseConnector
- from glean.indexing.connectors.base_data_client import BaseConnectorDataClient
+ from glean.indexing.connectors.base_data_client import BaseDataClient
  from glean.indexing.models import IndexingMode, TSourceData
  from glean.indexing.observability.observability import ConnectorObservability

@@ -31,7 +32,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
  name (str): The unique name of the connector (should be snake_case).
  configuration (CustomDatasourceConfig): The people source configuration for Glean registration.
  batch_size (int): The batch size for uploads (default: 1000).
- data_client (BaseConnectorDataClient): The data client for fetching source data.
+ data_client (BaseDataClient): The data client for fetching source data.
  observability (ConnectorObservability): Observability and metrics for this connector.

  Example:
@@ -40,7 +41,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
  ...
  """

- def __init__(self, name: str, data_client: BaseConnectorDataClient[TSourceData]):
+ def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
  """
  Initialize the people connector.

@@ -58,11 +59,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
  """The observability instance for this connector."""
  return self._observability

- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+ def index_data(
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+ ) -> None:
  """Index people data to Glean.

  Args:
  mode: The indexing mode to use (FULL or INCREMENTAL).
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+ This sets forceRestartUpload=True on the first batch and generates a new upload ID.
  """
  self._observability.start_execution()

@@ -89,7 +94,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
  self._observability.record_metric("employees_transformed", len(employees))

  self._observability.start_timer("data_upload")
- self._batch_index_employees(employees)
+ self._batch_index_employees(employees, force_restart=force_restart)
  self._observability.end_timer("data_upload")

  logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +118,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
  """
  return self.data_client.get_source_data(since=since)

- def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
- """Index employees to Glean in batches."""
+ def _batch_index_employees(
+ self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
+ ) -> None:
+ """Index employees to Glean in batches.
+
+ Args:
+ employees: The employees to index
+ force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+ """
  if not employees:
  return

@@ -126,13 +138,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
  upload_id = str(uuid.uuid4())
  for i, batch in enumerate(batches):
  try:
+ is_first_page = i == 0
+ bulk_index_kwargs = {
+ "employees": list(batch),
+ "upload_id": upload_id,
+ "is_first_page": is_first_page,
+ "is_last_page": (i == total_batches - 1),
+ }
+
+ if force_restart and is_first_page:
+ bulk_index_kwargs["forceRestartUpload"] = True
+ logger.info("Force restarting upload - discarding any previous upload progress")
+
  with api_client() as client:
- client.indexing.people.bulk_index(
- employees=list(batch),
- upload_id=upload_id,
- is_first_page=(i == 0),
- is_last_page=(i == total_batches - 1),
- )
+ client.indexing.people.bulk_index(**bulk_index_kwargs)

  logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
  self._observability.increment_counter("batches_uploaded")
@@ -33,7 +33,3 @@ class BaseStreamingDataClient(ABC, Generic[TSourceData]):
  A generator of data items.
  """
  pass
-
-
- # Alias for backward compatibility during transition
- StreamingConnectorDataClient = BaseStreamingDataClient
@@ -6,7 +6,8 @@ from abc import ABC
  from typing import Generator, List, Optional, Sequence

  from glean.indexing.common import api_client
- from glean.indexing.connectors import BaseDatasourceConnector, StreamingConnectorDataClient
+ from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+ from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
  from glean.indexing.models import IndexingMode, TSourceData

  logger = logging.getLogger(__name__)
@@ -28,7 +29,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
  name (str): The unique name of the connector (should be snake_case).
  configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
  batch_size (int): The batch size for uploads (default: 1000).
- data_client (StreamingConnectorDataClient): The streaming data client for fetching source data.
+ data_client (BaseStreamingDataClient): The streaming data client for fetching source data.
  observability (ConnectorObservability): Observability and metrics for this connector.

  Notes:
@@ -41,12 +42,13 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
  ...
  """

- def __init__(self, name: str, data_client: StreamingConnectorDataClient[TSourceData]):
+ def __init__(self, name: str, data_client: BaseStreamingDataClient[TSourceData]):
  # Note: We pass the streaming client as-is since it's a specialized version
  # The type checker may warn about this, but it's intentional for streaming
  super().__init__(name, data_client) # type: ignore[arg-type]
  self.batch_size = 1000
  self._upload_id: Optional[str] = None
+ self._force_restart: bool = False

  def generate_upload_id(self) -> str:
  """Generate a unique upload ID for batch tracking."""
@@ -67,20 +69,26 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
  logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
  yield from self.data_client.get_source_data(since=since)

- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+ def index_data(
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+ ) -> None:
  """
  Index data from the datasource to Glean using streaming.

  Args:
  mode: The indexing mode to use (FULL or INCREMENTAL).
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+ This sets forceRestartUpload=True on the first batch and generates a new upload ID.
  """
  logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")

  since = None
  if mode == IndexingMode.INCREMENTAL:
- since = "2023-01-01T00:00:00Z"
+ since = self._get_last_crawl_timestamp()
+ logger.info(f"Incremental crawl since: {since}")

  upload_id = self.generate_upload_id()
+ self._force_restart = force_restart
  data_iterator = self.get_data(since=since)
  is_first_batch = True
  batch: List[TSourceData] = []
@@ -150,14 +158,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
  transformed_batch = self.transform(batch)
  logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")

+ bulk_index_kwargs = {
+ "datasource": self.name,
+ "documents": list(transformed_batch),
+ "upload_id": upload_id,
+ "is_first_page": is_first_batch,
+ "is_last_page": is_last_batch,
+ }
+
+ if self._force_restart and is_first_batch:
+ bulk_index_kwargs["forceRestartUpload"] = True
+ logger.info("Force restarting upload - discarding any previous upload progress")
+
  with api_client() as client:
- client.indexing.documents.bulk_index(
- datasource=self.name,
- documents=list(transformed_batch),
- upload_id=upload_id,
- is_first_page=is_first_batch,
- is_last_page=is_last_batch,
- )
+ client.indexing.documents.bulk_index(**bulk_index_kwargs)

  logger.info(f"Batch {batch_number} indexed successfully")

@@ -4,6 +4,7 @@ import logging
  from typing import Any, Dict, List, Optional

  from glean.api_client.models import DocumentDefinition, EmployeeInfoDefinition
+
  from glean.indexing.testing.response_validator import ResponseValidator

  logger = logging.getLogger(__name__)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: glean-indexing-sdk
- Version: 0.1.0
+ Version: 0.3.0
  Summary: SDK for building custom Glean indexing integrations
  Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
  Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
  connector.index_data(mode=IndexingMode.FULL)
  ```

+ **When to use forced restarts:**
+ - When you need to abort and restart a failed or interrupted upload
+ - When you want to ensure a clean upload state by discarding partial uploads
+ - When recovering from upload errors or inconsistent states
+
+ **How it works:**
+ - Generates a new `upload_id` to ensure clean separation from previous uploads
+ - Sets `forceRestartUpload=True` on the **first batch only**
+ - Continues with normal batch processing for subsequent batches
+
+ This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
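As a usage sketch of the flag described above (the `connector` here is the one constructed in the quoted README example; `force_restart` is the keyword added in this release):

```python
# Discard any partially completed upload and start a clean full crawl.
connector.index_data(mode=IndexingMode.FULL, force_restart=True)
```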
  ### Complete Example

  ```python snippet=non_streaming/complete.py
@@ -423,6 +435,7 @@ class LargeKnowledgeBaseClient(StreamingConnectorDataClient[ArticleData]):
  from typing import List, Sequence

  from glean.api_client.models.userreferencedefinition import UserReferenceDefinition
+
  from glean.indexing.connectors import BaseStreamingDatasourceConnector
  from glean.indexing.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition

@@ -1,4 +1,4 @@
- glean/indexing/__init__.py,sha256=pYmCWpPddpoOR3fGN2ex8wjcQM4PHF8VM1ylmeHfxZY,1519
+ glean/indexing/__init__.py,sha256=APnkKfvATYeZF1NCePp7V2OAa5mwWTf7D_aCKaYV9Gw,1629
  glean/indexing/models.py,sha256=UuaEDCx0ygvU4u0lRbSn4YXXZVo7D_pyD_whQtjORm8,1223
  glean/indexing/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
  glean/indexing/common/__init__.py,sha256=6COS3jP66xJ7VcNGI8I95tkF5zpqHy9QPVn82CB4m4I,513
@@ -8,21 +8,23 @@ glean/indexing/common/glean_client.py,sha256=tKRWK_C1Nja0gVy2FLnj9SmUbpIdOA3WKmp
  glean/indexing/common/metrics.py,sha256=SWCWCYnNOkN4cnwCxyWyEF8iHVwQ4HZqhewi2lqyS84,1771
  glean/indexing/common/mocks.py,sha256=-TbLzpZ7yUstQW58AICixiIQM2CV5_OPRXejjI_brhE,726
  glean/indexing/common/property_definition_builder.py,sha256=NZFhSqsSZlhI0Ia76sn0meYr82msBMCKMd78zMKLWAM,3724
- glean/indexing/connectors/__init__.py,sha256=YaHEmCj246zKIvPIAOjTBTDV2O-KvMLncc6jjmaEeOw,1035
- glean/indexing/connectors/base_connector.py,sha256=Q435TzSLqs0OTFBrD3KCcjQnGSICQg11pdSfJ7C3XtI,2398
- glean/indexing/connectors/base_data_client.py,sha256=krOFHJbwCZI-hCS6fr-z44TvjCbPCTCw54hkk0CZFsQ,1004
- glean/indexing/connectors/base_datasource_connector.py,sha256=x0Fsc7uCKgTtTgyOus1yDFBr87JbVGHM3zHFp9mGgc4,12440
- glean/indexing/connectors/base_people_connector.py,sha256=XuSCFyegenW271GZJ408IQgT19sBq9C9NkKHkiSxLKg,6239
- glean/indexing/connectors/base_streaming_data_client.py,sha256=xW67crQ_rHaOnD0NFBi2zTGex9JGME886CjX4EqgbZM,1241
- glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=wUcsBPExzmgMQd6P24epR4bZFBl40aN6qm6di_F2hmA,7116
+ glean/indexing/connectors/__init__.py,sha256=d9U2-elD7DewkuvY02UQJ1_khhdYVwyQCkADzg8jVjw,1147
+ glean/indexing/connectors/base_async_streaming_data_client.py,sha256=JaKa1kfK1R1FKI7151g0zsbCutS7TmpZLabQi0LetA4,1419
+ glean/indexing/connectors/base_async_streaming_datasource_connector.py,sha256=l6BuIbz_OGFxSZv5BsJ1uOFJLlwrf9BgJugtSXmuayE,8627
+ glean/indexing/connectors/base_connector.py,sha256=m_zKbg-MMc1bjG5m2SsIarSeiPhFJKzfBQzgnlqTKF8,2640
+ glean/indexing/connectors/base_data_client.py,sha256=0_QSdcjr1VK1COnpbzJFzowDVpODIRAPHgsjMNRh4As,908
+ glean/indexing/connectors/base_datasource_connector.py,sha256=8_FQcQsc5gX9g_N6nw_8jj0ppccaBtGMjID2bBq9VcU,13271
+ glean/indexing/connectors/base_people_connector.py,sha256=7aD_B8mVUWKinV4kfzWVw0y3RRIbKZ-AbONywQf2Gxc,7071
+ glean/indexing/connectors/base_streaming_data_client.py,sha256=0p_OPLv7eKKCER3tuvsOuvzakiQhAG-ztyKUs9bSIl0,1131
+ glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=96gehVYoxrzgHLr2U-EzO9kuKMdy_GsZ56QR2m3qls8,7872
  glean/indexing/observability/__init__.py,sha256=SuWJ7pHs5WFq5vL036B3RIsJSbjDsy6SI705u83874I,455
  glean/indexing/observability/observability.py,sha256=cHlo-tbrmGie6YeWXqEUap0YE6JRtFvOKTnxWD-7yac,9222
  glean/indexing/testing/__init__.py,sha256=h9mK0QjRZD5f470ePTeg635jZNwPBAd2S7g1DQO4LuE,448
  glean/indexing/testing/connector_test_harness.py,sha256=CMQZmn0cOIrj_GdIHb3OwRN9jTaZrn3pYkHHz50rqK8,1988
  glean/indexing/testing/mock_data_source.py,sha256=ICYbbHQZe9RVTzvrlwcxp_suxm9yXgjEAGiNCU-SkS4,1325
- glean/indexing/testing/mock_glean_client.py,sha256=aY_Jfg_NJNPw2HSM1IshgT2lkT59SD9BJzOnvNFJhck,2528
+ glean/indexing/testing/mock_glean_client.py,sha256=-0-ppfD1DmLbmtc5T_vFOfZB_ACx2RL6MAoVUqxl_Us,2529
  glean/indexing/testing/response_validator.py,sha256=jehEtXlW0AQcOVck-_VPoDFtQM_vkHJQ10SUN1ftr1Q,1800
- glean_indexing_sdk-0.1.0.dist-info/METADATA,sha256=Y5J0IXw5FzP6k_Ao7AlU7RGPgW3Jom1noJMZDU8gHYw,15619
- glean_indexing_sdk-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- glean_indexing_sdk-0.1.0.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
- glean_indexing_sdk-0.1.0.dist-info/RECORD,,
+ glean_indexing_sdk-0.3.0.dist-info/METADATA,sha256=lpXuoNquAdBGHTGhm1XNYzvAhYKPudKdCPBQ41q95v0,16225
+ glean_indexing_sdk-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ glean_indexing_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
+ glean_indexing_sdk-0.3.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: hatchling 1.27.0
+ Generator: hatchling 1.28.0
  Root-Is-Purelib: true
  Tag: py3-none-any