glean-indexing-sdk 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glean/indexing/__init__.py +18 -18
- glean/indexing/connectors/__init__.py +7 -5
- glean/indexing/connectors/base_async_streaming_data_client.py +42 -0
- glean/indexing/connectors/base_async_streaming_datasource_connector.py +233 -0
- glean/indexing/connectors/base_connector.py +9 -2
- glean/indexing/connectors/base_data_client.py +0 -4
- glean/indexing/connectors/base_datasource_connector.py +33 -14
- glean/indexing/connectors/base_people_connector.py +32 -13
- glean/indexing/connectors/base_streaming_data_client.py +0 -4
- glean/indexing/connectors/base_streaming_datasource_connector.py +26 -12
- glean/indexing/testing/mock_glean_client.py +1 -0
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/METADATA +14 -1
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/RECORD +15 -13
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/WHEEL +1 -1
- {glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/licenses/LICENSE +0 -0
glean/indexing/__init__.py
CHANGED
```diff
@@ -1,56 +1,56 @@
 """Glean Indexing SDK.
 
-A Python SDK for building custom Glean indexing solutions. This package provides
+A Python SDK for building custom Glean indexing solutions. This package provides
 the base classes and utilities to create custom connectors for Glean's indexing APIs.
 """
 
-from importlib.metadata import
+from importlib.metadata import PackageNotFoundError, version
+
+from glean.indexing import models
+from glean.indexing.common import BatchProcessor, ConnectorMetrics, ContentFormatter, MockGleanClient, api_client
 from glean.indexing.connectors import (
+    BaseAsyncStreamingDataClient,
+    BaseAsyncStreamingDatasourceConnector,
     BaseConnector,
+    BaseDataClient,
     BaseDatasourceConnector,
-    BaseStreamingDatasourceConnector,
     BasePeopleConnector,
-
-
+    BaseStreamingDataClient,
+    BaseStreamingDatasourceConnector,
 )
-from glean.indexing.common import BatchProcessor, ContentFormatter, ConnectorMetrics, api_client, MockGleanClient
-from glean.indexing.observability.observability import ConnectorObservability
-from glean.indexing.testing import ConnectorTestHarness
 from glean.indexing.models import (
     DatasourceIdentityDefinitions,
     IndexingMode,
-    TSourceData,
     TIndexableEntityDefinition,
+    TSourceData,
 )
-from glean.indexing import
+from glean.indexing.observability.observability import ConnectorObservability
+from glean.indexing.testing import ConnectorTestHarness
 
 __all__ = [
     "BaseConnector",
+    "BaseDataClient",
     "BaseDatasourceConnector",
     "BasePeopleConnector",
+    "BaseStreamingDataClient",
     "BaseStreamingDatasourceConnector",
-
-    "
-    "StreamingConnectorDataClient",
-
+    "BaseAsyncStreamingDataClient",
+    "BaseAsyncStreamingDatasourceConnector",
     "BatchProcessor",
     "ContentFormatter",
     "ConnectorMetrics",
     "ConnectorObservability",
     "ConnectorTestHarness",
-
     "DatasourceIdentityDefinitions",
     "IndexingMode",
     "TSourceData",
     "TIndexableEntityDefinition",
-
     "MockGleanClient",
     "api_client",
-
     "models",
 ]
 
 try:
     __version__ = version("glean-indexing-sdk")
 except PackageNotFoundError:
-    __version__ = "0.
+    __version__ = "0.3.0"
```
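For downstream users, the practical effect of this re-export shuffle is that the 0.1.0 backward-compatibility aliases are gone and the data-client base classes are now exported from the package root. A migration sketch (the removed names are taken from the deleted `__all__` entries; the exact 0.1.0 import forms are not fully visible in this diff):

```python
# 0.1.0 (removed): backward-compatibility aliases such as
# from glean.indexing import StreamingConnectorDataClient

# 0.3.0: import the Base* names directly from the package root
from glean.indexing import (
    BaseAsyncStreamingDataClient,
    BaseDataClient,
    BaseStreamingDataClient,
)
```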

glean/indexing/connectors/__init__.py
CHANGED
```diff
@@ -1,21 +1,23 @@
 """Connector implementations for Glean indexing."""
 
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import BaseDataClient
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
 from glean.indexing.connectors.base_people_connector import BasePeopleConnector
-from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
 from glean.indexing.connectors.base_streaming_datasource_connector import BaseStreamingDatasourceConnector
+from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+from glean.indexing.connectors.base_async_streaming_datasource_connector import BaseAsyncStreamingDatasourceConnector
 from glean.indexing.testing.connector_test_harness import ConnectorTestHarness
 
 __all__ = [
     "BaseConnector",
     "BaseDataClient",
-    "BaseConnectorDataClient", # Backward compatibility alias
     "BaseDatasourceConnector",
     "BasePeopleConnector",
-    "BaseStreamingDataClient",
-    "StreamingConnectorDataClient", # Backward compatibility alias
+    "BaseStreamingDataClient",
     "BaseStreamingDatasourceConnector",
+    "BaseAsyncStreamingDataClient",
+    "BaseAsyncStreamingDatasourceConnector",
     "ConnectorTestHarness",
 ]
```

glean/indexing/connectors/base_async_streaming_data_client.py
ADDED
```diff
@@ -0,0 +1,42 @@
+"""Base async streaming data client for fetching data in chunks."""
+
+from abc import ABC, abstractmethod
+from typing import Any, AsyncGenerator, Generic
+
+from glean.indexing.models import TSourceData
+
+
+class BaseAsyncStreamingDataClient(ABC, Generic[TSourceData]):
+    """
+    Base class for async streaming data clients that fetch data in chunks.
+
+    Use this for large datasets with async APIs to minimize memory usage
+    and maximize I/O throughput.
+
+    Type Parameters:
+        TSourceData: The type of data yielded from the external source
+
+    Example:
+        class MyAsyncDataClient(BaseAsyncStreamingDataClient[MyDocData]):
+            async def get_source_data(self, **kwargs) -> AsyncGenerator[MyDocData, None]:
+                async for page in self.fetch_pages():
+                    for item in page:
+                        yield item
+    """
+
+    @abstractmethod
+    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[TSourceData, None]:
+        """
+        Retrieves source data as an async generator.
+
+        This method should be implemented to return an async generator
+        that yields data items one at a time or in small batches.
+
+        Args:
+            **kwargs: Additional keyword arguments for customizing data retrieval.
+
+        Yields:
+            Individual data items from the external source.
+        """
+        if False:
+            yield  # type: ignore[misc]
```
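To make the contract concrete, here is a minimal hypothetical subclass; the paginated endpoint, the `httpx` dependency, and the `dict` payload type are illustrative assumptions, not part of the SDK:

```python
from typing import Any, AsyncGenerator

import httpx  # assumption: any async HTTP client would do

from glean.indexing.connectors import BaseAsyncStreamingDataClient


class WikiAsyncDataClient(BaseAsyncStreamingDataClient[dict]):
    """Hypothetical client paging through a wiki's REST API."""

    def __init__(self, base_url: str) -> None:
        self.base_url = base_url

    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[dict, None]:
        since = kwargs.get("since")  # forwarded by the connector for incremental crawls
        params: dict = {"page": 0}
        if since:
            params["modified_since"] = since  # hypothetical query parameter
        async with httpx.AsyncClient() as http:
            while True:
                resp = await http.get(f"{self.base_url}/articles", params=params)
                resp.raise_for_status()
                items = resp.json()
                if not items:
                    return  # ends the async generator
                for item in items:
                    yield item  # one item at a time keeps memory flat
                params["page"] += 1
```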

glean/indexing/connectors/base_async_streaming_datasource_connector.py
ADDED
```diff
@@ -0,0 +1,233 @@
+"""Base async streaming datasource connector for memory-efficient processing of large datasets."""
+
+import asyncio
+import logging
+import uuid
+from abc import ABC
+from typing import AsyncGenerator, List, Optional, Sequence
+
+from glean.indexing.common import api_client
+from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.models import IndexingMode, TSourceData
+
+logger = logging.getLogger(__name__)
+
+
+class BaseAsyncStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
+    """
+    Base class for async streaming datasource connectors.
+
+    This class provides async-native streaming for memory-efficient processing
+    of large datasets. Use this when your data source provides async APIs
+    (e.g., aiohttp, httpx async, etc.).
+
+    To implement a custom async streaming connector, inherit from this class and implement:
+    - configuration: CustomDatasourceConfig (class or instance attribute)
+    - async_data_client: BaseAsyncStreamingDataClient (set in __init__)
+    - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+        configuration (CustomDatasourceConfig): The datasource configuration.
+        batch_size (int): The batch size for uploads (default: 1000).
+        async_data_client (BaseAsyncStreamingDataClient): The async streaming data client.
+
+    Example:
+        class MyAsyncConnector(BaseAsyncStreamingDatasourceConnector[MyDocData]):
+            configuration = CustomDatasourceConfig(...)
+
+            def __init__(self, name: str):
+                async_client = MyAsyncDataClient()
+                super().__init__(name, async_client)
+
+            def transform(self, data: Sequence[MyDocData]) -> Sequence[DocumentDefinition]:
+                return [self._transform_doc(d) for d in data]
+    """
+
+    def __init__(
+        self,
+        name: str,
+        async_data_client: BaseAsyncStreamingDataClient[TSourceData],
+    ):
+        super().__init__(name, None)  # type: ignore[arg-type]
+        self.async_data_client = async_data_client
+        self.batch_size = 1000
+        self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
+
+    def generate_upload_id(self) -> str:
+        """Generate a unique upload ID for batch tracking."""
+        if not self._upload_id:
+            self._upload_id = str(uuid.uuid4())
+        return self._upload_id
+
+    async def get_data_async(
+        self, since: Optional[str] = None
+    ) -> AsyncGenerator[TSourceData, None]:
+        """
+        Get data from the async streaming data client.
+
+        Args:
+            since: If provided, only get data modified since this timestamp.
+
+        Yields:
+            Individual data items from the source
+        """
+        logger.info(
+            f"Fetching async streaming data from source{' since ' + since if since else ''}"
+        )
+        async for item in self.async_data_client.get_source_data(since=since):
+            yield item
+
+    async def index_data_async(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """
+        Index data from the datasource to Glean using async streaming.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload.
+        """
+        logger.info(
+            f"Starting {mode.name.lower()} async streaming indexing for datasource '{self.name}'"
+        )
+
+        since = None
+        if mode == IndexingMode.INCREMENTAL:
+            since = self._get_last_crawl_timestamp()
+            logger.info(f"Incremental crawl since: {since}")
+
+        upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
+        is_first_batch = True
+        batch: List[TSourceData] = []
+        batch_count = 0
+
+        try:
+            data_iterator = self.get_data_async(since=since).__aiter__()
+            exhausted = False
+
+            while not exhausted:
+                try:
+                    item = await data_iterator.__anext__()
+                    batch.append(item)
+
+                    if len(batch) == self.batch_size:
+                        try:
+                            next_item = await data_iterator.__anext__()
+
+                            await self._process_batch_async(
+                                batch=batch,
+                                upload_id=upload_id,
+                                is_first_batch=is_first_batch,
+                                is_last_batch=False,
+                                batch_number=batch_count,
+                            )
+
+                            batch_count += 1
+                            batch = [next_item]
+                            is_first_batch = False
+
+                        except StopAsyncIteration:
+                            exhausted = True
+
+                except StopAsyncIteration:
+                    exhausted = True
+
+            if batch:
+                await self._process_batch_async(
+                    batch=batch,
+                    upload_id=upload_id,
+                    is_first_batch=is_first_batch,
+                    is_last_batch=True,
+                    batch_number=batch_count,
+                )
+                batch_count += 1
+
+            logger.info(
+                f"Async streaming indexing completed successfully. Processed {batch_count} batches."
+            )
+
+        except Exception as e:
+            logger.exception(f"Error during async streaming indexing: {e}")
+            raise
+
+    async def _process_batch_async(
+        self,
+        batch: List[TSourceData],
+        upload_id: str,
+        is_first_batch: bool,
+        is_last_batch: bool,
+        batch_number: int,
+    ) -> None:
+        """
+        Process a single batch of data.
+
+        Args:
+            batch: The batch of raw data to process
+            upload_id: The upload ID for this indexing session
+            is_first_batch: Whether this is the first batch
+            is_last_batch: Whether this is the last batch
+            batch_number: The sequence number of this batch
+        """
+        logger.info(f"Processing batch {batch_number} with {len(batch)} items")
+
+        try:
+            transformed_batch = self.transform(batch)
+            logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
+
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
+            with api_client() as client:
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
+
+            logger.info(f"Batch {batch_number} indexed successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to process batch {batch_number}: {e}")
+            raise
+
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """
+        Sync fallback - collects all data into memory.
+
+        Warning: This defeats the purpose of streaming. Use get_data_async() instead.
+        """
+
+        async def collect() -> List[TSourceData]:
+            result: List[TSourceData] = []
+            async for item in self.get_data_async(since=since):
+                result.append(item)
+            return result
+
+        logger.warning(
+            "Sync get_data() called on async connector - using asyncio.run(). "
+            "Consider using get_data_async() for better performance."
+        )
+        return asyncio.run(collect())
+
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """
+        Sync fallback for index_data.
+
+        Warning: This blocks the current thread. Use index_data_async() instead.
+        """
+        logger.warning(
+            "Sync index_data() called on async connector - using asyncio.run(). "
+            "Consider using index_data_async() for better performance."
+        )
+        asyncio.run(self.index_data_async(mode=mode, force_restart=force_restart))
```
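Worth noting in `index_data_async`: when a batch fills up, the loop pulls one extra item before flushing, which is how it can mark the flush `is_last_page` correctly without buffering the whole stream. Driving the connector end to end is then short; a sketch assuming a `MyAsyncConnector` subclass like the one in the class docstring above (a hypothetical name, not an SDK class):

```python
import asyncio

from glean.indexing.models import IndexingMode

connector = MyAsyncConnector("my_wiki")  # hypothetical subclass from the docstring
connector.configure_datasource()  # register the datasource first

# Full crawl; force_restart=True discards any half-finished upload from a prior run.
asyncio.run(connector.index_data_async(mode=IndexingMode.FULL, force_restart=True))
```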

glean/indexing/connectors/base_connector.py
CHANGED
```diff
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
         pass
 
     @abstractmethod
-    def index_data(
-
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """Index data from the connector to Glean.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+        """
         pass
```

glean/indexing/connectors/base_datasource_connector.py
CHANGED
```diff
@@ -6,9 +6,10 @@ from abc import ABC
 from typing import Optional, Sequence
 
 from glean.api_client.models import DocumentDefinition
+
 from glean.indexing.common import BatchProcessor, api_client
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.models import (
     CustomDatasourceConfig,
     DatasourceIdentityDefinitions,
@@ -36,7 +37,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseDataClient): The data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Example:
@@ -47,7 +48,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
 
     configuration: CustomDatasourceConfig
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
         """
         Initialize the datasource connector.
 
@@ -114,12 +115,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
             client.indexing.datasources.add(**config.dict(exclude_unset=True))
             logger.info(f"Successfully configured datasource: {config.name}")
 
-    def index_data(
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean with identity crawl followed by content crawl.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -169,7 +174,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         self._observability.start_timer("data_upload")
         if documents:
             logger.info(f"Indexing {len(documents)} documents")
-            self._batch_index_documents(documents)
+            self._batch_index_documents(documents, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +277,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
             self._observability.increment_counter("batch_upload_errors")
             raise
 
-    def _batch_index_documents(
-
+    def _batch_index_documents(
+        self, documents: Sequence[DocumentDefinition], force_restart: bool = False
+    ) -> None:
+        """Index documents in batches with proper page signaling.
+
+        Args:
+            documents: The documents to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not documents:
             return
 
@@ -285,14 +297,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "datasource": self.name,
+                    "documents": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.documents.bulk_index(
-                        datasource=self.name,
-                        documents=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```
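The batched upload now assembles its `bulk_index` arguments as a kwargs dict so that `forceRestartUpload` rides only on the first page; `BasePeopleConnector` below applies the same pattern to `client.indexing.people.bulk_index`. The paging contract is small enough to state on its own; a standalone sketch of the flag logic (plain Python, the names are mine, not SDK API):

```python
from typing import Any, Dict, List


def page_flags(batches: List[List[Any]], force_restart: bool) -> List[Dict[str, Any]]:
    """Reproduce the first/last-page signaling used by _batch_index_documents."""
    calls = []
    total = len(batches)
    for i, batch in enumerate(batches):
        kwargs: Dict[str, Any] = {
            "documents": batch,
            "is_first_page": i == 0,
            "is_last_page": i == total - 1,
        }
        if force_restart and i == 0:
            kwargs["forceRestartUpload"] = True  # first page only
        calls.append(kwargs)
    return calls


assert page_flags([[1], [2], [3]], force_restart=True)[0]["forceRestartUpload"] is True
assert "forceRestartUpload" not in page_flags([[1], [2]], force_restart=True)[1]
```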

glean/indexing/connectors/base_people_connector.py
CHANGED
```diff
@@ -6,9 +6,10 @@ from abc import ABC
 from typing import Optional, Sequence
 
 from glean.api_client.models import EmployeeInfoDefinition
+
 from glean.indexing.common import BatchProcessor, api_client
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.models import IndexingMode, TSourceData
 from glean.indexing.observability.observability import ConnectorObservability
 
@@ -31,7 +32,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The people source configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseDataClient): The data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Example:
@@ -40,7 +41,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         ...
     """
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
         """
         Initialize the people connector.
 
@@ -58,11 +59,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         """The observability instance for this connector."""
         return self._observability
 
-    def index_data(
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """Index people data to Glean.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -89,7 +94,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         self._observability.record_metric("employees_transformed", len(employees))
 
         self._observability.start_timer("data_upload")
-        self._batch_index_employees(employees)
+        self._batch_index_employees(employees, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +118,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         """
         return self.data_client.get_source_data(since=since)
 
-    def _batch_index_employees(
-
+    def _batch_index_employees(
+        self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
+    ) -> None:
+        """Index employees to Glean in batches.
+
+        Args:
+            employees: The employees to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not employees:
             return
 
@@ -126,13 +138,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "employees": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.people.bulk_index(
-                        employees=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.people.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```

glean/indexing/connectors/base_streaming_datasource_connector.py
CHANGED
```diff
@@ -6,7 +6,8 @@ from abc import ABC
 from typing import Generator, List, Optional, Sequence
 
 from glean.indexing.common import api_client
-from glean.indexing.connectors import BaseDatasourceConnector
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
 from glean.indexing.models import IndexingMode, TSourceData
 
 logger = logging.getLogger(__name__)
@@ -28,7 +29,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseStreamingDataClient): The streaming data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Notes:
@@ -41,12 +42,13 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         ...
     """
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseStreamingDataClient[TSourceData]):
         # Note: We pass the streaming client as-is since it's a specialized version
         # The type checker may warn about this, but it's intentional for streaming
         super().__init__(name, data_client)  # type: ignore[arg-type]
         self.batch_size = 1000
         self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
 
     def generate_upload_id(self) -> str:
         """Generate a unique upload ID for batch tracking."""
@@ -67,20 +69,26 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
         yield from self.data_client.get_source_data(since=since)
 
-    def index_data(
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean using streaming.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
 
         since = None
         if mode == IndexingMode.INCREMENTAL:
-            since =
+            since = self._get_last_crawl_timestamp()
+            logger.info(f"Incremental crawl since: {since}")
 
         upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
         data_iterator = self.get_data(since=since)
         is_first_batch = True
         batch: List[TSourceData] = []
@@ -150,14 +158,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
             transformed_batch = self.transform(batch)
             logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
 
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
             with api_client() as client:
-                client.indexing.documents.bulk_index(
-                    datasource=self.name,
-                    documents=list(transformed_batch),
-                    upload_id=upload_id,
-                    is_first_page=is_first_batch,
-                    is_last_page=is_last_batch,
-                )
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
             logger.info(f"Batch {batch_number} indexed successfully")
 
```
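The sync streaming connector labels its batches the same way as the async variant: it cannot know a batch is the last page until it has peeked past it. A minimal sketch of that look-ahead idea, written independently of the SDK's actual loop:

```python
from itertools import islice
from typing import Iterable, Iterator, List, Tuple, TypeVar

T = TypeVar("T")


def lookahead_batches(items: Iterable[T], size: int) -> Iterator[Tuple[List[T], bool]]:
    """Yield (batch, is_last) pairs without materializing the whole stream."""
    it = iter(items)
    batch = list(islice(it, size))
    while batch:
        next_batch = list(islice(it, size))  # peek one batch ahead
        yield batch, not next_batch  # is_last when nothing follows
        batch = next_batch


for batch, is_last in lookahead_batches(range(10), 4):
    print(batch, "last" if is_last else "more to come")
# [0..3] more, [4..7] more, [8, 9] last
```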

{glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glean-indexing-sdk
-Version: 0.
+Version: 0.3.0
 Summary: SDK for building custom Glean indexing integrations
 Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
 Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
@@ -423,6 +435,7 @@ class LargeKnowledgeBaseClient(StreamingConnectorDataClient[ArticleData]):
 from typing import List, Sequence
 
 from glean.api_client.models.userreferencedefinition import UserReferenceDefinition
+
 from glean.indexing.connectors import BaseStreamingDatasourceConnector
 from glean.indexing.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition
 
````
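In code, the behavior the README text above describes is opt-in via one keyword argument; a sketch reusing the README's `connector` object:

```python
from glean.indexing.models import IndexingMode

# A new upload_id is generated and forceRestartUpload=True is sent with the
# first batch only; later batches proceed normally.
connector.index_data(mode=IndexingMode.FULL, force_restart=True)
```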

{glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/RECORD
CHANGED
```diff
@@ -1,4 +1,4 @@
-glean/indexing/__init__.py,sha256=
+glean/indexing/__init__.py,sha256=APnkKfvATYeZF1NCePp7V2OAa5mwWTf7D_aCKaYV9Gw,1629
 glean/indexing/models.py,sha256=UuaEDCx0ygvU4u0lRbSn4YXXZVo7D_pyD_whQtjORm8,1223
 glean/indexing/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
 glean/indexing/common/__init__.py,sha256=6COS3jP66xJ7VcNGI8I95tkF5zpqHy9QPVn82CB4m4I,513
@@ -8,21 +8,23 @@ glean/indexing/common/glean_client.py,sha256=tKRWK_C1Nja0gVy2FLnj9SmUbpIdOA3WKmp
 glean/indexing/common/metrics.py,sha256=SWCWCYnNOkN4cnwCxyWyEF8iHVwQ4HZqhewi2lqyS84,1771
 glean/indexing/common/mocks.py,sha256=-TbLzpZ7yUstQW58AICixiIQM2CV5_OPRXejjI_brhE,726
 glean/indexing/common/property_definition_builder.py,sha256=NZFhSqsSZlhI0Ia76sn0meYr82msBMCKMd78zMKLWAM,3724
-glean/indexing/connectors/__init__.py,sha256=
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
-glean/indexing/connectors/
+glean/indexing/connectors/__init__.py,sha256=d9U2-elD7DewkuvY02UQJ1_khhdYVwyQCkADzg8jVjw,1147
+glean/indexing/connectors/base_async_streaming_data_client.py,sha256=JaKa1kfK1R1FKI7151g0zsbCutS7TmpZLabQi0LetA4,1419
+glean/indexing/connectors/base_async_streaming_datasource_connector.py,sha256=l6BuIbz_OGFxSZv5BsJ1uOFJLlwrf9BgJugtSXmuayE,8627
+glean/indexing/connectors/base_connector.py,sha256=m_zKbg-MMc1bjG5m2SsIarSeiPhFJKzfBQzgnlqTKF8,2640
+glean/indexing/connectors/base_data_client.py,sha256=0_QSdcjr1VK1COnpbzJFzowDVpODIRAPHgsjMNRh4As,908
+glean/indexing/connectors/base_datasource_connector.py,sha256=8_FQcQsc5gX9g_N6nw_8jj0ppccaBtGMjID2bBq9VcU,13271
+glean/indexing/connectors/base_people_connector.py,sha256=7aD_B8mVUWKinV4kfzWVw0y3RRIbKZ-AbONywQf2Gxc,7071
+glean/indexing/connectors/base_streaming_data_client.py,sha256=0p_OPLv7eKKCER3tuvsOuvzakiQhAG-ztyKUs9bSIl0,1131
+glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=96gehVYoxrzgHLr2U-EzO9kuKMdy_GsZ56QR2m3qls8,7872
 glean/indexing/observability/__init__.py,sha256=SuWJ7pHs5WFq5vL036B3RIsJSbjDsy6SI705u83874I,455
 glean/indexing/observability/observability.py,sha256=cHlo-tbrmGie6YeWXqEUap0YE6JRtFvOKTnxWD-7yac,9222
 glean/indexing/testing/__init__.py,sha256=h9mK0QjRZD5f470ePTeg635jZNwPBAd2S7g1DQO4LuE,448
 glean/indexing/testing/connector_test_harness.py,sha256=CMQZmn0cOIrj_GdIHb3OwRN9jTaZrn3pYkHHz50rqK8,1988
 glean/indexing/testing/mock_data_source.py,sha256=ICYbbHQZe9RVTzvrlwcxp_suxm9yXgjEAGiNCU-SkS4,1325
-glean/indexing/testing/mock_glean_client.py,sha256
+glean/indexing/testing/mock_glean_client.py,sha256=-0-ppfD1DmLbmtc5T_vFOfZB_ACx2RL6MAoVUqxl_Us,2529
 glean/indexing/testing/response_validator.py,sha256=jehEtXlW0AQcOVck-_VPoDFtQM_vkHJQ10SUN1ftr1Q,1800
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
+glean_indexing_sdk-0.3.0.dist-info/METADATA,sha256=lpXuoNquAdBGHTGhm1XNYzvAhYKPudKdCPBQ41q95v0,16225
+glean_indexing_sdk-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+glean_indexing_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
+glean_indexing_sdk-0.3.0.dist-info/RECORD,,
```

{glean_indexing_sdk-0.1.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/licenses/LICENSE
File without changes