glean-indexing-sdk 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- glean/indexing/__init__.py +18 -18
- glean/indexing/connectors/__init__.py +7 -5
- glean/indexing/connectors/base_async_streaming_data_client.py +42 -0
- glean/indexing/connectors/base_async_streaming_datasource_connector.py +233 -0
- glean/indexing/connectors/base_data_client.py +0 -4
- glean/indexing/connectors/base_datasource_connector.py +4 -3
- glean/indexing/connectors/base_people_connector.py +4 -3
- glean/indexing/connectors/base_streaming_data_client.py +0 -4
- glean/indexing/connectors/base_streaming_datasource_connector.py +6 -4
- glean/indexing/testing/mock_glean_client.py +1 -0
- {glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/METADATA +2 -1
- {glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/RECORD +14 -12
- {glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/WHEEL +1 -1
- {glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/licenses/LICENSE +0 -0
glean/indexing/__init__.py
CHANGED
@@ -1,56 +1,56 @@
 """Glean Indexing SDK.
 
-A Python SDK for building custom Glean indexing solutions. This package provides
+A Python SDK for building custom Glean indexing solutions. This package provides
 the base classes and utilities to create custom connectors for Glean's indexing APIs.
 """
 
-from importlib.metadata import
+from importlib.metadata import PackageNotFoundError, version
+
+from glean.indexing import models
+from glean.indexing.common import BatchProcessor, ConnectorMetrics, ContentFormatter, MockGleanClient, api_client
 from glean.indexing.connectors import (
+    BaseAsyncStreamingDataClient,
+    BaseAsyncStreamingDatasourceConnector,
     BaseConnector,
+    BaseDataClient,
     BaseDatasourceConnector,
-    BaseStreamingDatasourceConnector,
     BasePeopleConnector,
-
-
+    BaseStreamingDataClient,
+    BaseStreamingDatasourceConnector,
 )
-from glean.indexing.common import BatchProcessor, ContentFormatter, ConnectorMetrics, api_client, MockGleanClient
-from glean.indexing.observability.observability import ConnectorObservability
-from glean.indexing.testing import ConnectorTestHarness
 from glean.indexing.models import (
     DatasourceIdentityDefinitions,
     IndexingMode,
-    TSourceData,
     TIndexableEntityDefinition,
+    TSourceData,
 )
-from glean.indexing import
+from glean.indexing.observability.observability import ConnectorObservability
+from glean.indexing.testing import ConnectorTestHarness
 
 __all__ = [
     "BaseConnector",
+    "BaseDataClient",
     "BaseDatasourceConnector",
     "BasePeopleConnector",
+    "BaseStreamingDataClient",
     "BaseStreamingDatasourceConnector",
-
-    "
-    "StreamingConnectorDataClient",
-
+    "BaseAsyncStreamingDataClient",
+    "BaseAsyncStreamingDatasourceConnector",
     "BatchProcessor",
     "ContentFormatter",
     "ConnectorMetrics",
     "ConnectorObservability",
     "ConnectorTestHarness",
-
     "DatasourceIdentityDefinitions",
     "IndexingMode",
     "TSourceData",
     "TIndexableEntityDefinition",
-
     "MockGleanClient",
     "api_client",
-
     "models",
 ]
 
 try:
     __version__ = version("glean-indexing-sdk")
 except PackageNotFoundError:
-    __version__ = "0.
+    __version__ = "0.3.0"
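
The updated __all__ above re-exports the new client and connector base classes at the package root. A quick look at the new import surface in 0.3.0, taken directly from the diff above:

# New public names importable from the package root in 0.3.0.
from glean.indexing import (
    BaseAsyncStreamingDataClient,
    BaseAsyncStreamingDatasourceConnector,
    BaseDataClient,
    BaseStreamingDataClient,
)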

glean/indexing/connectors/__init__.py
CHANGED
@@ -1,21 +1,23 @@
 """Connector implementations for Glean indexing."""
 
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import BaseDataClient
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
 from glean.indexing.connectors.base_people_connector import BasePeopleConnector
-from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
 from glean.indexing.connectors.base_streaming_datasource_connector import BaseStreamingDatasourceConnector
+from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+from glean.indexing.connectors.base_async_streaming_datasource_connector import BaseAsyncStreamingDatasourceConnector
 from glean.indexing.testing.connector_test_harness import ConnectorTestHarness
 
 __all__ = [
     "BaseConnector",
     "BaseDataClient",
-    "BaseConnectorDataClient",  # Backward compatibility alias
     "BaseDatasourceConnector",
     "BasePeopleConnector",
-    "BaseStreamingDataClient",
-    "StreamingConnectorDataClient",  # Backward compatibility alias
+    "BaseStreamingDataClient",
     "BaseStreamingDatasourceConnector",
+    "BaseAsyncStreamingDataClient",
+    "BaseAsyncStreamingDatasourceConnector",
     "ConnectorTestHarness",
 ]

glean/indexing/connectors/base_async_streaming_data_client.py
ADDED
@@ -0,0 +1,42 @@
+"""Base async streaming data client for fetching data in chunks."""
+
+from abc import ABC, abstractmethod
+from typing import Any, AsyncGenerator, Generic
+
+from glean.indexing.models import TSourceData
+
+
+class BaseAsyncStreamingDataClient(ABC, Generic[TSourceData]):
+    """
+    Base class for async streaming data clients that fetch data in chunks.
+
+    Use this for large datasets with async APIs to minimize memory usage
+    and maximize I/O throughput.
+
+    Type Parameters:
+        TSourceData: The type of data yielded from the external source
+
+    Example:
+        class MyAsyncDataClient(BaseAsyncStreamingDataClient[MyDocData]):
+            async def get_source_data(self, **kwargs) -> AsyncGenerator[MyDocData, None]:
+                async for page in self.fetch_pages():
+                    for item in page:
+                        yield item
+    """
+
+    @abstractmethod
+    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[TSourceData, None]:
+        """
+        Retrieves source data as an async generator.
+
+        This method should be implemented to return an async generator
+        that yields data items one at a time or in small batches.
+
+        Args:
+            **kwargs: Additional keyword arguments for customizing data retrieval.
+
+        Yields:
+            Individual data items from the external source.
+        """
+        if False:
+            yield  # type: ignore[misc]
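
For orientation, a minimal sketch of a concrete subclass, expanding on the docstring example above. It assumes a paginated HTTP API fetched with httpx and a hypothetical MyDocData record type; the endpoint, query parameters, and record fields are illustrative assumptions, not part of the package.

# Illustrative sketch only: the API endpoint, page-size parameter, and
# MyDocData fields are assumptions layered on the docstring example above.
from dataclasses import dataclass
from typing import Any, AsyncGenerator

import httpx

from glean.indexing.connectors import BaseAsyncStreamingDataClient


@dataclass
class MyDocData:
    id: str
    title: str
    body: str


class MyAsyncDataClient(BaseAsyncStreamingDataClient[MyDocData]):
    async def get_source_data(self, **kwargs: Any) -> AsyncGenerator[MyDocData, None]:
        since = kwargs.get("since")  # forwarded by the connector's get_data_async()
        async with httpx.AsyncClient(base_url="https://example.invalid/api") as http:
            page = 1
            while True:
                params = {"page": page, "per_page": 100}
                if since:
                    params["modified_since"] = since
                resp = await http.get("/articles", params=params)
                resp.raise_for_status()
                items = resp.json()
                if not items:
                    break
                for item in items:
                    # Yield one record at a time so the connector can batch them.
                    yield MyDocData(id=item["id"], title=item["title"], body=item["body"])
                page += 1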

glean/indexing/connectors/base_async_streaming_datasource_connector.py
ADDED
@@ -0,0 +1,233 @@
+"""Base async streaming datasource connector for memory-efficient processing of large datasets."""
+
+import asyncio
+import logging
+import uuid
+from abc import ABC
+from typing import AsyncGenerator, List, Optional, Sequence
+
+from glean.indexing.common import api_client
+from glean.indexing.connectors.base_async_streaming_data_client import BaseAsyncStreamingDataClient
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.models import IndexingMode, TSourceData
+
+logger = logging.getLogger(__name__)
+
+
+class BaseAsyncStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
+    """
+    Base class for async streaming datasource connectors.
+
+    This class provides async-native streaming for memory-efficient processing
+    of large datasets. Use this when your data source provides async APIs
+    (e.g., aiohttp, httpx async, etc.).
+
+    To implement a custom async streaming connector, inherit from this class and implement:
+    - configuration: CustomDatasourceConfig (class or instance attribute)
+    - async_data_client: BaseAsyncStreamingDataClient (set in __init__)
+    - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+    Attributes:
+        name (str): The unique name of the connector (should be snake_case).
+        configuration (CustomDatasourceConfig): The datasource configuration.
+        batch_size (int): The batch size for uploads (default: 1000).
+        async_data_client (BaseAsyncStreamingDataClient): The async streaming data client.
+
+    Example:
+        class MyAsyncConnector(BaseAsyncStreamingDatasourceConnector[MyDocData]):
+            configuration = CustomDatasourceConfig(...)
+
+            def __init__(self, name: str):
+                async_client = MyAsyncDataClient()
+                super().__init__(name, async_client)
+
+            def transform(self, data: Sequence[MyDocData]) -> Sequence[DocumentDefinition]:
+                return [self._transform_doc(d) for d in data]
+    """
+
+    def __init__(
+        self,
+        name: str,
+        async_data_client: BaseAsyncStreamingDataClient[TSourceData],
+    ):
+        super().__init__(name, None)  # type: ignore[arg-type]
+        self.async_data_client = async_data_client
+        self.batch_size = 1000
+        self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
+
+    def generate_upload_id(self) -> str:
+        """Generate a unique upload ID for batch tracking."""
+        if not self._upload_id:
+            self._upload_id = str(uuid.uuid4())
+        return self._upload_id
+
+    async def get_data_async(
+        self, since: Optional[str] = None
+    ) -> AsyncGenerator[TSourceData, None]:
+        """
+        Get data from the async streaming data client.
+
+        Args:
+            since: If provided, only get data modified since this timestamp.
+
+        Yields:
+            Individual data items from the source
+        """
+        logger.info(
+            f"Fetching async streaming data from source{' since ' + since if since else ''}"
+        )
+        async for item in self.async_data_client.get_source_data(since=since):
+            yield item
+
+    async def index_data_async(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """
+        Index data from the datasource to Glean using async streaming.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload.
+        """
+        logger.info(
+            f"Starting {mode.name.lower()} async streaming indexing for datasource '{self.name}'"
+        )
+
+        since = None
+        if mode == IndexingMode.INCREMENTAL:
+            since = self._get_last_crawl_timestamp()
+            logger.info(f"Incremental crawl since: {since}")
+
+        upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
+        is_first_batch = True
+        batch: List[TSourceData] = []
+        batch_count = 0
+
+        try:
+            data_iterator = self.get_data_async(since=since).__aiter__()
+            exhausted = False
+
+            while not exhausted:
+                try:
+                    item = await data_iterator.__anext__()
+                    batch.append(item)
+
+                    if len(batch) == self.batch_size:
+                        try:
+                            next_item = await data_iterator.__anext__()
+
+                            await self._process_batch_async(
+                                batch=batch,
+                                upload_id=upload_id,
+                                is_first_batch=is_first_batch,
+                                is_last_batch=False,
+                                batch_number=batch_count,
+                            )
+
+                            batch_count += 1
+                            batch = [next_item]
+                            is_first_batch = False
+
+                        except StopAsyncIteration:
+                            exhausted = True
+
+                except StopAsyncIteration:
+                    exhausted = True
+
+            if batch:
+                await self._process_batch_async(
+                    batch=batch,
+                    upload_id=upload_id,
+                    is_first_batch=is_first_batch,
+                    is_last_batch=True,
+                    batch_number=batch_count,
+                )
+                batch_count += 1
+
+            logger.info(
+                f"Async streaming indexing completed successfully. Processed {batch_count} batches."
+            )
+
+        except Exception as e:
+            logger.exception(f"Error during async streaming indexing: {e}")
+            raise
+
+    async def _process_batch_async(
+        self,
+        batch: List[TSourceData],
+        upload_id: str,
+        is_first_batch: bool,
+        is_last_batch: bool,
+        batch_number: int,
+    ) -> None:
+        """
+        Process a single batch of data.
+
+        Args:
+            batch: The batch of raw data to process
+            upload_id: The upload ID for this indexing session
+            is_first_batch: Whether this is the first batch
+            is_last_batch: Whether this is the last batch
+            batch_number: The sequence number of this batch
+        """
+        logger.info(f"Processing batch {batch_number} with {len(batch)} items")
+
+        try:
+            transformed_batch = self.transform(batch)
+            logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
+
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
+            with api_client() as client:
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
+
+            logger.info(f"Batch {batch_number} indexed successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to process batch {batch_number}: {e}")
+            raise
+
+    def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+        """
+        Sync fallback - collects all data into memory.
+
+        Warning: This defeats the purpose of streaming. Use get_data_async() instead.
+        """
+
+        async def collect() -> List[TSourceData]:
+            result: List[TSourceData] = []
+            async for item in self.get_data_async(since=since):
+                result.append(item)
+            return result
+
+        logger.warning(
+            "Sync get_data() called on async connector - using asyncio.run(). "
+            "Consider using get_data_async() for better performance."
+        )
+        return asyncio.run(collect())
+
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """
+        Sync fallback for index_data.
+
+        Warning: This blocks the current thread. Use index_data_async() instead.
+        """
+        logger.warning(
+            "Sync index_data() called on async connector - using asyncio.run(). "
+            "Consider using index_data_async() for better performance."
+        )
+        asyncio.run(self.index_data_async(mode=mode, force_restart=force_restart))
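
To show how the pieces fit together, a hedged end-to-end sketch following the docstring example above. It reuses the hypothetical MyDocData and MyAsyncDataClient from the earlier sketch; the CustomDatasourceConfig arguments and DocumentDefinition fields shown are assumptions about typical usage, not values taken from this diff.

# Illustrative sketch only: connector name, config fields, and document
# field mapping are assumptions; only the class/method signatures come
# from the diff above.
import asyncio
from typing import Sequence

from glean.indexing.connectors import BaseAsyncStreamingDatasourceConnector
from glean.indexing.models import CustomDatasourceConfig, DocumentDefinition, IndexingMode


class MyAsyncConnector(BaseAsyncStreamingDatasourceConnector[MyDocData]):
    # Hypothetical config fields; consult CustomDatasourceConfig for the real schema.
    configuration = CustomDatasourceConfig(
        name="my_async_source",
        display_name="My Async Source",
        url_regex=r"https://example\.invalid/.*",
    )

    def __init__(self, name: str):
        # MyAsyncDataClient is the hypothetical client from the sketch above.
        super().__init__(name, MyAsyncDataClient())

    def transform(self, data: Sequence[MyDocData]) -> Sequence[DocumentDefinition]:
        # Map each source record onto a DocumentDefinition; field names here
        # are assumed for illustration.
        return [
            DocumentDefinition(
                id=doc.id,
                title=doc.title,
                datasource=self.name,
            )
            for doc in data
        ]


async def main() -> None:
    connector = MyAsyncConnector("my_async_source")
    # Streams items in batches of batch_size and bulk-indexes each batch.
    await connector.index_data_async(mode=IndexingMode.FULL)


if __name__ == "__main__":
    asyncio.run(main())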

glean/indexing/connectors/base_datasource_connector.py
CHANGED
@@ -6,9 +6,10 @@ from abc import ABC
 from typing import Optional, Sequence
 
 from glean.api_client.models import DocumentDefinition
+
 from glean.indexing.common import BatchProcessor, api_client
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.models import (
     CustomDatasourceConfig,
     DatasourceIdentityDefinitions,
@@ -36,7 +37,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseDataClient): The data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Example:
@@ -47,7 +48,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
 
     configuration: CustomDatasourceConfig
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
         """
         Initialize the datasource connector.
 

glean/indexing/connectors/base_people_connector.py
CHANGED
@@ -6,9 +6,10 @@ from abc import ABC
 from typing import Optional, Sequence
 
 from glean.api_client.models import EmployeeInfoDefinition
+
 from glean.indexing.common import BatchProcessor, api_client
 from glean.indexing.connectors.base_connector import BaseConnector
-from glean.indexing.connectors.base_data_client import
+from glean.indexing.connectors.base_data_client import BaseDataClient
 from glean.indexing.models import IndexingMode, TSourceData
 from glean.indexing.observability.observability import ConnectorObservability
 
@@ -31,7 +32,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The people source configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseDataClient): The data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Example:
@@ -40,7 +41,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         ...
     """
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseDataClient[TSourceData]):
         """
         Initialize the people connector.
 

glean/indexing/connectors/base_streaming_datasource_connector.py
CHANGED
@@ -6,7 +6,8 @@ from abc import ABC
 from typing import Generator, List, Optional, Sequence
 
 from glean.indexing.common import api_client
-from glean.indexing.connectors import BaseDatasourceConnector
+from glean.indexing.connectors.base_datasource_connector import BaseDatasourceConnector
+from glean.indexing.connectors.base_streaming_data_client import BaseStreamingDataClient
 from glean.indexing.models import IndexingMode, TSourceData
 
 logger = logging.getLogger(__name__)
@@ -28,7 +29,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         name (str): The unique name of the connector (should be snake_case).
         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
         batch_size (int): The batch size for uploads (default: 1000).
-        data_client (
+        data_client (BaseStreamingDataClient): The streaming data client for fetching source data.
         observability (ConnectorObservability): Observability and metrics for this connector.
 
     Notes:
@@ -41,7 +42,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         ...
     """
 
-    def __init__(self, name: str, data_client:
+    def __init__(self, name: str, data_client: BaseStreamingDataClient[TSourceData]):
         # Note: We pass the streaming client as-is since it's a specialized version
         # The type checker may warn about this, but it's intentional for streaming
         super().__init__(name, data_client)  # type: ignore[arg-type]
@@ -83,7 +84,8 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
 
         since = None
         if mode == IndexingMode.INCREMENTAL:
-            since =
+            since = self._get_last_crawl_timestamp()
+            logger.info(f"Incremental crawl since: {since}")
 
         upload_id = self.generate_upload_id()
         self._force_restart = force_restart

{glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glean-indexing-sdk
-Version: 0.
+Version: 0.3.0
 Summary: SDK for building custom Glean indexing integrations
 Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
 Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -435,6 +435,7 @@ class LargeKnowledgeBaseClient(StreamingConnectorDataClient[ArticleData]):
 from typing import List, Sequence
 
 from glean.api_client.models.userreferencedefinition import UserReferenceDefinition
+
 from glean.indexing.connectors import BaseStreamingDatasourceConnector
 from glean.indexing.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition
 

{glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-glean/indexing/__init__.py,sha256=
+glean/indexing/__init__.py,sha256=APnkKfvATYeZF1NCePp7V2OAa5mwWTf7D_aCKaYV9Gw,1629
 glean/indexing/models.py,sha256=UuaEDCx0ygvU4u0lRbSn4YXXZVo7D_pyD_whQtjORm8,1223
 glean/indexing/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
 glean/indexing/common/__init__.py,sha256=6COS3jP66xJ7VcNGI8I95tkF5zpqHy9QPVn82CB4m4I,513
@@ -8,21 +8,23 @@ glean/indexing/common/glean_client.py,sha256=tKRWK_C1Nja0gVy2FLnj9SmUbpIdOA3WKmp
 glean/indexing/common/metrics.py,sha256=SWCWCYnNOkN4cnwCxyWyEF8iHVwQ4HZqhewi2lqyS84,1771
 glean/indexing/common/mocks.py,sha256=-TbLzpZ7yUstQW58AICixiIQM2CV5_OPRXejjI_brhE,726
 glean/indexing/common/property_definition_builder.py,sha256=NZFhSqsSZlhI0Ia76sn0meYr82msBMCKMd78zMKLWAM,3724
-glean/indexing/connectors/__init__.py,sha256=
+glean/indexing/connectors/__init__.py,sha256=d9U2-elD7DewkuvY02UQJ1_khhdYVwyQCkADzg8jVjw,1147
+glean/indexing/connectors/base_async_streaming_data_client.py,sha256=JaKa1kfK1R1FKI7151g0zsbCutS7TmpZLabQi0LetA4,1419
+glean/indexing/connectors/base_async_streaming_datasource_connector.py,sha256=l6BuIbz_OGFxSZv5BsJ1uOFJLlwrf9BgJugtSXmuayE,8627
 glean/indexing/connectors/base_connector.py,sha256=m_zKbg-MMc1bjG5m2SsIarSeiPhFJKzfBQzgnlqTKF8,2640
-glean/indexing/connectors/base_data_client.py,sha256=
-glean/indexing/connectors/base_datasource_connector.py,sha256=
-glean/indexing/connectors/base_people_connector.py,sha256=
-glean/indexing/connectors/base_streaming_data_client.py,sha256=
-glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=
+glean/indexing/connectors/base_data_client.py,sha256=0_QSdcjr1VK1COnpbzJFzowDVpODIRAPHgsjMNRh4As,908
+glean/indexing/connectors/base_datasource_connector.py,sha256=8_FQcQsc5gX9g_N6nw_8jj0ppccaBtGMjID2bBq9VcU,13271
+glean/indexing/connectors/base_people_connector.py,sha256=7aD_B8mVUWKinV4kfzWVw0y3RRIbKZ-AbONywQf2Gxc,7071
+glean/indexing/connectors/base_streaming_data_client.py,sha256=0p_OPLv7eKKCER3tuvsOuvzakiQhAG-ztyKUs9bSIl0,1131
+glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=96gehVYoxrzgHLr2U-EzO9kuKMdy_GsZ56QR2m3qls8,7872
 glean/indexing/observability/__init__.py,sha256=SuWJ7pHs5WFq5vL036B3RIsJSbjDsy6SI705u83874I,455
 glean/indexing/observability/observability.py,sha256=cHlo-tbrmGie6YeWXqEUap0YE6JRtFvOKTnxWD-7yac,9222
 glean/indexing/testing/__init__.py,sha256=h9mK0QjRZD5f470ePTeg635jZNwPBAd2S7g1DQO4LuE,448
 glean/indexing/testing/connector_test_harness.py,sha256=CMQZmn0cOIrj_GdIHb3OwRN9jTaZrn3pYkHHz50rqK8,1988
 glean/indexing/testing/mock_data_source.py,sha256=ICYbbHQZe9RVTzvrlwcxp_suxm9yXgjEAGiNCU-SkS4,1325
-glean/indexing/testing/mock_glean_client.py,sha256
+glean/indexing/testing/mock_glean_client.py,sha256=-0-ppfD1DmLbmtc5T_vFOfZB_ACx2RL6MAoVUqxl_Us,2529
 glean/indexing/testing/response_validator.py,sha256=jehEtXlW0AQcOVck-_VPoDFtQM_vkHJQ10SUN1ftr1Q,1800
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
-glean_indexing_sdk-0.
+glean_indexing_sdk-0.3.0.dist-info/METADATA,sha256=lpXuoNquAdBGHTGhm1XNYzvAhYKPudKdCPBQ41q95v0,16225
+glean_indexing_sdk-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+glean_indexing_sdk-0.3.0.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
+glean_indexing_sdk-0.3.0.dist-info/RECORD,,

{glean_indexing_sdk-0.2.0.dist-info → glean_indexing_sdk-0.3.0.dist-info}/licenses/LICENSE
File without changes