glean-indexing-sdk 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
+ """Base people connector for the Glean Connector SDK."""
+
+ import logging
+ import uuid
+ from abc import ABC
+ from typing import Optional, Sequence
+
+ from glean.api_client.models import EmployeeInfoDefinition
+ from glean.indexing.common import BatchProcessor, api_client
+ from glean.indexing.connectors.base_connector import BaseConnector
+ from glean.indexing.connectors.base_data_client import BaseConnectorDataClient
+ from glean.indexing.models import IndexingMode, TSourceData
+ from glean.indexing.observability.observability import ConnectorObservability
+
+ logger = logging.getLogger(__name__)
+
+
+ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
+     """
+     Base class for all Glean people connectors.
+
+     This class provides the core logic for indexing people/identity data (users, groups, memberships) from external systems into Glean.
+     Subclasses must define a `configuration` attribute of type `CustomDatasourceConfig` describing the people source.
+
+     To implement a custom people connector, inherit from this class and implement:
+         - configuration: CustomDatasourceConfig (class or instance attribute)
+         - get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]
+         - transform(self, data: Sequence[TSourceData]) -> Sequence[EmployeeInfoDefinition]
+
+     Attributes:
+         name (str): The unique name of the connector (should be snake_case).
+         configuration (CustomDatasourceConfig): The people source configuration for Glean registration.
+         batch_size (int): The batch size for uploads (default: 1000).
+         data_client (BaseConnectorDataClient): The data client for fetching source data.
+         observability (ConnectorObservability): Observability and metrics for this connector.
+
+     Example:
+         class MyPeopleConnector(BasePeopleConnector[MyEmployeeData]):
+             configuration = CustomDatasourceConfig(...)
+             ...
+     """
+
+     def __init__(self, name: str, data_client: BaseConnectorDataClient[TSourceData]):
+         """
+         Initialize the people connector.
+
+         Args:
+             name: The name of the connector
+             data_client: The data client for fetching source data
+         """
+         super().__init__(name)
+         self.data_client = data_client
+         self._observability = ConnectorObservability(name)
+         self.batch_size = 1000
+
+     @property
+     def observability(self) -> ConnectorObservability:
+         """The observability instance for this connector."""
+         return self._observability
+
+     def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+         """Index people data to Glean.
+
+         Args:
+             mode: The indexing mode to use (FULL or INCREMENTAL).
+         """
+         self._observability.start_execution()
+
+         try:
+             logger.info(f"Starting {mode.name.lower()} people indexing for '{self.name}'")
+
+             since = None
+             if mode == IndexingMode.INCREMENTAL:
+                 since = self._get_last_crawl_timestamp()
+                 logger.info(f"Incremental crawl since: {since}")
+
+             self._observability.start_timer("data_fetch")
+             data = self.get_data(since=since)
+             self._observability.end_timer("data_fetch")
+
+             logger.info(f"Retrieved {len(data)} people from source")
+             self._observability.record_metric("people_fetched", len(data))
+
+             self._observability.start_timer("data_transform")
+             employees = self.transform(data)
+             self._observability.end_timer("data_transform")
+
+             logger.info(f"Transformed {len(employees)} employees")
+             self._observability.record_metric("employees_transformed", len(employees))
+
+             self._observability.start_timer("data_upload")
+             self._batch_index_employees(employees)
+             self._observability.end_timer("data_upload")
+
+             logger.info(f"Successfully indexed {len(employees)} employees to Glean")
+             self._observability.record_metric("employees_indexed", len(employees))
+
+         except Exception as e:
+             logger.exception(f"Error during people indexing: {e}")
+             self._observability.increment_counter("indexing_errors")
+             raise
+         finally:
+             self._observability.end_execution()
+
+     def get_data(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+         """Get data from the data client.
+
+         Args:
+             since: If provided, only get data modified since this timestamp.
+
+         Returns:
+             A sequence of source data items from the external system.
+         """
+         return self.data_client.get_source_data(since=since)
+
+     def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
+         """Index employees to Glean in batches."""
+         if not employees:
+             return
+
+         batches = list(BatchProcessor(list(employees), batch_size=self.batch_size))
+         total_batches = len(batches)
+
+         logger.info(f"Uploading {len(employees)} employees in {total_batches} batches")
+
+         upload_id = str(uuid.uuid4())
+         for i, batch in enumerate(batches):
+             try:
+                 with api_client() as client:
+                     client.indexing.people.bulk_index(
+                         employees=list(batch),
+                         upload_id=upload_id,
+                         is_first_page=(i == 0),
+                         is_last_page=(i == total_batches - 1),
+                     )
+
+                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
+                 self._observability.increment_counter("batches_uploaded")
+
+             except Exception as e:
+                 logger.error(f"Failed to upload employee batch {i + 1}/{total_batches}: {e}")
+                 self._observability.increment_counter("batch_upload_errors")
+                 raise
+
+     def _get_last_crawl_timestamp(self) -> Optional[str]:
+         """
+         Get the timestamp of the last successful crawl for incremental indexing.
+
+         Subclasses should override this to implement proper timestamp tracking.
+
+         Returns:
+             ISO timestamp string or None for full crawl
+         """
+         return None
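For orientation, here is a minimal sketch of what a subclass of the people connector above might look like, following the docstring's list of members to implement. The record shape (MyEmployeeData), the data client, the EmployeeInfoDefinition field names, the CustomDatasourceConfig kwargs, and the import path for BasePeopleConnector are assumptions for illustration, not guaranteed by this package.

from typing import Optional, Sequence, TypedDict

from glean.api_client.models import CustomDatasourceConfig, EmployeeInfoDefinition
from glean.indexing.connectors import BasePeopleConnector  # import path assumed
from glean.indexing.connectors.base_data_client import BaseConnectorDataClient


class MyEmployeeData(TypedDict):
    # Hypothetical raw record shape returned by the source HR system.
    email: str
    first_name: str
    last_name: str


class MyPeopleDataClient(BaseConnectorDataClient[MyEmployeeData]):
    def get_source_data(self, since: Optional[str] = None) -> Sequence[MyEmployeeData]:
        # Fetch from the source system here; `since` supports incremental crawls.
        return [{"email": "ada@example.com", "first_name": "Ada", "last_name": "Lovelace"}]


class MyPeopleConnector(BasePeopleConnector[MyEmployeeData]):
    configuration = CustomDatasourceConfig(name="my_people_source")  # kwargs illustrative

    def transform(self, data: Sequence[MyEmployeeData]) -> Sequence[EmployeeInfoDefinition]:
        # EmployeeInfoDefinition field names are assumed; check the model for the real schema.
        return [
            EmployeeInfoDefinition(
                email=item["email"],
                first_name=item["first_name"],
                last_name=item["last_name"],
            )
            for item in data
        ]


connector = MyPeopleConnector("my_people_source", MyPeopleDataClient())
connector.index_data()  # defaults to IndexingMode.FULL

Note that get_data is already provided by the base class (it simply delegates to the data client), so a subclass typically only has to supply transform and a configuration.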
@@ -0,0 +1,39 @@
+ """Base streaming data client interface for Glean connectors."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Generator, Generic
+
+ from glean.indexing.models import TSourceData
+
+
+ class BaseStreamingDataClient(ABC, Generic[TSourceData]):
+     """
+     Base class for streaming data clients that fetch data in chunks.
+
+     Use this for large datasets to minimize memory usage.
+     This class provides an iterable interface for data retrieval, allowing
+     for efficient processing of large datasets without loading all data into memory at once.
+
+     Type Parameters:
+         TSourceData: The type of data yielded from the external source
+     """
+
+     @abstractmethod
+     def get_source_data(self, **kwargs: Any) -> Generator[TSourceData, None, None]:
+         """
+         Retrieves source data as a generator.
+
+         This method should be implemented to return a generator
+         that yields data items one at a time or in small batches.
+
+         Args:
+             **kwargs: Additional keyword arguments for customizing data retrieval.
+
+         Returns:
+             A generator of data items.
+         """
+         pass
+
+
+ # Alias for backward compatibility during transition
+ StreamingConnectorDataClient = BaseStreamingDataClient
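As a rough illustration of this streaming contract, a concrete client could page through a source API and yield records one at a time, so the full dataset is never held in memory. The endpoint, page size, record type, and use of the requests library below are hypothetical; only the StreamingConnectorDataClient alias and the get_source_data signature come from the file above.

from typing import Any, Dict, Generator

import requests  # assumed third-party dependency, for this example only

from glean.indexing.connectors import StreamingConnectorDataClient  # alias defined above


class MyStreamingDataClient(StreamingConnectorDataClient[Dict[str, Any]]):
    def get_source_data(self, **kwargs: Any) -> Generator[Dict[str, Any], None, None]:
        page = 0
        while True:
            # Hypothetical paginated endpoint; substitute the real source API.
            response = requests.get(
                "https://api.example.com/records",
                params={"page": page, "page_size": 100, "since": kwargs.get("since")},
                timeout=30,
            )
            response.raise_for_status()
            items = response.json().get("items", [])
            if not items:
                return
            yield from items
            page += 1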
@@ -0,0 +1,184 @@
+ """Base streaming datasource connector for memory-efficient processing of large datasets."""
+
+ import logging
+ import uuid
+ from abc import ABC
+ from typing import Generator, List, Optional, Sequence
+
+ from glean.indexing.common import api_client
+ from glean.indexing.connectors import BaseDatasourceConnector, StreamingConnectorDataClient
+ from glean.indexing.models import IndexingMode, TSourceData
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
+     """
+     Base class for all Glean streaming datasource connectors.
+
+     This class provides the core logic for memory-efficient, incremental indexing of large document/content datasets from external systems into Glean.
+     Subclasses must define a `configuration` attribute of type `CustomDatasourceConfig` describing the datasource.
+
+     To implement a custom streaming connector, inherit from this class and implement:
+         - configuration: CustomDatasourceConfig (class or instance attribute)
+         - get_data(self, since: Optional[str] = None) -> Generator[TSourceData, None, None]
+         - transform(self, data: Sequence[TSourceData]) -> Sequence[DocumentDefinition]
+
+     Attributes:
+         name (str): The unique name of the connector (should be snake_case).
+         configuration (CustomDatasourceConfig): The datasource configuration for Glean registration.
+         batch_size (int): The batch size for uploads (default: 1000).
+         data_client (StreamingConnectorDataClient): The streaming data client for fetching source data.
+         observability (ConnectorObservability): Observability and metrics for this connector.
+
+     Notes:
+         - Use this class for very large datasets, paginated APIs, or memory-constrained environments.
+         - The data client should yield data incrementally (e.g., via a generator).
+
+     Example:
+         class MyStreamingConnector(BaseStreamingDatasourceConnector[MyDocData]):
+             configuration = CustomDatasourceConfig(...)
+             ...
+     """
+
+     def __init__(self, name: str, data_client: StreamingConnectorDataClient[TSourceData]):
+         # Note: We pass the streaming client as-is since it's a specialized version
+         # The type checker may warn about this, but it's intentional for streaming
+         super().__init__(name, data_client)  # type: ignore[arg-type]
+         self.batch_size = 1000
+         self._upload_id: Optional[str] = None
+
+     def generate_upload_id(self) -> str:
+         """Generate a unique upload ID for batch tracking."""
+         if not self._upload_id:
+             self._upload_id = str(uuid.uuid4())
+         return self._upload_id
+
+     def get_data(self, since: Optional[str] = None) -> Generator[TSourceData, None, None]:
+         """
+         Get data from the streaming data client.
+
+         Args:
+             since: If provided, only get data modified since this timestamp.
+
+         Yields:
+             Individual data items from the source
+         """
+         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
+         yield from self.data_client.get_source_data(since=since)
+
+     def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+         """
+         Index data from the datasource to Glean using streaming.
+
+         Args:
+             mode: The indexing mode to use (FULL or INCREMENTAL).
+         """
+         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
+
+         since = None
+         if mode == IndexingMode.INCREMENTAL:
+             since = "2023-01-01T00:00:00Z"
+
+         upload_id = self.generate_upload_id()
+         data_iterator = self.get_data(since=since)
+         is_first_batch = True
+         batch: List[TSourceData] = []
+         batch_count = 0
+
+         try:
+             for item in data_iterator:
+                 batch.append(item)
+
+                 if len(batch) == self.batch_size:
+                     try:
+                         next_item = next(data_iterator)
+
+                         self._process_batch(
+                             batch=batch,
+                             upload_id=upload_id,
+                             is_first_batch=is_first_batch,
+                             is_last_batch=False,
+                             batch_number=batch_count,
+                         )
+
+                         batch_count += 1
+                         batch = [next_item]
+                         is_first_batch = False
+
+                     except StopIteration:
+                         break
+
+             if batch:
+                 self._process_batch(
+                     batch=batch,
+                     upload_id=upload_id,
+                     is_first_batch=is_first_batch,
+                     is_last_batch=True,
+                     batch_number=batch_count,
+                 )
+
+             logger.info(
+                 f"Streaming indexing completed successfully. Processed {batch_count + 1} batches."
+             )
+
+         except Exception as e:
+             logger.exception(f"Error during streaming indexing: {e}")
+             raise
+
+     def _process_batch(
+         self,
+         batch: List[TSourceData],
+         upload_id: str,
+         is_first_batch: bool,
+         is_last_batch: bool,
+         batch_number: int,
+     ) -> None:
+         """
+         Process a single batch of data.
+
+         Args:
+             batch: The batch of raw data to process
+             upload_id: The upload ID for this indexing session
+             is_first_batch: Whether this is the first batch
+             is_last_batch: Whether this is the last batch
+             batch_number: The sequence number of this batch
+         """
+         logger.info(f"Processing batch {batch_number} with {len(batch)} items")
+
+         try:
+             transformed_batch = self.transform(batch)
+             logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
+
+             with api_client() as client:
+                 client.indexing.documents.bulk_index(
+                     datasource=self.name,
+                     documents=list(transformed_batch),
+                     upload_id=upload_id,
+                     is_first_page=is_first_batch,
+                     is_last_page=is_last_batch,
+                 )
+
+             logger.info(f"Batch {batch_number} indexed successfully")
+
+         except Exception as e:
+             logger.error(f"Failed to process batch {batch_number}: {e}")
+             raise
+
+     def get_data_non_streaming(self, since: Optional[str] = None) -> Sequence[TSourceData]:
+         """
+         Get all data at once (non-streaming mode).
+
+         This method is required by the base class but shouldn't be used
+         for streaming connectors as it defeats the purpose of streaming.
+
+         Args:
+             since: If provided, only get data modified since this timestamp.
+
+         Returns:
+             A sequence of source data items from the external system.
+         """
+         logger.warning(
+             "get_data_non_streaming called on streaming connector - this may cause memory issues"
+         )
+         return list(self.get_data(since=since))
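Putting the streaming pieces together: a subclass typically supplies only a configuration and a transform, since get_data and the look-ahead batching in index_data above are inherited. In the sketch below, the record shape, the DocumentDefinition/ContentDefinition field names, the CustomDatasourceConfig kwargs, and the import path are assumptions; the data client is the hypothetical MyStreamingDataClient sketched after the previous file.

from typing import Any, Dict, Sequence

from glean.api_client.models import ContentDefinition, CustomDatasourceConfig, DocumentDefinition
from glean.indexing.connectors import BaseStreamingDatasourceConnector  # import path assumed
from glean.indexing.models import IndexingMode


class MyStreamingConnector(BaseStreamingDatasourceConnector[Dict[str, Any]]):
    configuration = CustomDatasourceConfig(name="my_docs_source")  # kwargs illustrative

    def transform(self, data: Sequence[Dict[str, Any]]) -> Sequence[DocumentDefinition]:
        # Field names on DocumentDefinition/ContentDefinition are assumed; verify against the models.
        return [
            DocumentDefinition(
                id=item["id"],
                title=item.get("title", ""),
                view_url=item.get("url", ""),
                body=ContentDefinition(mime_type="text/plain", text_content=item.get("text", "")),
            )
            for item in data
        ]


connector = MyStreamingConnector("my_docs_source", MyStreamingDataClient())
connector.index_data(IndexingMode.FULL)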
@@ -0,0 +1,45 @@
+ from enum import Enum
+ from typing import Any, Sequence, TypedDict, TypeVar
+
+ from glean.api_client.models import (
+     ContentDefinition,
+     CustomDatasourceConfig,
+     DocumentDefinition,
+     EmployeeInfoDefinition,
+     UserReferenceDefinition,
+ )
+
+
+ class IndexingMode(str, Enum):
+     """Specifies the indexing strategy for a datasource: full or incremental."""
+
+     FULL = "full"
+     INCREMENTAL = "incremental"
+
+
+ TSourceData = TypeVar("TSourceData")
+ """Type variable for the raw source data type used in indexing pipelines."""
+
+ TIndexableEntityDefinition = TypeVar("TIndexableEntityDefinition")
+ """Type variable for the Glean API entity definition produced by the connector (e.g., DocumentDefinition, EmployeeInfoDefinition)."""
+
+
+ class DatasourceIdentityDefinitions(TypedDict, total=False):
+     """Defines user, group, and membership identity data for a datasource."""
+
+     users: Sequence[Any]
+     groups: Sequence[Any]
+     memberships: Sequence[Any]
+
+
+ __all__ = [
+     "CustomDatasourceConfig",
+     "DocumentDefinition",
+     "EmployeeInfoDefinition",
+     "ContentDefinition",
+     "UserReferenceDefinition",
+     "IndexingMode",
+     "DatasourceIdentityDefinitions",
+     "TSourceData",
+     "TIndexableEntityDefinition",
+ ]
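A quick usage note on the helpers above: IndexingMode is a str-valued Enum, and DatasourceIdentityDefinitions is declared with total=False, so any subset of its keys may be supplied. A brief sketch, with placeholder user/group payloads since their concrete types depend on the datasource:

from glean.indexing.models import DatasourceIdentityDefinitions, IndexingMode

identities: DatasourceIdentityDefinitions = {
    "users": [{"email": "ada@example.com"}],  # placeholder payloads
    "groups": [{"name": "engineering"}],
    # "memberships" omitted: allowed because the TypedDict is total=False
}

assert IndexingMode.FULL == "full"  # str-valued Enum compares equal to its value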
@@ -0,0 +1,19 @@
+ """Observability and monitoring tools for Glean indexing."""
+
+ from glean.indexing.observability.observability import (
+     ConnectorObservability,
+     with_observability,
+     track_crawl_progress,
+     PerformanceTracker,
+     ProgressCallback,
+     setup_connector_logging,
+ )
+
+ __all__ = [
+     "ConnectorObservability",
+     "with_observability",
+     "track_crawl_progress",
+     "PerformanceTracker",
+     "ProgressCallback",
+     "setup_connector_logging",
+ ]
+ ]