glean-indexing-sdk 0.0.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.cz.toml +1 -1
- glean_indexing_sdk-0.2.0/CHANGELOG.md +10 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/PKG-INFO +13 -1
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/README.md +12 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/pyproject.toml +1 -1
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/__init__.py +1 -1
- glean_indexing_sdk-0.2.0/src/glean/indexing/common/property_definition_builder.py +115 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_connector.py +9 -2
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_datasource_connector.py +29 -11
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_people_connector.py +28 -10
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_datasource_connector.py +20 -8
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/taskfile.yml +2 -0
- glean_indexing_sdk-0.2.0/tests/unit_tests/common/test_property_definition_builder.py +246 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_datasource_connector.py +69 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_people_connector.py +1 -1
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_streaming_datasource_connector.py +54 -2
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_custom_connector_integration.py +1 -1
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/uv.lock +645 -645
- glean_indexing_sdk-0.0.3/CHANGELOG.md +0 -5
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.env.template +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.github/CODEOWNERS +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.github/workflows/ci.yml +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.github/workflows/publish.yml +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.gitignore +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.markdown-coderc.json +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.python-version +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.ruff.toml +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/.vscode/settings.json +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/CONTRIBUTING.md +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/LICENSE +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/RELEASE.md +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/env.template +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/mise.toml +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/complete.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/run_connector.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_connector.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_data_client.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_page_data.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_connector.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data_client.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/snippets/streaming/run_connector.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/batch_processor.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/content_formatter.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/glean_client.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/metrics.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/mocks.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_data_client.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_data_client.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/models.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/observability.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/py.typed +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/connector_test_harness.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_data_source.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_glean_client.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/response_validator.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/integration_tests/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3/tests/unit_tests/utils → glean_indexing_sdk-0.2.0/tests/unit_tests/common}/__init__.py +0 -0
- {glean_indexing_sdk-0.0.3/tests/unit_tests/utils → glean_indexing_sdk-0.2.0/tests/unit_tests/common}/mock_clients.py +0 -0
- {glean_indexing_sdk-0.0.3/tests/unit_tests/utils → glean_indexing_sdk-0.2.0/tests/unit_tests/common}/test_batch_processor.py +0 -0
- {glean_indexing_sdk-0.0.3/tests/unit_tests/utils → glean_indexing_sdk-0.2.0/tests/unit_tests/common}/test_content_formatter.py +0 -0
- {glean_indexing_sdk-0.0.3/tests/unit_tests/utils → glean_indexing_sdk-0.2.0/tests/unit_tests/common}/test_metrics.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_connector.py +0 -0
- {glean_indexing_sdk-0.0.3 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_data_client.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: glean-indexing-sdk
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: SDK for building custom Glean indexing integrations
|
|
5
5
|
Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
|
|
6
6
|
Author-email: Steve Calvert <steve.calvert@glean.com>
|
|
@@ -232,6 +232,18 @@ connector.configure_datasource()
|
|
|
232
232
|
connector.index_data(mode=IndexingMode.FULL)
|
|
233
233
|
```
|
|
234
234
|
|
|
235
|
+
**When to use forced restarts:**
|
|
236
|
+
- When you need to abort and restart a failed or interrupted upload
|
|
237
|
+
- When you want to ensure a clean upload state by discarding partial uploads
|
|
238
|
+
- When recovering from upload errors or inconsistent states
|
|
239
|
+
|
|
240
|
+
**How it works:**
|
|
241
|
+
- Generates a new `upload_id` to ensure clean separation from previous uploads
|
|
242
|
+
- Sets `forceRestartUpload=True` on the **first batch only**
|
|
243
|
+
- Continues with normal batch processing for subsequent batches
|
|
244
|
+
|
|
245
|
+
This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
|
|
246
|
+
|
|
235
247
|
### Complete Example
|
|
236
248
|
|
|
237
249
|
```python snippet=non_streaming/complete.py
|
|
@@ -202,6 +202,18 @@ connector.configure_datasource()
|
|
|
202
202
|
connector.index_data(mode=IndexingMode.FULL)
|
|
203
203
|
```
|
|
204
204
|
|
|
205
|
+
**When to use forced restarts:**
|
|
206
|
+
- When you need to abort and restart a failed or interrupted upload
|
|
207
|
+
- When you want to ensure a clean upload state by discarding partial uploads
|
|
208
|
+
- When recovering from upload errors or inconsistent states
|
|
209
|
+
|
|
210
|
+
**How it works:**
|
|
211
|
+
- Generates a new `upload_id` to ensure clean separation from previous uploads
|
|
212
|
+
- Sets `forceRestartUpload=True` on the **first batch only**
|
|
213
|
+
- Continues with normal batch processing for subsequent batches
|
|
214
|
+
|
|
215
|
+
This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
|
|
216
|
+
|
|
205
217
|
### Complete Example
|
|
206
218
|
|
|
207
219
|
```python snippet=non_streaming/complete.py
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "glean-indexing-sdk"
|
|
7
|
-
version = "0.0.3"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "SDK for building custom Glean indexing integrations"
|
|
9
9
|
authors = [{ name = "Steve Calvert", email = "steve.calvert@glean.com" }]
|
|
10
10
|
readme = "README.md"
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from glean.api_client.models.propertydefinition import PropertyDefinition, PropertyType, UIOptions
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PropertyDefinitionBuilder:
|
|
7
|
+
"""
|
|
8
|
+
Builder class for creating PropertyDefinition objects with a fluent interface.
|
|
9
|
+
|
|
10
|
+
This class provides a convenient way to build multiple PropertyDefinition objects
|
|
11
|
+
with proper validation and type safety.
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
builder = PropertyDefinitionBuilder()
|
|
15
|
+
properties = (builder
|
|
16
|
+
.add_property("title", "Title", property_type=PropertyType.TEXT)
|
|
17
|
+
.add_property("author", "Author", display_label_plural="Authors")
|
|
18
|
+
.build())
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self) -> None:
|
|
22
|
+
self.properties: List[PropertyDefinition] = []
|
|
23
|
+
|
|
24
|
+
def add_property(
|
|
25
|
+
self,
|
|
26
|
+
name: str,
|
|
27
|
+
display_label: str,
|
|
28
|
+
display_label_plural: Optional[str] = None,
|
|
29
|
+
property_type: PropertyType = PropertyType.TEXT,
|
|
30
|
+
ui_options: UIOptions = UIOptions.SEARCH_RESULT,
|
|
31
|
+
hide_ui_facet: bool = False,
|
|
32
|
+
ui_facet_order: Optional[int] = None,
|
|
33
|
+
group: Optional[str] = None,
|
|
34
|
+
) -> "PropertyDefinitionBuilder":
|
|
35
|
+
"""
|
|
36
|
+
Add a property definition to the builder.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
name: The property name (must not be empty)
|
|
40
|
+
display_label: The display label for the property
|
|
41
|
+
display_label_plural: Optional plural form of the display label
|
|
42
|
+
property_type: The type of property (defaults to TEXT)
|
|
43
|
+
ui_options: UI options for the property (defaults to SEARCH_RESULT)
|
|
44
|
+
hide_ui_facet: Whether to hide the UI facet
|
|
45
|
+
ui_facet_order: Optional order for UI facet display
|
|
46
|
+
group: Optional group name for the property
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
Self for method chaining
|
|
50
|
+
|
|
51
|
+
Raises:
|
|
52
|
+
ValueError: If name or display_label is empty
|
|
53
|
+
"""
|
|
54
|
+
if not name or not name.strip():
|
|
55
|
+
raise ValueError("Property name cannot be empty")
|
|
56
|
+
if not display_label or not display_label.strip():
|
|
57
|
+
raise ValueError("Display label cannot be empty")
|
|
58
|
+
|
|
59
|
+
base_params = {
|
|
60
|
+
"name": name.strip(),
|
|
61
|
+
"display_label": display_label.strip(),
|
|
62
|
+
"property_type": property_type.value,
|
|
63
|
+
"ui_options": ui_options.value,
|
|
64
|
+
"hide_ui_facet": hide_ui_facet,
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
optional_params = {
|
|
68
|
+
k: v
|
|
69
|
+
for k, v in {
|
|
70
|
+
"display_label_plural": display_label_plural.strip()
|
|
71
|
+
if display_label_plural
|
|
72
|
+
else None,
|
|
73
|
+
"ui_facet_order": ui_facet_order,
|
|
74
|
+
"group": group.strip() if group else None,
|
|
75
|
+
}.items()
|
|
76
|
+
if v is not None
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
params = {**base_params, **optional_params}
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
prop = PropertyDefinition(**params)
|
|
83
|
+
self.properties.append(prop)
|
|
84
|
+
except Exception as e:
|
|
85
|
+
raise ValueError(f"Failed to create PropertyDefinition: {e}") from e
|
|
86
|
+
|
|
87
|
+
return self
|
|
88
|
+
|
|
89
|
+
def clear(self) -> "PropertyDefinitionBuilder":
|
|
90
|
+
"""
|
|
91
|
+
Clear all properties from the builder.
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
Self for method chaining
|
|
95
|
+
"""
|
|
96
|
+
self.properties.clear()
|
|
97
|
+
return self
|
|
98
|
+
|
|
99
|
+
def count(self) -> int:
|
|
100
|
+
"""
|
|
101
|
+
Get the number of properties currently in the builder.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
Number of properties
|
|
105
|
+
"""
|
|
106
|
+
return len(self.properties)
|
|
107
|
+
|
|
108
|
+
def build(self) -> List[PropertyDefinition]:
|
|
109
|
+
"""
|
|
110
|
+
Build and return the list of PropertyDefinition objects.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
List of PropertyDefinition objects
|
|
114
|
+
"""
|
|
115
|
+
return self.properties.copy()
|
|
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
|
|
|
55
55
|
pass
|
|
56
56
|
|
|
57
57
|
@abstractmethod
|
|
58
|
-
def index_data(
|
|
59
|
-
|
|
58
|
+
def index_data(
|
|
59
|
+
self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
|
|
60
|
+
) -> None:
|
|
61
|
+
"""Index data from the connector to Glean.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
mode: The indexing mode to use (FULL or INCREMENTAL).
|
|
65
|
+
force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
|
|
66
|
+
"""
|
|
60
67
|
pass
|
|
@@ -114,12 +114,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
|
|
|
114
114
|
client.indexing.datasources.add(**config.dict(exclude_unset=True))
|
|
115
115
|
logger.info(f"Successfully configured datasource: {config.name}")
|
|
116
116
|
|
|
117
|
-
def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
|
|
117
|
+
def index_data(
|
|
118
|
+
self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
|
|
119
|
+
) -> None:
|
|
118
120
|
"""
|
|
119
121
|
Index data from the datasource to Glean with identity crawl followed by content crawl.
|
|
120
122
|
|
|
121
123
|
Args:
|
|
122
124
|
mode: The indexing mode to use (FULL or INCREMENTAL).
|
|
125
|
+
force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
|
|
126
|
+
This sets forceRestartUpload=True on the first batch and generates a new upload ID.
|
|
123
127
|
"""
|
|
124
128
|
self._observability.start_execution()
|
|
125
129
|
|
|
@@ -169,7 +173,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
|
|
|
169
173
|
self._observability.start_timer("data_upload")
|
|
170
174
|
if documents:
|
|
171
175
|
logger.info(f"Indexing {len(documents)} documents")
|
|
172
|
-
self._batch_index_documents(documents)
|
|
176
|
+
self._batch_index_documents(documents, force_restart=force_restart)
|
|
173
177
|
self._observability.end_timer("data_upload")
|
|
174
178
|
|
|
175
179
|
logger.info(f"Successfully indexed {len(documents)} documents to Glean")
|
|
@@ -272,8 +276,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
|
|
|
272
276
|
self._observability.increment_counter("batch_upload_errors")
|
|
273
277
|
raise
|
|
274
278
|
|
|
275
|
-
def _batch_index_documents(
|
|
276
|
-
|
|
279
|
+
def _batch_index_documents(
|
|
280
|
+
self, documents: Sequence[DocumentDefinition], force_restart: bool = False
|
|
281
|
+
) -> None:
|
|
282
|
+
"""Index documents in batches with proper page signaling.
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
documents: The documents to index
|
|
286
|
+
force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
|
|
287
|
+
"""
|
|
277
288
|
if not documents:
|
|
278
289
|
return
|
|
279
290
|
|
|
@@ -285,14 +296,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
|
|
|
285
296
|
upload_id = str(uuid.uuid4())
|
|
286
297
|
for i, batch in enumerate(batches):
|
|
287
298
|
try:
|
|
299
|
+
is_first_page = i == 0
|
|
300
|
+
bulk_index_kwargs = {
|
|
301
|
+
"datasource": self.name,
|
|
302
|
+
"documents": list(batch),
|
|
303
|
+
"upload_id": upload_id,
|
|
304
|
+
"is_first_page": is_first_page,
|
|
305
|
+
"is_last_page": (i == total_batches - 1),
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
if force_restart and is_first_page:
|
|
309
|
+
bulk_index_kwargs["forceRestartUpload"] = True
|
|
310
|
+
logger.info("Force restarting upload - discarding any previous upload progress")
|
|
311
|
+
|
|
288
312
|
with api_client() as client:
|
|
289
|
-
client.indexing.documents.bulk_index(
|
|
290
|
-
datasource=self.name,
|
|
291
|
-
documents=list(batch),
|
|
292
|
-
upload_id=upload_id,
|
|
293
|
-
is_first_page=(i == 0),
|
|
294
|
-
is_last_page=(i == total_batches - 1),
|
|
295
|
-
)
|
|
313
|
+
client.indexing.documents.bulk_index(**bulk_index_kwargs)
|
|
296
314
|
|
|
297
315
|
logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
|
|
298
316
|
self._observability.increment_counter("batches_uploaded")
|
|
@@ -58,11 +58,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
|
|
|
58
58
|
"""The observability instance for this connector."""
|
|
59
59
|
return self._observability
|
|
60
60
|
|
|
61
|
-
def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
|
|
61
|
+
def index_data(
|
|
62
|
+
self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
|
|
63
|
+
) -> None:
|
|
62
64
|
"""Index people data to Glean.
|
|
63
65
|
|
|
64
66
|
Args:
|
|
65
67
|
mode: The indexing mode to use (FULL or INCREMENTAL).
|
|
68
|
+
force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
|
|
69
|
+
This sets forceRestartUpload=True on the first batch and generates a new upload ID.
|
|
66
70
|
"""
|
|
67
71
|
self._observability.start_execution()
|
|
68
72
|
|
|
@@ -89,7 +93,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
|
|
|
89
93
|
self._observability.record_metric("employees_transformed", len(employees))
|
|
90
94
|
|
|
91
95
|
self._observability.start_timer("data_upload")
|
|
92
|
-
self._batch_index_employees(employees)
|
|
96
|
+
self._batch_index_employees(employees, force_restart=force_restart)
|
|
93
97
|
self._observability.end_timer("data_upload")
|
|
94
98
|
|
|
95
99
|
logger.info(f"Successfully indexed {len(employees)} employees to Glean")
|
|
@@ -113,8 +117,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
|
|
|
113
117
|
"""
|
|
114
118
|
return self.data_client.get_source_data(since=since)
|
|
115
119
|
|
|
116
|
-
def _batch_index_employees(
|
|
117
|
-
|
|
120
|
+
def _batch_index_employees(
|
|
121
|
+
self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
|
|
122
|
+
) -> None:
|
|
123
|
+
"""Index employees to Glean in batches.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
employees: The employees to index
|
|
127
|
+
force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
|
|
128
|
+
"""
|
|
118
129
|
if not employees:
|
|
119
130
|
return
|
|
120
131
|
|
|
@@ -126,13 +137,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
|
|
|
126
137
|
upload_id = str(uuid.uuid4())
|
|
127
138
|
for i, batch in enumerate(batches):
|
|
128
139
|
try:
|
|
140
|
+
is_first_page = i == 0
|
|
141
|
+
bulk_index_kwargs = {
|
|
142
|
+
"employees": list(batch),
|
|
143
|
+
"upload_id": upload_id,
|
|
144
|
+
"is_first_page": is_first_page,
|
|
145
|
+
"is_last_page": (i == total_batches - 1),
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
if force_restart and is_first_page:
|
|
149
|
+
bulk_index_kwargs["forceRestartUpload"] = True
|
|
150
|
+
logger.info("Force restarting upload - discarding any previous upload progress")
|
|
151
|
+
|
|
129
152
|
with api_client() as client:
|
|
130
|
-
client.indexing.people.bulk_index(
|
|
131
|
-
employees=list(batch),
|
|
132
|
-
upload_id=upload_id,
|
|
133
|
-
is_first_page=(i == 0),
|
|
134
|
-
is_last_page=(i == total_batches - 1),
|
|
135
|
-
)
|
|
153
|
+
client.indexing.people.bulk_index(**bulk_index_kwargs)
|
|
136
154
|
|
|
137
155
|
logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
|
|
138
156
|
self._observability.increment_counter("batches_uploaded")
|
|
@@ -47,6 +47,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
|
|
|
47
47
|
super().__init__(name, data_client) # type: ignore[arg-type]
|
|
48
48
|
self.batch_size = 1000
|
|
49
49
|
self._upload_id: Optional[str] = None
|
|
50
|
+
self._force_restart: bool = False
|
|
50
51
|
|
|
51
52
|
def generate_upload_id(self) -> str:
|
|
52
53
|
"""Generate a unique upload ID for batch tracking."""
|
|
@@ -67,12 +68,16 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
|
|
|
67
68
|
logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
|
|
68
69
|
yield from self.data_client.get_source_data(since=since)
|
|
69
70
|
|
|
70
|
-
def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
|
|
71
|
+
def index_data(
|
|
72
|
+
self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
|
|
73
|
+
) -> None:
|
|
71
74
|
"""
|
|
72
75
|
Index data from the datasource to Glean using streaming.
|
|
73
76
|
|
|
74
77
|
Args:
|
|
75
78
|
mode: The indexing mode to use (FULL or INCREMENTAL).
|
|
79
|
+
force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
|
|
80
|
+
This sets forceRestartUpload=True on the first batch and generates a new upload ID.
|
|
76
81
|
"""
|
|
77
82
|
logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
|
|
78
83
|
|
|
@@ -81,6 +86,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
|
|
|
81
86
|
since = "2023-01-01T00:00:00Z"
|
|
82
87
|
|
|
83
88
|
upload_id = self.generate_upload_id()
|
|
89
|
+
self._force_restart = force_restart
|
|
84
90
|
data_iterator = self.get_data(since=since)
|
|
85
91
|
is_first_batch = True
|
|
86
92
|
batch: List[TSourceData] = []
|
|
@@ -150,14 +156,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
|
|
|
150
156
|
transformed_batch = self.transform(batch)
|
|
151
157
|
logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
|
|
152
158
|
|
|
159
|
+
bulk_index_kwargs = {
|
|
160
|
+
"datasource": self.name,
|
|
161
|
+
"documents": list(transformed_batch),
|
|
162
|
+
"upload_id": upload_id,
|
|
163
|
+
"is_first_page": is_first_batch,
|
|
164
|
+
"is_last_page": is_last_batch,
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
if self._force_restart and is_first_batch:
|
|
168
|
+
bulk_index_kwargs["forceRestartUpload"] = True
|
|
169
|
+
logger.info("Force restarting upload - discarding any previous upload progress")
|
|
170
|
+
|
|
153
171
|
with api_client() as client:
|
|
154
|
-
client.indexing.documents.bulk_index(
|
|
155
|
-
datasource=self.name,
|
|
156
|
-
documents=list(transformed_batch),
|
|
157
|
-
upload_id=upload_id,
|
|
158
|
-
is_first_page=is_first_batch,
|
|
159
|
-
is_last_page=is_last_batch,
|
|
160
|
-
)
|
|
172
|
+
client.indexing.documents.bulk_index(**bulk_index_kwargs)
|
|
161
173
|
|
|
162
174
|
logger.info(f"Batch {batch_number} indexed successfully")
|
|
163
175
|
|
|
@@ -178,11 +178,13 @@ tasks:
|
|
|
178
178
|
{{.PYRIGHT}} {{.PYTHON_FILES}}
|
|
179
179
|
fi
|
|
180
180
|
|
|
181
|
+
# Lint Readme task: Lint the README.md file
|
|
181
182
|
lint:readme:
|
|
182
183
|
desc: Lint the README.md file
|
|
183
184
|
cmds:
|
|
184
185
|
- npx -y markdown-code check
|
|
185
186
|
|
|
187
|
+
# Lint Readme fix task: Fix the README.md file
|
|
186
188
|
lint:readme:fix:
|
|
187
189
|
desc: Fix the README.md file
|
|
188
190
|
cmds:
|