glean-indexing-sdk 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.cz.toml +1 -1
- glean_indexing_sdk-0.2.0/CHANGELOG.md +10 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/PKG-INFO +13 -1
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/README.md +12 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/pyproject.toml +1 -1
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/__init__.py +1 -1
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_connector.py +9 -2
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_datasource_connector.py +29 -11
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_people_connector.py +28 -10
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_datasource_connector.py +20 -8
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_datasource_connector.py +69 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_streaming_datasource_connector.py +54 -2
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/uv.lock +645 -645
- glean_indexing_sdk-0.1.0/CHANGELOG.md +0 -5
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.env.template +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/CODEOWNERS +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/workflows/ci.yml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/workflows/publish.yml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.gitignore +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.markdown-coderc.json +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.python-version +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.ruff.toml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.vscode/settings.json +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/CONTRIBUTING.md +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/LICENSE +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/RELEASE.md +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/env.template +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/mise.toml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/complete.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/run_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_page_data.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/run_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/batch_processor.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/content_formatter.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/glean_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/metrics.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/mocks.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/property_definition_builder.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/models.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/observability.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/py.typed +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/connector_test_harness.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_data_source.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_glean_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/response_validator.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/taskfile.yml +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/integration_tests/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/__init__.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/mock_clients.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_batch_processor.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_content_formatter.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_metrics.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_property_definition_builder.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_data_client.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_people_connector.py +0 -0
- {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_custom_connector_integration.py +0 -0
**PKG-INFO**

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glean-indexing-sdk
-Version: 0.1.0
+Version: 0.2.0
 Summary: SDK for building custom Glean indexing integrations
 Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
 Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
````
**README.md**

````diff
@@ -202,6 +202,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
````
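The documented behavior boils down to one new argument at the call site. Below is a minimal usage sketch; `WikiConnector` and `WikiDataClient` are hypothetical stand-ins modeled on the repository's `snippets/non_streaming/` examples, and the `IndexingMode` import path is an assumption based on `src/glean/indexing/models.py`:

```python
# Hedged usage sketch: WikiConnector / WikiDataClient are hypothetical
# user-defined classes; the import path below is an assumption.
from glean.indexing.models import IndexingMode

connector = WikiConnector(name="wiki", data_client=WikiDataClient())
connector.configure_datasource()

# Normal crawl, as in the existing README example.
connector.index_data(mode=IndexingMode.FULL)

# Recovering from a failed or interrupted upload: per the docs above, this
# generates a new upload_id and sends forceRestartUpload=True on the first
# batch only, discarding any partial upload state.
connector.index_data(mode=IndexingMode.FULL, force_restart=True)
```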
**pyproject.toml**

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "glean-indexing-sdk"
-version = "0.1.0"
+version = "0.2.0"
 description = "SDK for building custom Glean indexing integrations"
 authors = [{ name = "Steve Calvert", email = "steve.calvert@glean.com" }]
 readme = "README.md"
```
**src/glean/indexing/connectors/base_connector.py**

```diff
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
         pass
 
     @abstractmethod
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
-        """Index data from the connector to Glean."""
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """Index data from the connector to Glean.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+        """
         pass
```
**src/glean/indexing/connectors/base_datasource_connector.py**

```diff
@@ -114,12 +114,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
         client.indexing.datasources.add(**config.dict(exclude_unset=True))
         logger.info(f"Successfully configured datasource: {config.name}")
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean with identity crawl followed by content crawl.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -169,7 +173,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
         self._observability.start_timer("data_upload")
         if documents:
             logger.info(f"Indexing {len(documents)} documents")
-            self._batch_index_documents(documents)
+            self._batch_index_documents(documents, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +276,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
             self._observability.increment_counter("batch_upload_errors")
             raise
 
-    def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
-        """Index documents in batches with proper page signaling."""
+    def _batch_index_documents(
+        self, documents: Sequence[DocumentDefinition], force_restart: bool = False
+    ) -> None:
+        """Index documents in batches with proper page signaling.
+
+        Args:
+            documents: The documents to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not documents:
             return
 
@@ -285,14 +296,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], ABC):
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "datasource": self.name,
+                    "documents": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.documents.bulk_index(
-                        datasource=self.name,
-                        documents=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```
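The essential pattern in `_batch_index_documents` is that the restart signal rides only on the first page of an otherwise unchanged paginated upload. A distilled, self-contained sketch of that kwargs-building logic (illustrative only, not the SDK's internal API; `build_batch_kwargs` is a name introduced here):

```python
import uuid
from typing import Any, Dict, List, Sequence


def build_batch_kwargs(
    batches: Sequence[Sequence[dict]], datasource: str, force_restart: bool
) -> List[Dict[str, Any]]:
    """Illustrative restatement of the batching logic in the diff above."""
    # A fresh upload_id cleanly separates this upload from any earlier attempt.
    upload_id = str(uuid.uuid4())
    calls: List[Dict[str, Any]] = []
    for i, batch in enumerate(batches):
        kwargs: Dict[str, Any] = {
            "datasource": datasource,
            "documents": list(batch),
            "upload_id": upload_id,
            "is_first_page": i == 0,
            "is_last_page": i == len(batches) - 1,
        }
        if force_restart and i == 0:
            # The restart flag is attached to the first page only; later
            # pages continue the same upload_id exactly as a normal upload.
            kwargs["forceRestartUpload"] = True
        calls.append(kwargs)
    return calls
```

Building the kwargs dict before the API call keeps the conditional flag out of a fixed argument list, which is why the diff replaces the keyword-argument call with `bulk_index(**bulk_index_kwargs)`.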
**src/glean/indexing/connectors/base_people_connector.py**

```diff
@@ -58,11 +58,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         """The observability instance for this connector."""
         return self._observability
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """Index people data to Glean.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -89,7 +93,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         self._observability.record_metric("employees_transformed", len(employees))
 
         self._observability.start_timer("data_upload")
-        self._batch_index_employees(employees)
+        self._batch_index_employees(employees, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +117,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         """
         return self.data_client.get_source_data(since=since)
 
-    def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
-        """Index employees to Glean in batches."""
+    def _batch_index_employees(
+        self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
+    ) -> None:
+        """Index employees to Glean in batches.
+
+        Args:
+            employees: The employees to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not employees:
             return
 
@@ -126,13 +137,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], ABC):
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "employees": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.people.bulk_index(
-                        employees=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.people.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")
```
**src/glean/indexing/connectors/base_streaming_datasource_connector.py**

```diff
@@ -47,6 +47,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
         super().__init__(name, data_client)  # type: ignore[arg-type]
         self.batch_size = 1000
         self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
 
     def generate_upload_id(self) -> str:
         """Generate a unique upload ID for batch tracking."""
@@ -67,12 +68,16 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
         yield from self.data_client.get_source_data(since=since)
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean using streaming.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
 
@@ -81,6 +86,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
             since = "2023-01-01T00:00:00Z"
 
         upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
         data_iterator = self.get_data(since=since)
         is_first_batch = True
         batch: List[TSourceData] = []
@@ -150,14 +156,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC):
             transformed_batch = self.transform(batch)
             logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
 
+            bulk_index_kwargs = {
+                "datasource": self.name,
+                "documents": list(transformed_batch),
+                "upload_id": upload_id,
+                "is_first_page": is_first_batch,
+                "is_last_page": is_last_batch,
+            }
+
+            if self._force_restart and is_first_batch:
+                bulk_index_kwargs["forceRestartUpload"] = True
+                logger.info("Force restarting upload - discarding any previous upload progress")
+
             with api_client() as client:
-                client.indexing.documents.bulk_index(
-                    datasource=self.name,
-                    documents=list(transformed_batch),
-                    upload_id=upload_id,
-                    is_first_page=is_first_batch,
-                    is_last_page=is_last_batch,
-                )
+                client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
             logger.info(f"Batch {batch_number} indexed successfully")
 
```
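Unlike the list-based connectors, the streaming connector cannot know the total batch count up front, so `index_data` stashes the flag on `self._force_restart` and the flush path consumes it when the first accumulated batch goes out. A self-contained sketch of that shape, using one-batch lookahead to decide `is_last_page` (illustrative; `stream_batch_kwargs` is a name introduced here, while the SDK's actual loop accumulates into `batch` and tracks `is_first_batch` as shown above):

```python
from itertools import islice
from typing import Any, Dict, Iterable, Iterator


def stream_batch_kwargs(
    items: Iterable[dict], batch_size: int, force_restart: bool
) -> Iterator[Dict[str, Any]]:
    """Illustrative stand-in for the streaming flush logic (not SDK API)."""
    it = iter(items)
    batch = list(islice(it, batch_size))
    is_first = True
    while batch:
        # One-batch lookahead: the stream is exhausted iff the next slice is
        # empty, which lets is_last_page be set without a total count.
        next_batch = list(islice(it, batch_size))
        kwargs: Dict[str, Any] = {
            "documents": batch,
            "is_first_page": is_first,
            "is_last_page": not next_batch,
        }
        if force_restart and is_first:
            # Consumed exactly once, on the first flushed batch.
            kwargs["forceRestartUpload"] = True
        yield kwargs
        is_first = False
        batch = next_batch
```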
**tests/unit_tests/test_base_datasource_connector.py**

```diff
@@ -128,3 +128,72 @@ class TestBaseDatasourceConnector:
 
         timestamp = connector._get_last_crawl_timestamp()
         assert timestamp is None
+
+    @patch("glean.indexing.connectors.base_datasource_connector.api_client")
+    def test_force_restart_upload(self, mock_api_client):
+        """Test that force_restart parameter sets forceRestartUpload on first batch."""
+        mock_client = Mock()
+        mock_api_client.return_value.__enter__.return_value = mock_client
+
+        test_data = [
+            {
+                "id": "1",
+                "title": "Test Doc 1",
+                "content": "Content 1",
+                "url": "https://test.example.com/1",
+            },
+            {
+                "id": "2",
+                "title": "Test Doc 2",
+                "content": "Content 2",
+                "url": "https://test.example.com/2",
+            },
+        ]
+        data_client = MockDataClient(test_data)
+        connector = TestDatasourceConnector(name="test_connector", data_client=data_client)
+        connector.batch_size = 1
+
+        connector.index_data(force_restart=True)
+
+        # Should be called twice (one batch per document)
+        assert mock_client.indexing.documents.bulk_index.call_count == 2
+
+        # First call should have forceRestartUpload=True
+        first_call_kwargs = mock_client.indexing.documents.bulk_index.call_args_list[0][1]
+        assert first_call_kwargs["forceRestartUpload"] is True
+        assert first_call_kwargs["is_first_page"] is True
+        assert first_call_kwargs["is_last_page"] is False
+
+        # Second call should NOT have forceRestartUpload
+        second_call_kwargs = mock_client.indexing.documents.bulk_index.call_args_list[1][1]
+        assert "forceRestartUpload" not in second_call_kwargs
+        assert second_call_kwargs["is_first_page"] is False
+        assert second_call_kwargs["is_last_page"] is True
+
+    @patch("glean.indexing.connectors.base_datasource_connector.api_client")
+    def test_normal_upload_no_force_restart(self, mock_api_client):
+        """Test that normal upload does not include forceRestartUpload parameter."""
+        mock_client = Mock()
+        mock_api_client.return_value.__enter__.return_value = mock_client
+
+        test_data = [
+            {
+                "id": "1",
+                "title": "Test Doc",
+                "content": "Content",
+                "url": "https://test.example.com/1",
+            }
+        ]
+        data_client = MockDataClient(test_data)
+        connector = TestDatasourceConnector(name="test_connector", data_client=data_client)
+
+        connector.index_data(force_restart=False)
+
+        # Should be called once
+        assert mock_client.indexing.documents.bulk_index.call_count == 1
+
+        # Should NOT have forceRestartUpload parameter
+        call_kwargs = mock_client.indexing.documents.bulk_index.call_args[1]
+        assert "forceRestartUpload" not in call_kwargs
+        assert call_kwargs["is_first_page"] is True
+        assert call_kwargs["is_last_page"] is True
```
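A note on the mock plumbing these tests rely on: a `Mock` records each invocation, and `call_args_list[n][1]` (or `call_args[1]` for the most recent call) is the kwargs dict of that invocation, which is what makes the `"forceRestartUpload" not in ...` assertions work. A tiny standalone demonstration:

```python
from unittest.mock import Mock

m = Mock()
m("positional", is_first_page=True, forceRestartUpload=True)
m("positional", is_first_page=False)

# Index [0] of a recorded call is the positional-args tuple, [1] the kwargs dict.
assert m.call_args_list[0][1]["forceRestartUpload"] is True
assert "forceRestartUpload" not in m.call_args_list[1][1]
```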
**tests/unit_tests/test_base_streaming_datasource_connector.py**

```diff
@@ -82,8 +82,7 @@ def test_index_data_batches_and_uploads():
 def test_index_data_empty():
     class EmptyClient(BaseStreamingDataClient[dict]):
         def get_source_data(self, **kwargs):
-            return
-            yield
+            yield from []
 
     connector = DummyStreamingConnector("test_stream", EmptyClient())
     with patch(
@@ -104,3 +103,56 @@ def test_index_data_error_handling():
         bulk_index.side_effect = Exception("upload failed")
         with pytest.raises(Exception):
             connector.index_data()
+
+
+def test_force_restart_upload():
+    """Test that force_restart parameter sets forceRestartUpload on first batch."""
+    client = DummyStreamingDataClient()
+    connector = DummyStreamingConnector("test_stream", client)
+    connector.batch_size = 2
+
+    with patch(
+        "glean.indexing.connectors.base_streaming_datasource_connector.api_client"
+    ) as api_client:
+        bulk_index = api_client().__enter__().indexing.documents.bulk_index
+        connector.index_data(force_restart=True)
+
+        assert bulk_index.call_count == 3
+
+        # First call should have forceRestartUpload=True
+        first_call_kwargs = bulk_index.call_args_list[0][1]
+        assert first_call_kwargs["forceRestartUpload"] is True
+        assert first_call_kwargs["is_first_page"] is True
+        assert first_call_kwargs["is_last_page"] is False
+
+        # Subsequent calls should NOT have forceRestartUpload
+        second_call_kwargs = bulk_index.call_args_list[1][1]
+        assert "forceRestartUpload" not in second_call_kwargs
+        assert second_call_kwargs["is_first_page"] is False
+        assert second_call_kwargs["is_last_page"] is False
+
+        third_call_kwargs = bulk_index.call_args_list[2][1]
+        assert "forceRestartUpload" not in third_call_kwargs
+        assert third_call_kwargs["is_first_page"] is False
+        assert third_call_kwargs["is_last_page"] is True
+
+
+def test_normal_upload_no_force_restart():
+    """Test that normal upload does not include forceRestartUpload parameter."""
+    client = DummyStreamingDataClient()
+    connector = DummyStreamingConnector("test_stream", client)
+    connector.batch_size = 5
+
+    with patch(
+        "glean.indexing.connectors.base_streaming_datasource_connector.api_client"
+    ) as api_client:
+        bulk_index = api_client().__enter__().indexing.documents.bulk_index
+        connector.index_data(force_restart=False)
+
+        assert bulk_index.call_count == 1
+
+        # Should NOT have forceRestartUpload parameter
+        call_kwargs = bulk_index.call_args[1]
+        assert "forceRestartUpload" not in call_kwargs
+        assert call_kwargs["is_first_page"] is True
+        assert call_kwargs["is_last_page"] is True
```