glean-indexing-sdk 0.1.0__tar.gz → 0.2.0__tar.gz

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (70)
  1. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.cz.toml +1 -1
  2. glean_indexing_sdk-0.2.0/CHANGELOG.md +10 -0
  3. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/PKG-INFO +13 -1
  4. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/README.md +12 -0
  5. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/pyproject.toml +1 -1
  6. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/__init__.py +1 -1
  7. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_connector.py +9 -2
  8. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_datasource_connector.py +29 -11
  9. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_people_connector.py +28 -10
  10. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_datasource_connector.py +20 -8
  11. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_datasource_connector.py +69 -0
  12. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_streaming_datasource_connector.py +54 -2
  13. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/uv.lock +645 -645
  14. glean_indexing_sdk-0.1.0/CHANGELOG.md +0 -5
  15. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.env.template +0 -0
  16. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/CODEOWNERS +0 -0
  17. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/workflows/ci.yml +0 -0
  18. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.github/workflows/publish.yml +0 -0
  19. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.gitignore +0 -0
  20. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.markdown-coderc.json +0 -0
  21. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.python-version +0 -0
  22. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.ruff.toml +0 -0
  23. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.vscode/settings.json +0 -0
  24. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/CONTRIBUTING.md +0 -0
  25. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/LICENSE +0 -0
  26. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/RELEASE.md +0 -0
  27. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/env.template +0 -0
  28. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/mise.toml +0 -0
  29. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/complete.py +0 -0
  30. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/run_connector.py +0 -0
  31. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_connector.py +0 -0
  32. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_data_client.py +0 -0
  33. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/non_streaming/wiki_page_data.py +0 -0
  34. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_connector.py +0 -0
  35. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data.py +0 -0
  36. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/article_data_client.py +0 -0
  37. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/snippets/streaming/run_connector.py +0 -0
  38. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/__init__.py +0 -0
  39. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/batch_processor.py +0 -0
  40. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/content_formatter.py +0 -0
  41. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/glean_client.py +0 -0
  42. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/metrics.py +0 -0
  43. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/mocks.py +0 -0
  44. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/common/property_definition_builder.py +0 -0
  45. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/__init__.py +0 -0
  46. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_data_client.py +0 -0
  47. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_data_client.py +0 -0
  48. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/models.py +0 -0
  49. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/__init__.py +0 -0
  50. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/observability/observability.py +0 -0
  51. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/py.typed +0 -0
  52. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/__init__.py +0 -0
  53. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/connector_test_harness.py +0 -0
  54. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_data_source.py +0 -0
  55. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/mock_glean_client.py +0 -0
  56. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/testing/response_validator.py +0 -0
  57. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/taskfile.yml +0 -0
  58. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/__init__.py +0 -0
  59. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/integration_tests/__init__.py +0 -0
  60. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/__init__.py +0 -0
  61. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/__init__.py +0 -0
  62. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/mock_clients.py +0 -0
  63. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_batch_processor.py +0 -0
  64. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_content_formatter.py +0 -0
  65. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_metrics.py +0 -0
  66. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/common/test_property_definition_builder.py +0 -0
  67. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_connector.py +0 -0
  68. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_data_client.py +0 -0
  69. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_people_connector.py +0 -0
  70. {glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_custom_connector_integration.py +0 -0

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/.cz.toml

@@ -1,5 +1,5 @@
 [tool.commitizen]
 name = "cz_conventional_commits"
-version = "0.1.0"
+version = "0.2.0"
 tag_format = "v$version"
 version_files = ["pyproject.toml:version", "src/glean/indexing/__init__.py:__version__"]

glean_indexing_sdk-0.2.0/CHANGELOG.md

@@ -0,0 +1,10 @@
+## v0.1.0 (2025-07-23)
+
+### Feat
+
+- Adds property definition builder
+
+### Fix
+
+- Fixing format of tags for release
+- Adds addition model for re-export

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: glean-indexing-sdk
-Version: 0.1.0
+Version: 0.2.0
 Summary: SDK for building custom Glean indexing integrations
 Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
 Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/README.md

@@ -202,6 +202,18 @@ connector.configure_datasource()
 connector.index_data(mode=IndexingMode.FULL)
 ```
 
+**When to use forced restarts:**
+- When you need to abort and restart a failed or interrupted upload
+- When you want to ensure a clean upload state by discarding partial uploads
+- When recovering from upload errors or inconsistent states
+
+**How it works:**
+- Generates a new `upload_id` to ensure clean separation from previous uploads
+- Sets `forceRestartUpload=True` on the **first batch only**
+- Continues with normal batch processing for subsequent batches
+
+This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
+
 ### Complete Example
 
 ```python snippet=non_streaming/complete.py
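
The README section added above implies usage along these lines. A minimal sketch, not part of the package diff: `WikiConnector` and `wiki_client` are hypothetical stand-ins, and the `IndexingMode` import path is assumed from the SDK's snippets.

```python
# Minimal usage sketch for the new force_restart flag. WikiConnector and
# wiki_client are hypothetical; the IndexingMode import path is assumed.
from glean.indexing.models import IndexingMode

connector = WikiConnector(name="wiki", data_client=wiki_client)
connector.configure_datasource()

# Discards any partial upload state: a fresh upload_id is generated and
# forceRestartUpload=True is sent with the first batch only.
connector.index_data(mode=IndexingMode.FULL, force_restart=True)
```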

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "glean-indexing-sdk"
-version = "0.1.0"
+version = "0.2.0"
 description = "SDK for building custom Glean indexing integrations"
 authors = [{ name = "Steve Calvert", email = "steve.calvert@glean.com" }]
 readme = "README.md"

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/__init__.py

@@ -53,4 +53,4 @@ __all__ = [
 try:
     __version__ = version("glean-indexing-sdk")
 except PackageNotFoundError:
-    __version__ = "0.1.0"
+    __version__ = "0.2.0"

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_connector.py

@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
         pass
 
     @abstractmethod
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
-        """Index data from the connector to Glean."""
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
+        """Index data from the connector to Glean.
+
+        Args:
+            mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+        """
        pass
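
Because this is the abstract base signature, any subclass that overrides `index_data` must now accept the extra keyword. A minimal sketch of a conforming override; `AuditedConnector` is hypothetical, and the `BaseDatasourceConnector` and `IndexingMode` import paths are assumptions. The shipped base classes already implement this, as the hunks below show.

```python
# Hypothetical subclass showing the updated override contract; the real
# batching and upload work still happens in the base class implementation.
import logging

from glean.indexing.connectors import BaseDatasourceConnector  # assumed export
from glean.indexing.models import IndexingMode  # assumed import path

logger = logging.getLogger(__name__)


class AuditedConnector(BaseDatasourceConnector[dict]):
    def index_data(
        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
    ) -> None:
        # Log and forward the flag so forced restarts still reach the
        # base class's batch upload logic.
        logger.info("index_data called (mode=%s, force_restart=%s)", mode, force_restart)
        super().index_data(mode=mode, force_restart=force_restart)
```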

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_datasource_connector.py

@@ -114,12 +114,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
             client.indexing.datasources.add(**config.dict(exclude_unset=True))
         logger.info(f"Successfully configured datasource: {config.name}")
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean with identity crawl followed by content crawl.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -169,7 +173,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         self._observability.start_timer("data_upload")
         if documents:
             logger.info(f"Indexing {len(documents)} documents")
-            self._batch_index_documents(documents)
+            self._batch_index_documents(documents, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +276,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
             self._observability.increment_counter("batch_upload_errors")
             raise
 
-    def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
-        """Index documents in batches with proper page signaling."""
+    def _batch_index_documents(
+        self, documents: Sequence[DocumentDefinition], force_restart: bool = False
+    ) -> None:
+        """Index documents in batches with proper page signaling.
+
+        Args:
+            documents: The documents to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not documents:
             return
 
@@ -285,14 +296,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "datasource": self.name,
+                    "documents": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.documents.bulk_index(
-                        datasource=self.name,
-                        documents=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_people_connector.py

@@ -58,11 +58,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         """The observability instance for this connector."""
         return self._observability
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """Index people data to Glean.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         self._observability.start_execution()
 
@@ -89,7 +93,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         self._observability.record_metric("employees_transformed", len(employees))
 
         self._observability.start_timer("data_upload")
-        self._batch_index_employees(employees)
+        self._batch_index_employees(employees, force_restart=force_restart)
         self._observability.end_timer("data_upload")
 
         logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +117,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         """
         return self.data_client.get_source_data(since=since)
 
-    def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
-        """Index employees to Glean in batches."""
+    def _batch_index_employees(
+        self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
+    ) -> None:
+        """Index employees to Glean in batches.
+
+        Args:
+            employees: The employees to index
+            force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
+        """
         if not employees:
             return
 
@@ -126,13 +137,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
         upload_id = str(uuid.uuid4())
         for i, batch in enumerate(batches):
             try:
+                is_first_page = i == 0
+                bulk_index_kwargs = {
+                    "employees": list(batch),
+                    "upload_id": upload_id,
+                    "is_first_page": is_first_page,
+                    "is_last_page": (i == total_batches - 1),
+                }
+
+                if force_restart and is_first_page:
+                    bulk_index_kwargs["forceRestartUpload"] = True
+                    logger.info("Force restarting upload - discarding any previous upload progress")
+
                 with api_client() as client:
-                    client.indexing.people.bulk_index(
-                        employees=list(batch),
-                        upload_id=upload_id,
-                        is_first_page=(i == 0),
-                        is_last_page=(i == total_batches - 1),
-                    )
+                    client.indexing.people.bulk_index(**bulk_index_kwargs)
 
                 logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
                 self._observability.increment_counter("batches_uploaded")

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/src/glean/indexing/connectors/base_streaming_datasource_connector.py

@@ -47,6 +47,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         super().__init__(name, data_client)  # type: ignore[arg-type]
         self.batch_size = 1000
         self._upload_id: Optional[str] = None
+        self._force_restart: bool = False
 
     def generate_upload_id(self) -> str:
         """Generate a unique upload ID for batch tracking."""
@@ -67,12 +68,16 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
         yield from self.data_client.get_source_data(since=since)
 
-    def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
+    def index_data(
+        self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
+    ) -> None:
         """
         Index data from the datasource to Glean using streaming.
 
         Args:
             mode: The indexing mode to use (FULL or INCREMENTAL).
+            force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
+                This sets forceRestartUpload=True on the first batch and generates a new upload ID.
         """
         logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
 
@@ -81,6 +86,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
             since = "2023-01-01T00:00:00Z"
 
         upload_id = self.generate_upload_id()
+        self._force_restart = force_restart
        data_iterator = self.get_data(since=since)
        is_first_batch = True
        batch: List[TSourceData] = []
@@ -150,14 +156,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
         transformed_batch = self.transform(batch)
         logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
 
+        bulk_index_kwargs = {
+            "datasource": self.name,
+            "documents": list(transformed_batch),
+            "upload_id": upload_id,
+            "is_first_page": is_first_batch,
+            "is_last_page": is_last_batch,
+        }
+
+        if self._force_restart and is_first_batch:
+            bulk_index_kwargs["forceRestartUpload"] = True
+            logger.info("Force restarting upload - discarding any previous upload progress")
+
         with api_client() as client:
-            client.indexing.documents.bulk_index(
-                datasource=self.name,
-                documents=list(transformed_batch),
-                upload_id=upload_id,
-                is_first_page=is_first_batch,
-                is_last_page=is_last_batch,
-            )
+            client.indexing.documents.bulk_index(**bulk_index_kwargs)
 
         logger.info(f"Batch {batch_number} indexed successfully")
 
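
One design note on the streaming variant, visible in the hunks above: because batches are uploaded from a separate method as they stream through, `index_data` stashes the flag as `self._force_restart` rather than threading it through each call. The upload path then checks `self._force_restart and is_first_batch`, so the restart semantics match the non-streaming connectors: the flag is sent with the first batch only.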

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_datasource_connector.py

@@ -128,3 +128,72 @@ class TestBaseDatasourceConnector:
 
         timestamp = connector._get_last_crawl_timestamp()
         assert timestamp is None
+
+    @patch("glean.indexing.connectors.base_datasource_connector.api_client")
+    def test_force_restart_upload(self, mock_api_client):
+        """Test that force_restart parameter sets forceRestartUpload on first batch."""
+        mock_client = Mock()
+        mock_api_client.return_value.__enter__.return_value = mock_client
+
+        test_data = [
+            {
+                "id": "1",
+                "title": "Test Doc 1",
+                "content": "Content 1",
+                "url": "https://test.example.com/1",
+            },
+            {
+                "id": "2",
+                "title": "Test Doc 2",
+                "content": "Content 2",
+                "url": "https://test.example.com/2",
+            },
+        ]
+        data_client = MockDataClient(test_data)
+        connector = TestDatasourceConnector(name="test_connector", data_client=data_client)
+        connector.batch_size = 1
+
+        connector.index_data(force_restart=True)
+
+        # Should be called twice (one batch per document)
+        assert mock_client.indexing.documents.bulk_index.call_count == 2
+
+        # First call should have forceRestartUpload=True
+        first_call_kwargs = mock_client.indexing.documents.bulk_index.call_args_list[0][1]
+        assert first_call_kwargs["forceRestartUpload"] is True
+        assert first_call_kwargs["is_first_page"] is True
+        assert first_call_kwargs["is_last_page"] is False
+
+        # Second call should NOT have forceRestartUpload
+        second_call_kwargs = mock_client.indexing.documents.bulk_index.call_args_list[1][1]
+        assert "forceRestartUpload" not in second_call_kwargs
+        assert second_call_kwargs["is_first_page"] is False
+        assert second_call_kwargs["is_last_page"] is True
+
+    @patch("glean.indexing.connectors.base_datasource_connector.api_client")
+    def test_normal_upload_no_force_restart(self, mock_api_client):
+        """Test that normal upload does not include forceRestartUpload parameter."""
+        mock_client = Mock()
+        mock_api_client.return_value.__enter__.return_value = mock_client
+
+        test_data = [
+            {
+                "id": "1",
+                "title": "Test Doc",
+                "content": "Content",
+                "url": "https://test.example.com/1",
+            }
+        ]
+        data_client = MockDataClient(test_data)
+        connector = TestDatasourceConnector(name="test_connector", data_client=data_client)
+
+        connector.index_data(force_restart=False)
+
+        # Should be called once
+        assert mock_client.indexing.documents.bulk_index.call_count == 1
+
+        # Should NOT have forceRestartUpload parameter
+        call_kwargs = mock_client.indexing.documents.bulk_index.call_args[1]
+        assert "forceRestartUpload" not in call_kwargs
+        assert call_kwargs["is_first_page"] is True
+        assert call_kwargs["is_last_page"] is True

{glean_indexing_sdk-0.1.0 → glean_indexing_sdk-0.2.0}/tests/unit_tests/test_base_streaming_datasource_connector.py

@@ -82,8 +82,7 @@ def test_index_data_batches_and_uploads():
 def test_index_data_empty():
     class EmptyClient(BaseStreamingDataClient[dict]):
         def get_source_data(self, **kwargs):
-            if False:
-                yield
+            yield from []
 
     connector = DummyStreamingConnector("test_stream", EmptyClient())
     with patch(
@@ -104,3 +103,56 @@ def test_index_data_error_handling():
         bulk_index.side_effect = Exception("upload failed")
         with pytest.raises(Exception):
             connector.index_data()
+
+
+def test_force_restart_upload():
+    """Test that force_restart parameter sets forceRestartUpload on first batch."""
+    client = DummyStreamingDataClient()
+    connector = DummyStreamingConnector("test_stream", client)
+    connector.batch_size = 2
+
+    with patch(
+        "glean.indexing.connectors.base_streaming_datasource_connector.api_client"
+    ) as api_client:
+        bulk_index = api_client().__enter__().indexing.documents.bulk_index
+        connector.index_data(force_restart=True)
+
+        assert bulk_index.call_count == 3
+
+        # First call should have forceRestartUpload=True
+        first_call_kwargs = bulk_index.call_args_list[0][1]
+        assert first_call_kwargs["forceRestartUpload"] is True
+        assert first_call_kwargs["is_first_page"] is True
+        assert first_call_kwargs["is_last_page"] is False
+
+        # Subsequent calls should NOT have forceRestartUpload
+        second_call_kwargs = bulk_index.call_args_list[1][1]
+        assert "forceRestartUpload" not in second_call_kwargs
+        assert second_call_kwargs["is_first_page"] is False
+        assert second_call_kwargs["is_last_page"] is False
+
+        third_call_kwargs = bulk_index.call_args_list[2][1]
+        assert "forceRestartUpload" not in third_call_kwargs
+        assert third_call_kwargs["is_first_page"] is False
+        assert third_call_kwargs["is_last_page"] is True
+
+
+def test_normal_upload_no_force_restart():
+    """Test that normal upload does not include forceRestartUpload parameter."""
+    client = DummyStreamingDataClient()
+    connector = DummyStreamingConnector("test_stream", client)
+    connector.batch_size = 5
+
+    with patch(
+        "glean.indexing.connectors.base_streaming_datasource_connector.api_client"
+    ) as api_client:
+        bulk_index = api_client().__enter__().indexing.documents.bulk_index
+        connector.index_data(force_restart=False)
+
+        assert bulk_index.call_count == 1
+
+        # Should NOT have forceRestartUpload parameter
+        call_kwargs = bulk_index.call_args[1]
+        assert "forceRestartUpload" not in call_kwargs
+        assert call_kwargs["is_first_page"] is True
+        assert call_kwargs["is_last_page"] is True
+ assert call_kwargs["is_last_page"] is True