glean-indexing-sdk 0.0.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,4 +53,4 @@ __all__ = [
53
53
  try:
54
54
  __version__ = version("glean-indexing-sdk")
55
55
  except PackageNotFoundError:
56
- __version__ = "0.0.3"
56
+ __version__ = "0.2.0"
@@ -0,0 +1,115 @@
1
+ from typing import List, Optional
2
+
3
+ from glean.api_client.models.propertydefinition import PropertyDefinition, PropertyType, UIOptions
4
+
5
+
6
class PropertyDefinitionBuilder:
    """Fluent builder for assembling lists of PropertyDefinition objects.

    Collects property definitions one at a time via chained calls, validating
    inputs as they arrive, and hands back the accumulated list on build().

    Example:
        builder = PropertyDefinitionBuilder()
        properties = (builder
            .add_property("title", "Title", property_type=PropertyType.TEXT)
            .add_property("author", "Author", display_label_plural="Authors")
            .build())
    """

    def __init__(self) -> None:
        # Definitions accumulate here in the order add_property() was called.
        self.properties: List[PropertyDefinition] = []

    def add_property(
        self,
        name: str,
        display_label: str,
        display_label_plural: Optional[str] = None,
        property_type: PropertyType = PropertyType.TEXT,
        ui_options: UIOptions = UIOptions.SEARCH_RESULT,
        hide_ui_facet: bool = False,
        ui_facet_order: Optional[int] = None,
        group: Optional[str] = None,
    ) -> "PropertyDefinitionBuilder":
        """
        Append one property definition to the builder.

        Args:
            name: The property name (must not be empty)
            display_label: The display label for the property
            display_label_plural: Optional plural form of the display label
            property_type: The type of property (defaults to TEXT)
            ui_options: UI options for the property (defaults to SEARCH_RESULT)
            hide_ui_facet: Whether to hide the UI facet
            ui_facet_order: Optional order for UI facet display
            group: Optional group name for the property

        Returns:
            Self for method chaining

        Raises:
            ValueError: If name or display_label is empty
        """
        if not name or not name.strip():
            raise ValueError("Property name cannot be empty")
        if not display_label or not display_label.strip():
            raise ValueError("Display label cannot be empty")

        # Required fields are always present; enums are unwrapped to their values.
        kwargs = {
            "name": name.strip(),
            "display_label": display_label.strip(),
            "property_type": property_type.value,
            "ui_options": ui_options.value,
            "hide_ui_facet": hide_ui_facet,
        }
        # Optional fields are only forwarded when supplied, so PropertyDefinition
        # keeps its own defaults otherwise.
        if display_label_plural:
            kwargs["display_label_plural"] = display_label_plural.strip()
        if ui_facet_order is not None:
            kwargs["ui_facet_order"] = ui_facet_order
        if group:
            kwargs["group"] = group.strip()

        try:
            self.properties.append(PropertyDefinition(**kwargs))
        except Exception as exc:
            # Surface model-construction failures as a uniform ValueError.
            raise ValueError(f"Failed to create PropertyDefinition: {exc}") from exc

        return self

    def clear(self) -> "PropertyDefinitionBuilder":
        """
        Remove every property from the builder.

        Returns:
            Self for method chaining
        """
        self.properties.clear()
        return self

    def count(self) -> int:
        """
        Get the number of properties currently in the builder.

        Returns:
            Number of properties
        """
        return len(self.properties)

    def build(self) -> List[PropertyDefinition]:
        """
        Build and return the list of PropertyDefinition objects.

        Returns:
            A shallow copy of the accumulated PropertyDefinition list
        """
        return self.properties.copy()
@@ -55,6 +55,13 @@ class BaseConnector(ABC, Generic[TSourceData, TIndexableEntityDefinition]):
55
55
  pass
56
56
 
57
57
  @abstractmethod
58
- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
59
- """Index data from the connector to Glean."""
58
+ def index_data(
59
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
60
+ ) -> None:
61
+ """Index data from the connector to Glean.
62
+
63
+ Args:
64
+ mode: The indexing mode to use (FULL or INCREMENTAL).
65
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
66
+ """
60
67
  pass
@@ -114,12 +114,16 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
114
114
  client.indexing.datasources.add(**config.dict(exclude_unset=True))
115
115
  logger.info(f"Successfully configured datasource: {config.name}")
116
116
 
117
- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
117
+ def index_data(
118
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
119
+ ) -> None:
118
120
  """
119
121
  Index data from the datasource to Glean with identity crawl followed by content crawl.
120
122
 
121
123
  Args:
122
124
  mode: The indexing mode to use (FULL or INCREMENTAL).
125
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
126
+ This sets forceRestartUpload=True on the first batch and generates a new upload ID.
123
127
  """
124
128
  self._observability.start_execution()
125
129
 
@@ -169,7 +173,7 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
169
173
  self._observability.start_timer("data_upload")
170
174
  if documents:
171
175
  logger.info(f"Indexing {len(documents)} documents")
172
- self._batch_index_documents(documents)
176
+ self._batch_index_documents(documents, force_restart=force_restart)
173
177
  self._observability.end_timer("data_upload")
174
178
 
175
179
  logger.info(f"Successfully indexed {len(documents)} documents to Glean")
@@ -272,8 +276,15 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
272
276
  self._observability.increment_counter("batch_upload_errors")
273
277
  raise
274
278
 
275
- def _batch_index_documents(self, documents: Sequence[DocumentDefinition]) -> None:
276
- """Index documents in batches with proper page signaling."""
279
+ def _batch_index_documents(
280
+ self, documents: Sequence[DocumentDefinition], force_restart: bool = False
281
+ ) -> None:
282
+ """Index documents in batches with proper page signaling.
283
+
284
+ Args:
285
+ documents: The documents to index
286
+ force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
287
+ """
277
288
  if not documents:
278
289
  return
279
290
 
@@ -285,14 +296,21 @@ class BaseDatasourceConnector(BaseConnector[TSourceData, DocumentDefinition], AB
285
296
  upload_id = str(uuid.uuid4())
286
297
  for i, batch in enumerate(batches):
287
298
  try:
299
+ is_first_page = i == 0
300
+ bulk_index_kwargs = {
301
+ "datasource": self.name,
302
+ "documents": list(batch),
303
+ "upload_id": upload_id,
304
+ "is_first_page": is_first_page,
305
+ "is_last_page": (i == total_batches - 1),
306
+ }
307
+
308
+ if force_restart and is_first_page:
309
+ bulk_index_kwargs["forceRestartUpload"] = True
310
+ logger.info("Force restarting upload - discarding any previous upload progress")
311
+
288
312
  with api_client() as client:
289
- client.indexing.documents.bulk_index(
290
- datasource=self.name,
291
- documents=list(batch),
292
- upload_id=upload_id,
293
- is_first_page=(i == 0),
294
- is_last_page=(i == total_batches - 1),
295
- )
313
+ client.indexing.documents.bulk_index(**bulk_index_kwargs)
296
314
 
297
315
  logger.info(f"Document batch {i + 1}/{total_batches} uploaded successfully")
298
316
  self._observability.increment_counter("batches_uploaded")
@@ -58,11 +58,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
58
58
  """The observability instance for this connector."""
59
59
  return self._observability
60
60
 
61
- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
61
+ def index_data(
62
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
63
+ ) -> None:
62
64
  """Index people data to Glean.
63
65
 
64
66
  Args:
65
67
  mode: The indexing mode to use (FULL or INCREMENTAL).
68
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
69
+ This sets forceRestartUpload=True on the first batch and generates a new upload ID.
66
70
  """
67
71
  self._observability.start_execution()
68
72
 
@@ -89,7 +93,7 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
89
93
  self._observability.record_metric("employees_transformed", len(employees))
90
94
 
91
95
  self._observability.start_timer("data_upload")
92
- self._batch_index_employees(employees)
96
+ self._batch_index_employees(employees, force_restart=force_restart)
93
97
  self._observability.end_timer("data_upload")
94
98
 
95
99
  logger.info(f"Successfully indexed {len(employees)} employees to Glean")
@@ -113,8 +117,15 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
113
117
  """
114
118
  return self.data_client.get_source_data(since=since)
115
119
 
116
- def _batch_index_employees(self, employees: Sequence[EmployeeInfoDefinition]) -> None:
117
- """Index employees to Glean in batches."""
120
+ def _batch_index_employees(
121
+ self, employees: Sequence[EmployeeInfoDefinition], force_restart: bool = False
122
+ ) -> None:
123
+ """Index employees to Glean in batches.
124
+
125
+ Args:
126
+ employees: The employees to index
127
+ force_restart: If True, forces a restart by generating a new upload ID and setting forceRestartUpload=True on the first batch
128
+ """
118
129
  if not employees:
119
130
  return
120
131
 
@@ -126,13 +137,20 @@ class BasePeopleConnector(BaseConnector[TSourceData, EmployeeInfoDefinition], AB
126
137
  upload_id = str(uuid.uuid4())
127
138
  for i, batch in enumerate(batches):
128
139
  try:
140
+ is_first_page = i == 0
141
+ bulk_index_kwargs = {
142
+ "employees": list(batch),
143
+ "upload_id": upload_id,
144
+ "is_first_page": is_first_page,
145
+ "is_last_page": (i == total_batches - 1),
146
+ }
147
+
148
+ if force_restart and is_first_page:
149
+ bulk_index_kwargs["forceRestartUpload"] = True
150
+ logger.info("Force restarting upload - discarding any previous upload progress")
151
+
129
152
  with api_client() as client:
130
- client.indexing.people.bulk_index(
131
- employees=list(batch),
132
- upload_id=upload_id,
133
- is_first_page=(i == 0),
134
- is_last_page=(i == total_batches - 1),
135
- )
153
+ client.indexing.people.bulk_index(**bulk_index_kwargs)
136
154
 
137
155
  logger.info(f"Employee batch {i + 1}/{total_batches} uploaded successfully")
138
156
  self._observability.increment_counter("batches_uploaded")
@@ -47,6 +47,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
47
47
  super().__init__(name, data_client) # type: ignore[arg-type]
48
48
  self.batch_size = 1000
49
49
  self._upload_id: Optional[str] = None
50
+ self._force_restart: bool = False
50
51
 
51
52
  def generate_upload_id(self) -> str:
52
53
  """Generate a unique upload ID for batch tracking."""
@@ -67,12 +68,16 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
67
68
  logger.info(f"Fetching streaming data from source{' since ' + since if since else ''}")
68
69
  yield from self.data_client.get_source_data(since=since)
69
70
 
70
- def index_data(self, mode: IndexingMode = IndexingMode.FULL) -> None:
71
+ def index_data(
72
+ self, mode: IndexingMode = IndexingMode.FULL, force_restart: bool = False
73
+ ) -> None:
71
74
  """
72
75
  Index data from the datasource to Glean using streaming.
73
76
 
74
77
  Args:
75
78
  mode: The indexing mode to use (FULL or INCREMENTAL).
79
+ force_restart: If True, forces a restart of the upload, discarding any previous upload progress.
80
+ This sets forceRestartUpload=True on the first batch and generates a new upload ID.
76
81
  """
77
82
  logger.info(f"Starting {mode.name.lower()} streaming indexing for datasource '{self.name}'")
78
83
 
@@ -81,6 +86,7 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
81
86
  since = "2023-01-01T00:00:00Z"
82
87
 
83
88
  upload_id = self.generate_upload_id()
89
+ self._force_restart = force_restart
84
90
  data_iterator = self.get_data(since=since)
85
91
  is_first_batch = True
86
92
  batch: List[TSourceData] = []
@@ -150,14 +156,20 @@ class BaseStreamingDatasourceConnector(BaseDatasourceConnector[TSourceData], ABC
150
156
  transformed_batch = self.transform(batch)
151
157
  logger.info(f"Transformed batch {batch_number}: {len(transformed_batch)} documents")
152
158
 
159
+ bulk_index_kwargs = {
160
+ "datasource": self.name,
161
+ "documents": list(transformed_batch),
162
+ "upload_id": upload_id,
163
+ "is_first_page": is_first_batch,
164
+ "is_last_page": is_last_batch,
165
+ }
166
+
167
+ if self._force_restart and is_first_batch:
168
+ bulk_index_kwargs["forceRestartUpload"] = True
169
+ logger.info("Force restarting upload - discarding any previous upload progress")
170
+
153
171
  with api_client() as client:
154
- client.indexing.documents.bulk_index(
155
- datasource=self.name,
156
- documents=list(transformed_batch),
157
- upload_id=upload_id,
158
- is_first_page=is_first_batch,
159
- is_last_page=is_last_batch,
160
- )
172
+ client.indexing.documents.bulk_index(**bulk_index_kwargs)
161
173
 
162
174
  logger.info(f"Batch {batch_number} indexed successfully")
163
175
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: glean-indexing-sdk
3
- Version: 0.0.3
3
+ Version: 0.2.0
4
4
  Summary: SDK for building custom Glean indexing integrations
5
5
  Project-URL: Source Code, https://github.com/glean-io/glean-indexing-sdk
6
6
  Author-email: Steve Calvert <steve.calvert@glean.com>
@@ -232,6 +232,18 @@ connector.configure_datasource()
232
232
  connector.index_data(mode=IndexingMode.FULL)
233
233
  ```
234
234
 
235
+ **When to use forced restarts:**
236
+ - When you need to abort and restart a failed or interrupted upload
237
+ - When you want to ensure a clean upload state by discarding partial uploads
238
+ - When recovering from upload errors or inconsistent states
239
+
240
+ **How it works:**
241
+ - Generates a new `upload_id` to ensure clean separation from previous uploads
242
+ - Sets `forceRestartUpload=True` on the **first batch only**
243
+ - Continues with normal batch processing for subsequent batches
244
+
245
+ This feature is available on all connector types: `BaseDatasourceConnector`, `BaseStreamingDatasourceConnector`, and `BasePeopleConnector`.
246
+
235
247
  ### Complete Example
236
248
 
237
249
  ```python snippet=non_streaming/complete.py
@@ -1,4 +1,4 @@
1
- glean/indexing/__init__.py,sha256=4rk3Q9mlKf707DNKstmOf2l5cljagvYobwwHhYlD-Zw,1519
1
+ glean/indexing/__init__.py,sha256=gyHYmtxSfLw1bgVJaMsDxrJ0oppf7DtGI6A0tR1RMxw,1519
2
2
  glean/indexing/models.py,sha256=UuaEDCx0ygvU4u0lRbSn4YXXZVo7D_pyD_whQtjORm8,1223
3
3
  glean/indexing/py.typed,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
4
4
  glean/indexing/common/__init__.py,sha256=6COS3jP66xJ7VcNGI8I95tkF5zpqHy9QPVn82CB4m4I,513
@@ -7,13 +7,14 @@ glean/indexing/common/content_formatter.py,sha256=PkIUZRoRtaOf1w6tJbB3cDj4oV58I7
7
7
  glean/indexing/common/glean_client.py,sha256=tKRWK_C1Nja0gVy2FLnj9SmUbpIdOA3WKmpuuhIl7kk,488
8
8
  glean/indexing/common/metrics.py,sha256=SWCWCYnNOkN4cnwCxyWyEF8iHVwQ4HZqhewi2lqyS84,1771
9
9
  glean/indexing/common/mocks.py,sha256=-TbLzpZ7yUstQW58AICixiIQM2CV5_OPRXejjI_brhE,726
10
+ glean/indexing/common/property_definition_builder.py,sha256=NZFhSqsSZlhI0Ia76sn0meYr82msBMCKMd78zMKLWAM,3724
10
11
  glean/indexing/connectors/__init__.py,sha256=YaHEmCj246zKIvPIAOjTBTDV2O-KvMLncc6jjmaEeOw,1035
11
- glean/indexing/connectors/base_connector.py,sha256=Q435TzSLqs0OTFBrD3KCcjQnGSICQg11pdSfJ7C3XtI,2398
12
+ glean/indexing/connectors/base_connector.py,sha256=m_zKbg-MMc1bjG5m2SsIarSeiPhFJKzfBQzgnlqTKF8,2640
12
13
  glean/indexing/connectors/base_data_client.py,sha256=krOFHJbwCZI-hCS6fr-z44TvjCbPCTCw54hkk0CZFsQ,1004
13
- glean/indexing/connectors/base_datasource_connector.py,sha256=x0Fsc7uCKgTtTgyOus1yDFBr87JbVGHM3zHFp9mGgc4,12440
14
- glean/indexing/connectors/base_people_connector.py,sha256=XuSCFyegenW271GZJ408IQgT19sBq9C9NkKHkiSxLKg,6239
14
+ glean/indexing/connectors/base_datasource_connector.py,sha256=46hqVmgPMC6qNyq7tRZK_EP2dZ_E3W0jLFEe0stC-EU,13297
15
+ glean/indexing/connectors/base_people_connector.py,sha256=WEOWHVOtwPteAynRGl9MAKldVzLg_sCIem_bMSyN2bg,7097
15
16
  glean/indexing/connectors/base_streaming_data_client.py,sha256=xW67crQ_rHaOnD0NFBi2zTGex9JGME886CjX4EqgbZM,1241
16
- glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=wUcsBPExzmgMQd6P24epR4bZFBl40aN6qm6di_F2hmA,7116
17
+ glean/indexing/connectors/base_streaming_datasource_connector.py,sha256=MJLNgrJy0EU5d9dpTs6n40CyY15cfuDQ8DGCPjXoEP4,7726
17
18
  glean/indexing/observability/__init__.py,sha256=SuWJ7pHs5WFq5vL036B3RIsJSbjDsy6SI705u83874I,455
18
19
  glean/indexing/observability/observability.py,sha256=cHlo-tbrmGie6YeWXqEUap0YE6JRtFvOKTnxWD-7yac,9222
19
20
  glean/indexing/testing/__init__.py,sha256=h9mK0QjRZD5f470ePTeg635jZNwPBAd2S7g1DQO4LuE,448
@@ -21,7 +22,7 @@ glean/indexing/testing/connector_test_harness.py,sha256=CMQZmn0cOIrj_GdIHb3OwRN9
21
22
  glean/indexing/testing/mock_data_source.py,sha256=ICYbbHQZe9RVTzvrlwcxp_suxm9yXgjEAGiNCU-SkS4,1325
22
23
  glean/indexing/testing/mock_glean_client.py,sha256=aY_Jfg_NJNPw2HSM1IshgT2lkT59SD9BJzOnvNFJhck,2528
23
24
  glean/indexing/testing/response_validator.py,sha256=jehEtXlW0AQcOVck-_VPoDFtQM_vkHJQ10SUN1ftr1Q,1800
24
- glean_indexing_sdk-0.0.3.dist-info/METADATA,sha256=uyukc_HxjJuhdYzolllnxL8tqpmyWani9g1_NVJhnI4,15619
25
- glean_indexing_sdk-0.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
26
- glean_indexing_sdk-0.0.3.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
27
- glean_indexing_sdk-0.0.3.dist-info/RECORD,,
25
+ glean_indexing_sdk-0.2.0.dist-info/METADATA,sha256=WWxCredUH5aqfJGdEUBP5z7wVHVUD-pbGrf9opsjHH4,16224
26
+ glean_indexing_sdk-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
27
+ glean_indexing_sdk-0.2.0.dist-info/licenses/LICENSE,sha256=RAfePGwatR5BOtlNhW60zAKWCeHVgtGpaGBqZQadXNQ,1062
28
+ glean_indexing_sdk-0.2.0.dist-info/RECORD,,