sf-vector-sdk 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sf-vector-sdk
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Python SDK for the Vector Gateway service (embeddings and vector search)
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: redis>=5.0.0
@@ -266,8 +266,9 @@ result = client.structured_embeddings.embed_test_question_and_wait(
266
266
  )
267
267
 
268
268
  # Embed a topic - uses TopicMetadata (all fields optional)
269
+ # Note: Topic data requires an "id" field which becomes the TurboPuffer document ID
269
270
  result = client.structured_embeddings.embed_topic_and_wait(
270
- data={"topic": "Photosynthesis", "description": "The process by which plants convert sunlight to energy"},
271
+ data={"id": "topic-123", "topic": "Photosynthesis", "description": "The process by which plants convert sunlight to energy"},
271
272
  metadata=TopicMetadata(user_id="user123", topic_id="topic456"), # No tool_id needed
272
273
  )
273
274
 
@@ -276,9 +277,9 @@ from vector_sdk import TopicBatchItem
276
277
 
277
278
  batch_result = client.structured_embeddings.embed_topic_batch_and_wait(
278
279
  items=[
279
- TopicBatchItem(data={"topic": "Topic 1", "description": "Description 1"}, metadata=TopicMetadata(user_id="user1")),
280
- TopicBatchItem(data={"topic": "Topic 2", "description": "Description 2"}, metadata=TopicMetadata(topic_id="topic2")),
281
- TopicBatchItem(data={"topic": "Topic 3", "description": "Description 3"}, metadata=TopicMetadata()), # All optional
280
+ TopicBatchItem(data={"id": "topic-1", "topic": "Topic 1", "description": "Description 1"}, metadata=TopicMetadata(user_id="user1")),
281
+ TopicBatchItem(data={"id": "topic-2", "topic": "Topic 2", "description": "Description 2"}, metadata=TopicMetadata(topic_id="topic2")),
282
+ TopicBatchItem(data={"id": "topic-3", "topic": "Topic 3", "description": "Description 3"}, metadata=TopicMetadata()), # All optional
282
283
  ],
283
284
  )
284
285
  ```
@@ -1,4 +1,4 @@
1
- vector_sdk/__init__.py,sha256=3VdEG4tOuwTAWVvx9J-rOTuVY5RM-7tHzdL-ZLxRCYI,6979
1
+ vector_sdk/__init__.py,sha256=VoljCrab1syIU3NWthWI9ks2s2QDIroixzFGkYamJSY,6979
2
2
  vector_sdk/client.py,sha256=NQFGHyR1aM0UToRFy6e9Xm_v6mk0opqzKN8UlHu97n0,17186
3
3
  vector_sdk/content_types.py,sha256=krvFOR58iUZPfYlEVsk0sXD6_ANAFbxEBQGNpt1YPDU,7381
4
4
  vector_sdk/types.py,sha256=rQgA2z3ls21vY-DRPZgfmm8gYFkWJk1dQaJI-nbc0no,25514
@@ -12,16 +12,16 @@ vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py,sha256=cf4PCZK-Otf
12
12
  vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi,sha256=WKj_iRAuhXMNH3a2tf5j-ERYE5HLKamJTcQXm88JjDo,2451
13
13
  vector_sdk/hash/__init__.py,sha256=if-8tGOPyGUZy0_joGH66moE0e5zzwSzfUeMqP_8QsU,723
14
14
  vector_sdk/hash/hasher.py,sha256=k5VSQB-T0TtBM5ipaVE_TQu_uiaiWNjOWSbByxjriwQ,8618
15
- vector_sdk/hash/types.py,sha256=RHDM-ob9cOHPGMI7tXqiN_ZRowTPSc3GYHf8terrd8U,1983
15
+ vector_sdk/hash/types.py,sha256=clBRk_D5SrXWU19K3Jg8COecz9--WZh9Ws4f70T3BXg,2044
16
16
  vector_sdk/namespaces/__init__.py,sha256=S9dJfB39s2zjYOpFn9Fvf8bk7mLKcXk5aPatKOA-xO0,374
17
17
  vector_sdk/namespaces/base.py,sha256=lioZBcd43mijnN0JwTMMEpQ6whiAjaueTDAAIZS1JM0,1156
18
18
  vector_sdk/namespaces/db.py,sha256=a5sEHrfy1xAjRjyM9qfZxr3IznZVA8BnY5W1Hq5jr4I,7230
19
- vector_sdk/namespaces/embeddings.py,sha256=7hH0hvBAeDf-ypTtOzUAqzc3W6wci_dbt_ZPavcRVyU,8950
20
- vector_sdk/namespaces/search.py,sha256=bwtZ_rTiP6q-dg8oOM5YA6taDHSphO88aq7RSuzc-tQ,8894
19
+ vector_sdk/namespaces/embeddings.py,sha256=r0cbCZjj0jZ9oyBpm8lA2BjUYzi8bmunWwFsYxiXtJo,7704
20
+ vector_sdk/namespaces/search.py,sha256=8ruX0xp5vXD9tS8oXAu1vmF4aC25fNg4gDOtiR8aQ_0,7874
21
21
  vector_sdk/structured/__init__.py,sha256=ZUhrH_l7bX5vA78DSKqDucWhfhYmkDX-W_MPzo5J9JU,1758
22
22
  vector_sdk/structured/router.py,sha256=F3O1TYtbVFCPqVWCCYCt5QcRffX5WPlPQ7K3KlayooQ,5792
23
- vector_sdk/structured/structured_embeddings.py,sha256=Z0enOHx4vdhxAs0sbk9B6XHtRjZSfeYbNbtbq9f8Hh8,37147
24
- vector_sdk/structured/tool_config.py,sha256=YJp-S2_mwoODHWaWJHnGJRaKXuuqbbm2dYHTum2BuG4,8138
25
- sf_vector_sdk-0.2.2.dist-info/METADATA,sha256=JTf4o16e5REDLegscjWMbJcvdLVxDUCrwdCEAcH4fgk,15915
26
- sf_vector_sdk-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
27
- sf_vector_sdk-0.2.2.dist-info/RECORD,,
23
+ vector_sdk/structured/structured_embeddings.py,sha256=e-EOYgpx7SXOo1xQV6-5ZgB6W3ZH1HS2Tx1m7O_1VNE,36869
24
+ vector_sdk/structured/tool_config.py,sha256=qMwP8UWQTt8mkTYFVgvNXd9Dh_WztJSsqcgAjvQ_YoY,8212
25
+ sf_vector_sdk-0.2.4.dist-info/METADATA,sha256=kvP3u9ZJ3RUsLMcKz5yMRfkUworAcqJ-pZoLtXaYVoc,16069
26
+ sf_vector_sdk-0.2.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
27
+ sf_vector_sdk-0.2.4.dist-info/RECORD,,
vector_sdk/__init__.py CHANGED
@@ -166,7 +166,7 @@ from vector_sdk.types import (
166
166
  validate_model,
167
167
  )
168
168
 
169
- __version__ = "0.2.2"
169
+ __version__ = "0.2.4"
170
170
 
171
171
  __all__ = [
172
172
  # Clients (New API)
vector_sdk/hash/types.py CHANGED
@@ -72,5 +72,7 @@ class TopicData(BaseModel):
72
72
 
73
73
  model_config = ConfigDict(extra="allow")
74
74
 
75
+ # Required - becomes TurboPuffer document ID
76
+ id: str
75
77
  topic: Optional[str] = None
76
78
  description: Optional[str] = None
@@ -154,26 +154,21 @@ class EmbeddingsNamespace(BaseNamespace):
154
154
  Raises:
155
155
  TimeoutError: If no result is received within the timeout
156
156
  """
157
- channel = f"embedding:result:{request_id}"
158
- pubsub = self._redis.pubsub()
159
- pubsub.subscribe(channel)
160
-
161
- try:
162
- start_time = datetime.utcnow()
163
- while True:
164
- message = pubsub.get_message(timeout=1.0)
165
- if message and message["type"] == "message":
166
- data = json.loads(message["data"])
167
- return EmbeddingResult.from_dict(data)
168
-
169
- elapsed = (datetime.utcnow() - start_time).total_seconds()
170
- if elapsed >= timeout:
171
- raise TimeoutError(
172
- f"No result received for {request_id} within {timeout}s"
173
- )
174
- finally:
175
- pubsub.unsubscribe(channel)
176
- pubsub.close()
157
+ list_key = f"embedding:response:{request_id}"
158
+
159
+ # BRPOP blocks until result is available or timeout
160
+ result = self._redis.brpop(list_key, timeout=timeout)
161
+
162
+ if result is None:
163
+ raise TimeoutError(
164
+ f"No result received for {request_id} within {timeout}s"
165
+ )
166
+
167
+ # result = (key, value)
168
+ data = json.loads(result[1])
169
+ # Cleanup the response list
170
+ self._redis.delete(list_key)
171
+ return EmbeddingResult.from_dict(data)
177
172
 
178
173
  def create_and_wait(
179
174
  self,
@@ -189,8 +184,8 @@ class EmbeddingsNamespace(BaseNamespace):
189
184
  """
190
185
  Create embeddings and wait for the result.
191
186
 
192
- This method subscribes to the result channel BEFORE submitting the request,
193
- ensuring no race condition where the result is published before we're listening.
187
+ Uses BRPOP for efficient blocking wait - no race condition since the result
188
+ is pushed to a list that persists until consumed.
194
189
 
195
190
  Args:
196
191
  texts: List of text inputs
@@ -205,43 +200,22 @@ class EmbeddingsNamespace(BaseNamespace):
205
200
  Returns:
206
201
  The embedding result
207
202
  """
208
- # Generate request ID upfront so we can subscribe before submitting
209
203
  request_id = str(uuid.uuid4())
210
- channel = f"embedding:result:{request_id}"
211
-
212
- # Subscribe BEFORE submitting to avoid race condition
213
- pubsub = self._redis.pubsub()
214
- pubsub.subscribe(channel)
215
-
216
- try:
217
- # Now submit the request (subscription is already active)
218
- self.create(
219
- texts=texts,
220
- content_type=content_type,
221
- priority=priority,
222
- storage=storage,
223
- metadata=metadata,
224
- request_id=request_id,
225
- embedding_model=embedding_model,
226
- embedding_dimensions=embedding_dimensions,
227
- )
228
204
 
229
- # Wait for message with timeout
230
- start_time = datetime.utcnow()
231
- while True:
232
- message = pubsub.get_message(timeout=1.0)
233
- if message and message["type"] == "message":
234
- data = json.loads(message["data"])
235
- return EmbeddingResult.from_dict(data)
236
-
237
- elapsed = (datetime.utcnow() - start_time).total_seconds()
238
- if elapsed >= timeout:
239
- raise TimeoutError(
240
- f"No result received for {request_id} within {timeout}s"
241
- )
242
- finally:
243
- pubsub.unsubscribe(channel)
244
- pubsub.close()
205
+ # Submit the request first
206
+ self.create(
207
+ texts=texts,
208
+ content_type=content_type,
209
+ priority=priority,
210
+ storage=storage,
211
+ metadata=metadata,
212
+ request_id=request_id,
213
+ embedding_model=embedding_model,
214
+ embedding_dimensions=embedding_dimensions,
215
+ )
216
+
217
+ # Wait for result via BRPOP
218
+ return self.wait_for(request_id, timeout)
245
219
 
246
220
  def get_queue_depth(self) -> dict[str, int]:
247
221
  """
@@ -151,26 +151,21 @@ class SearchNamespace(BaseNamespace):
151
151
  Raises:
152
152
  TimeoutError: If no result is received within the timeout
153
153
  """
154
- channel = f"query:result:{request_id}"
155
- pubsub = self._redis.pubsub()
156
- pubsub.subscribe(channel)
157
-
158
- try:
159
- start_time = datetime.utcnow()
160
- while True:
161
- message = pubsub.get_message(timeout=1.0)
162
- if message and message["type"] == "message":
163
- data = json.loads(message["data"])
164
- return QueryResult.from_dict(data)
165
-
166
- elapsed = (datetime.utcnow() - start_time).total_seconds()
167
- if elapsed >= timeout:
168
- raise TimeoutError(
169
- f"No query result received for {request_id} within {timeout}s"
170
- )
171
- finally:
172
- pubsub.unsubscribe(channel)
173
- pubsub.close()
154
+ list_key = f"query:response:{request_id}"
155
+
156
+ # BRPOP blocks until result is available or timeout
157
+ result = self._redis.brpop(list_key, timeout=timeout)
158
+
159
+ if result is None:
160
+ raise TimeoutError(
161
+ f"No query result received for {request_id} within {timeout}s"
162
+ )
163
+
164
+ # result = (key, value)
165
+ data = json.loads(result[1])
166
+ # Cleanup the response list
167
+ self._redis.delete(list_key)
168
+ return QueryResult.from_dict(data)
174
169
 
175
170
  def query_and_wait(
176
171
  self,
@@ -193,8 +188,8 @@ class SearchNamespace(BaseNamespace):
193
188
  """
194
189
  Submit a search query and wait for the result.
195
190
 
196
- This method subscribes to the result channel BEFORE submitting the request,
197
- ensuring no race condition where the result is published before we're listening.
191
+ Uses BRPOP for efficient blocking wait - no race condition since the result
192
+ is pushed to a list that persists until consumed.
198
193
 
199
194
  Args:
200
195
  query_text: The text to search for
@@ -217,42 +212,25 @@ class SearchNamespace(BaseNamespace):
217
212
  The query result
218
213
  """
219
214
  request_id = str(uuid.uuid4())
220
- channel = f"query:result:{request_id}"
221
-
222
- pubsub = self._redis.pubsub()
223
- pubsub.subscribe(channel)
224
-
225
- try:
226
- self.query(
227
- query_text=query_text,
228
- database=database,
229
- top_k=top_k,
230
- min_score=min_score,
231
- filters=filters,
232
- namespace=namespace,
233
- collection=collection,
234
- database_name=database_name,
235
- include_vectors=include_vectors,
236
- include_metadata=include_metadata,
237
- embedding_model=embedding_model,
238
- embedding_dimensions=embedding_dimensions,
239
- priority=priority,
240
- metadata=metadata,
241
- request_id=request_id,
242
- )
243
215
 
244
- start_time = datetime.utcnow()
245
- while True:
246
- message = pubsub.get_message(timeout=1.0)
247
- if message and message["type"] == "message":
248
- data = json.loads(message["data"])
249
- return QueryResult.from_dict(data)
250
-
251
- elapsed = (datetime.utcnow() - start_time).total_seconds()
252
- if elapsed >= timeout:
253
- raise TimeoutError(
254
- f"No query result received for {request_id} within {timeout}s"
255
- )
256
- finally:
257
- pubsub.unsubscribe(channel)
258
- pubsub.close()
216
+ # Submit the request first
217
+ self.query(
218
+ query_text=query_text,
219
+ database=database,
220
+ top_k=top_k,
221
+ min_score=min_score,
222
+ filters=filters,
223
+ namespace=namespace,
224
+ collection=collection,
225
+ database_name=database_name,
226
+ include_vectors=include_vectors,
227
+ include_metadata=include_metadata,
228
+ embedding_model=embedding_model,
229
+ embedding_dimensions=embedding_dimensions,
230
+ priority=priority,
231
+ metadata=metadata,
232
+ request_id=request_id,
233
+ )
234
+
235
+ # Wait for result via BRPOP
236
+ return self.wait_for(request_id, timeout)
@@ -665,16 +665,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
665
665
  ) -> str:
666
666
  """Internal method to embed a topic with TopicMetadata."""
667
667
  # 1. Extract text using the spec
668
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
668
+ text = extract_tool_text(tool_collection, data)
669
669
  if not text:
670
670
  raise ValueError(
671
671
  f"Failed to extract text from {tool_collection} - empty content"
672
672
  )
673
673
 
674
674
  # 2. Compute content hash
675
- content_hash = compute_content_hash(
676
- {"toolCollection": tool_collection, "data": data}
677
- )
675
+ content_hash = compute_content_hash(tool_collection, data)
678
676
  if not content_hash:
679
677
  raise ValueError(
680
678
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -686,6 +684,7 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
686
684
  # 4. Build document with metadata (TopicMetadata doesn't have toolId)
687
685
  document = {
688
686
  **metadata.to_dict(),
687
+ "id": data["id"],
689
688
  "toolCollection": tool_collection,
690
689
  "contentHash": content_hash,
691
690
  }
@@ -698,9 +697,9 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
698
697
  document_fields=document,
699
698
  )
700
699
 
701
- # 6. Build text input
700
+ # 6. Build text input - use data["id"] as the TurboPuffer document ID
702
701
  text_input = {
703
- "id": content_hash,
702
+ "id": data["id"],
704
703
  "text": text,
705
704
  "document": document,
706
705
  }
@@ -729,16 +728,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
729
728
  ) -> EmbeddingResult:
730
729
  """Internal method to embed a topic and wait for result."""
731
730
  # 1. Extract text using the spec
732
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
731
+ text = extract_tool_text(tool_collection, data)
733
732
  if not text:
734
733
  raise ValueError(
735
734
  f"Failed to extract text from {tool_collection} - empty content"
736
735
  )
737
736
 
738
737
  # 2. Compute content hash
739
- content_hash = compute_content_hash(
740
- {"toolCollection": tool_collection, "data": data}
741
- )
738
+ content_hash = compute_content_hash(tool_collection, data)
742
739
  if not content_hash:
743
740
  raise ValueError(
744
741
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -750,6 +747,7 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
750
747
  # 4. Build document with metadata
751
748
  document = {
752
749
  **metadata.to_dict(),
750
+ "id": data["id"],
753
751
  "toolCollection": tool_collection,
754
752
  "contentHash": content_hash,
755
753
  }
@@ -762,9 +760,9 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
762
760
  document_fields=document,
763
761
  )
764
762
 
765
- # 6. Build text input
763
+ # 6. Build text input - use data["id"] as the TurboPuffer document ID
766
764
  text_input = {
767
- "id": content_hash,
765
+ "id": data["id"],
768
766
  "text": text,
769
767
  "document": document,
770
768
  }
@@ -802,16 +800,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
802
800
  metadata = item.metadata
803
801
 
804
802
  # Extract text
805
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
803
+ text = extract_tool_text(tool_collection, data)
806
804
  if not text:
807
805
  raise ValueError(
808
806
  f"Failed to extract text from {tool_collection} - empty content"
809
807
  )
810
808
 
811
809
  # Compute content hash
812
- content_hash = compute_content_hash(
813
- {"toolCollection": tool_collection, "data": data}
814
- )
810
+ content_hash = compute_content_hash(tool_collection, data)
815
811
  if not content_hash:
816
812
  raise ValueError(
817
813
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -820,12 +816,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
820
816
  # Build document with metadata (TopicMetadata doesn't have toolId)
821
817
  document = {
822
818
  **metadata.to_dict(),
819
+ "id": data["id"],
823
820
  "toolCollection": tool_collection,
824
821
  "contentHash": content_hash,
825
822
  }
826
823
 
824
+ # Use data["id"] as the TurboPuffer document ID
827
825
  text_inputs.append({
828
- "id": content_hash,
826
+ "id": data["id"],
829
827
  "text": text,
830
828
  "document": document,
831
829
  })
@@ -834,7 +832,7 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
834
832
  storage_config = build_storage_config(
835
833
  tool_collection=tool_collection,
836
834
  sub_type=None,
837
- content_hash=text_inputs[0]["id"],
835
+ content_hash=text_inputs[0]["document"]["contentHash"],
838
836
  document_fields=text_inputs[0]["document"],
839
837
  )
840
838
 
@@ -871,16 +869,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
871
869
  metadata = item.metadata
872
870
 
873
871
  # Extract text
874
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
872
+ text = extract_tool_text(tool_collection, data)
875
873
  if not text:
876
874
  raise ValueError(
877
875
  f"Failed to extract text from {tool_collection} - empty content"
878
876
  )
879
877
 
880
878
  # Compute content hash
881
- content_hash = compute_content_hash(
882
- {"toolCollection": tool_collection, "data": data}
883
- )
879
+ content_hash = compute_content_hash(tool_collection, data)
884
880
  if not content_hash:
885
881
  raise ValueError(
886
882
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -889,12 +885,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
889
885
  # Build document with metadata
890
886
  document = {
891
887
  **metadata.to_dict(),
888
+ "id": data["id"],
892
889
  "toolCollection": tool_collection,
893
890
  "contentHash": content_hash,
894
891
  }
895
892
 
893
+ # Use data["id"] as the TurboPuffer document ID
896
894
  text_inputs.append({
897
- "id": content_hash,
895
+ "id": data["id"],
898
896
  "text": text,
899
897
  "document": document,
900
898
  })
@@ -903,7 +901,7 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
903
901
  storage_config = build_storage_config(
904
902
  tool_collection=tool_collection,
905
903
  sub_type=None,
906
- content_hash=text_inputs[0]["id"],
904
+ content_hash=text_inputs[0]["document"]["contentHash"],
907
905
  document_fields=text_inputs[0]["document"],
908
906
  )
909
907
 
@@ -935,16 +933,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
935
933
  ) -> str:
936
934
  """Internal method to embed any tool type."""
937
935
  # 1. Extract text using the spec
938
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
936
+ text = extract_tool_text(tool_collection, data)
939
937
  if not text:
940
938
  raise ValueError(
941
939
  f"Failed to extract text from {tool_collection} - empty content"
942
940
  )
943
941
 
944
942
  # 2. Compute content hash
945
- content_hash = compute_content_hash(
946
- {"toolCollection": tool_collection, "data": data}
947
- )
943
+ content_hash = compute_content_hash(tool_collection, data)
948
944
  if not content_hash:
949
945
  raise ValueError(
950
946
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -999,16 +995,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
999
995
  ) -> EmbeddingResult:
1000
996
  """Internal method to embed any tool type and wait for result."""
1001
997
  # 1. Extract text using the spec
1002
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
998
+ text = extract_tool_text(tool_collection, data)
1003
999
  if not text:
1004
1000
  raise ValueError(
1005
1001
  f"Failed to extract text from {tool_collection} - empty content"
1006
1002
  )
1007
1003
 
1008
1004
  # 2. Compute content hash
1009
- content_hash = compute_content_hash(
1010
- {"toolCollection": tool_collection, "data": data}
1011
- )
1005
+ content_hash = compute_content_hash(tool_collection, data)
1012
1006
  if not content_hash:
1013
1007
  raise ValueError(
1014
1008
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -1082,16 +1076,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
1082
1076
  metadata = item["metadata"]
1083
1077
 
1084
1078
  # Extract text
1085
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
1079
+ text = extract_tool_text(tool_collection, data)
1086
1080
  if not text:
1087
1081
  raise ValueError(
1088
1082
  f"Failed to extract text from {tool_collection} - empty content"
1089
1083
  )
1090
1084
 
1091
1085
  # Compute content hash
1092
- content_hash = compute_content_hash(
1093
- {"toolCollection": tool_collection, "data": data}
1094
- )
1086
+ content_hash = compute_content_hash(tool_collection, data)
1095
1087
  if not content_hash:
1096
1088
  raise ValueError(
1097
1089
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -1163,16 +1155,14 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
1163
1155
  metadata = item["metadata"]
1164
1156
 
1165
1157
  # Extract text
1166
- text = extract_tool_text({"toolCollection": tool_collection, "data": data})
1158
+ text = extract_tool_text(tool_collection, data)
1167
1159
  if not text:
1168
1160
  raise ValueError(
1169
1161
  f"Failed to extract text from {tool_collection} - empty content"
1170
1162
  )
1171
1163
 
1172
1164
  # Compute content hash
1173
- content_hash = compute_content_hash(
1174
- {"toolCollection": tool_collection, "data": data}
1175
- )
1165
+ content_hash = compute_content_hash(tool_collection, data)
1176
1166
  if not content_hash:
1177
1167
  raise ValueError(
1178
1168
  f"Failed to compute content hash for {tool_collection} - empty content"
@@ -151,15 +151,15 @@ TOOL_CONFIGS: dict[ToolCollection, ToolConfig] = {
151
151
  default_priority=PRIORITY_NORMAL,
152
152
  turbopuffer=TurboPufferToolConfig(
153
153
  enabled=True,
154
- id_field="contentHash",
155
- metadata_fields=_DEFAULT_METADATA_FIELDS,
154
+ id_field="id",
155
+ metadata_fields=("toolId", "toolCollection", "topicId", "userId", "contentHash", "id"),
156
156
  namespace_pattern="topic_vectors",
157
157
  ),
158
158
  pinecone=PineconeToolConfig(
159
159
  enabled=False,
160
160
  index_name="tool-vectors",
161
- id_field="contentHash",
162
- metadata_fields=_DEFAULT_METADATA_FIELDS,
161
+ id_field="id",
162
+ metadata_fields=("toolId", "toolCollection", "topicId", "userId", "contentHash", "id"),
163
163
  namespace_pattern="topic_vectors",
164
164
  ),
165
165
  ),