sf-vector-sdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sf-vector-sdk
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Python SDK for the Vector Gateway service (embeddings and vector search)
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: redis>=5.0.0
@@ -220,25 +220,42 @@ Type-safe embedding for known tool types (FlashCard, TestQuestion, etc.) with au
220
220
  |--------|-------------|
221
221
  | `embed_flashcard(data, metadata)` | Embed a flashcard, return request ID |
222
222
  | `embed_flashcard_and_wait(data, metadata, timeout)` | Embed and wait for result |
223
+ | `embed_flashcard_batch(items)` | Embed batch of flashcards, return request ID |
224
+ | `embed_flashcard_batch_and_wait(items, timeout)` | Embed batch and wait for result |
223
225
  | `embed_test_question(data, metadata)` | Embed a test question, return request ID |
224
226
  | `embed_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
227
+ | `embed_test_question_batch(items)` | Embed batch of test questions, return request ID |
228
+ | `embed_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
225
229
  | `embed_spaced_test_question(data, metadata)` | Embed a spaced test question, return request ID |
226
230
  | `embed_spaced_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
231
+ | `embed_spaced_test_question_batch(items)` | Embed batch of spaced test questions, return request ID |
232
+ | `embed_spaced_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
227
233
  | `embed_audio_recap(data, metadata)` | Embed an audio recap section, return request ID |
228
234
  | `embed_audio_recap_and_wait(data, metadata, timeout)` | Embed and wait for result |
235
+ | `embed_audio_recap_batch(items)` | Embed batch of audio recaps, return request ID |
236
+ | `embed_audio_recap_batch_and_wait(items, timeout)` | Embed batch and wait for result |
237
+ | `embed_topic(data, metadata)` | Embed a topic (uses `TopicMetadata`), return request ID |
238
+ | `embed_topic_and_wait(data, metadata, timeout)` | Embed and wait for result (uses `TopicMetadata`) |
239
+ | `embed_topic_batch(items)` | Embed batch of topics (uses `TopicMetadata`), return request ID |
240
+ | `embed_topic_batch_and_wait(items, timeout)` | Embed batch and wait for result (uses `TopicMetadata`) |
241
+
242
+ **Metadata Types:**
243
+
244
+ - `ToolMetadata` - For tools (FlashCard, TestQuestion, etc.) - requires `tool_id`
245
+ - `TopicMetadata` - For topics only - all fields optional (`user_id`, `topic_id`)
229
246
 
230
247
  ```python
231
- from vector_sdk import VectorClient, ToolMetadata, TestQuestionInput
248
+ from vector_sdk import VectorClient, ToolMetadata, TopicMetadata, TestQuestionInput
232
249
 
233
250
  client = VectorClient(redis_url="redis://localhost:6379")
234
251
 
235
- # Embed a flashcard - SDK handles text extraction, hashing, and routing
252
+ # Embed a flashcard - uses ToolMetadata (tool_id required)
236
253
  result = client.structured_embeddings.embed_flashcard_and_wait(
237
254
  data={"type": "BASIC", "term": "Mitochondria", "definition": "The powerhouse of the cell"},
238
255
  metadata=ToolMetadata(tool_id="tool123", user_id="user456", topic_id="topic789"),
239
256
  )
240
257
 
241
- # Embed a test question
258
+ # Embed a test question - uses ToolMetadata (tool_id required)
242
259
  result = client.structured_embeddings.embed_test_question_and_wait(
243
260
  data=TestQuestionInput(
244
261
  question="What is the capital?",
@@ -247,6 +264,23 @@ result = client.structured_embeddings.embed_test_question_and_wait(
247
264
  ),
248
265
  metadata=ToolMetadata(tool_id="tool456"),
249
266
  )
267
+
268
+ # Embed a topic - uses TopicMetadata (all fields optional)
269
+ result = client.structured_embeddings.embed_topic_and_wait(
270
+ data={"topic": "Photosynthesis", "description": "The process by which plants convert sunlight to energy"},
271
+ metadata=TopicMetadata(user_id="user123", topic_id="topic456"), # No tool_id needed
272
+ )
273
+
274
+ # Batch embedding - embed multiple topics in a single request
275
+ from vector_sdk import TopicBatchItem
276
+
277
+ batch_result = client.structured_embeddings.embed_topic_batch_and_wait(
278
+ items=[
279
+ TopicBatchItem(data={"topic": "Topic 1", "description": "Description 1"}, metadata=TopicMetadata(user_id="user1")),
280
+ TopicBatchItem(data={"topic": "Topic 2", "description": "Description 2"}, metadata=TopicMetadata(topic_id="topic2")),
281
+ TopicBatchItem(data={"topic": "Topic 3", "description": "Description 3"}, metadata=TopicMetadata()), # All optional
282
+ ],
283
+ )
250
284
  ```
251
285
 
252
286
  **Database Routing:**
@@ -1,27 +1,27 @@
1
- vector_sdk/__init__.py,sha256=Fq4Pqq-xbDX_4M_lQhw6DVXAggUB-bjscNrUk8GEk2o,6632
1
+ vector_sdk/__init__.py,sha256=3VdEG4tOuwTAWVvx9J-rOTuVY5RM-7tHzdL-ZLxRCYI,6979
2
2
  vector_sdk/client.py,sha256=NQFGHyR1aM0UToRFy6e9Xm_v6mk0opqzKN8UlHu97n0,17186
3
3
  vector_sdk/content_types.py,sha256=krvFOR58iUZPfYlEVsk0sXD6_ANAFbxEBQGNpt1YPDU,7381
4
4
  vector_sdk/types.py,sha256=rQgA2z3ls21vY-DRPZgfmm8gYFkWJk1dQaJI-nbc0no,25514
5
- vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py,sha256=HOyqisydOUgjQ2yEcLdAuW4O46ghcL7W--Sykc9Iwzc,5962
5
+ vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py,sha256=5dW14j_DyIPKCaFI2cxCKKtQoLMGtRqV3aiRZ8Utxw4,5962
6
6
  vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.pyi,sha256=fOw6liHkiXSEyvEZ_QKexDUgFNhbemuGuk52hwQ5pnQ,6738
7
- vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py,sha256=xwujSU8GXborGSDgKoRHQq_DL5CA6YAGX8L_Om35lRc,7057
7
+ vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py,sha256=nFmjLnJJh5H-t25FJ8oP7jLH-mAcuEw-EK0U-dYlgDI,7057
8
8
  vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.pyi,sha256=lxZ27fReDhHv2mKDCiPvKpicXuJObZX2zpfVYuGTk3I,8068
9
- vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py,sha256=E7uzz9w9xQlGbe_ZfO2MN_H8B-xpyFkkyixlDfaoGv0,5579
9
+ vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py,sha256=X0EIUHMCt9a0L5iVjQXkdi2zKi7xKwPbEUeU8gNPeTk,5579
10
10
  vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.pyi,sha256=a-rWfFQVAdZM5jK1qHB0bUiuSo6brcB-zUIHQezi0I8,5598
11
- vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py,sha256=i2VcFAoa8xVFsiXTxseDnZwpwl6iYv4CgwNV18mHHqU,2266
12
- vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi,sha256=KlMGcmAOXgCxVUmtqlyir_fNyRan9Bm3VEv4yGsWgHk,1453
13
- vector_sdk/hash/__init__.py,sha256=xyy3ezP4o58IkfUr2Kk-YwuAVtNH5uUG6QyUIed4Psw,691
14
- vector_sdk/hash/hasher.py,sha256=BjQ5d-dhJy2Smo-dssss2JDHcIfHjsvf08O_1h7Dq2I,7976
15
- vector_sdk/hash/types.py,sha256=QWSmGhbNcYZsRPZKWxgHC-XBOf-OBtE3XMGoTb2hm9Y,1785
11
+ vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py,sha256=cf4PCZK-OtfLMyCuac0XqpZ6MQxk2XH4cy3QRu1_i8I,3094
12
+ vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi,sha256=WKj_iRAuhXMNH3a2tf5j-ERYE5HLKamJTcQXm88JjDo,2451
13
+ vector_sdk/hash/__init__.py,sha256=if-8tGOPyGUZy0_joGH66moE0e5zzwSzfUeMqP_8QsU,723
14
+ vector_sdk/hash/hasher.py,sha256=k5VSQB-T0TtBM5ipaVE_TQu_uiaiWNjOWSbByxjriwQ,8618
15
+ vector_sdk/hash/types.py,sha256=RHDM-ob9cOHPGMI7tXqiN_ZRowTPSc3GYHf8terrd8U,1983
16
16
  vector_sdk/namespaces/__init__.py,sha256=S9dJfB39s2zjYOpFn9Fvf8bk7mLKcXk5aPatKOA-xO0,374
17
17
  vector_sdk/namespaces/base.py,sha256=lioZBcd43mijnN0JwTMMEpQ6whiAjaueTDAAIZS1JM0,1156
18
18
  vector_sdk/namespaces/db.py,sha256=a5sEHrfy1xAjRjyM9qfZxr3IznZVA8BnY5W1Hq5jr4I,7230
19
19
  vector_sdk/namespaces/embeddings.py,sha256=7hH0hvBAeDf-ypTtOzUAqzc3W6wci_dbt_ZPavcRVyU,8950
20
20
  vector_sdk/namespaces/search.py,sha256=bwtZ_rTiP6q-dg8oOM5YA6taDHSphO88aq7RSuzc-tQ,8894
21
- vector_sdk/structured/__init__.py,sha256=eRiH-V6U-TSl98peoGdsRQukTysW9ZhTsous0fwHU-I,1468
21
+ vector_sdk/structured/__init__.py,sha256=ZUhrH_l7bX5vA78DSKqDucWhfhYmkDX-W_MPzo5J9JU,1758
22
22
  vector_sdk/structured/router.py,sha256=F3O1TYtbVFCPqVWCCYCt5QcRffX5WPlPQ7K3KlayooQ,5792
23
- vector_sdk/structured/structured_embeddings.py,sha256=Z93Bcf38fpk8jhQvXTeioxt-yyhtQXo1fMoUKqgVlus,13031
24
- vector_sdk/structured/tool_config.py,sha256=sv0mRNUcuPO9C8Oh0_Y52YTyakbLk6gjrBW0C04Jt_w,7462
25
- sf_vector_sdk-0.2.0.dist-info/METADATA,sha256=HA0KnQ9SAIRYk7M-g_IOn5o1y1yL7ISpQS0j_T7u0tw,13671
26
- sf_vector_sdk-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
27
- sf_vector_sdk-0.2.0.dist-info/RECORD,,
23
+ vector_sdk/structured/structured_embeddings.py,sha256=Z0enOHx4vdhxAs0sbk9B6XHtRjZSfeYbNbtbq9f8Hh8,37147
24
+ vector_sdk/structured/tool_config.py,sha256=YJp-S2_mwoODHWaWJHnGJRaKXuuqbbm2dYHTum2BuG4,8138
25
+ sf_vector_sdk-0.2.2.dist-info/METADATA,sha256=JTf4o16e5REDLegscjWMbJcvdLVxDUCrwdCEAcH4fgk,15915
26
+ sf_vector_sdk-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
27
+ sf_vector_sdk-0.2.2.dist-info/RECORD,,
vector_sdk/__init__.py CHANGED
@@ -70,6 +70,7 @@ from vector_sdk.hash import (
70
70
  MultipleChoiceOption,
71
71
  QuestionData,
72
72
  ToolCollection,
73
+ TopicData,
73
74
  compute_content_hash,
74
75
  extract_tool_text,
75
76
  )
@@ -89,15 +90,21 @@ from vector_sdk.namespaces import (
89
90
  # ============================================================================
90
91
  from vector_sdk.structured import (
91
92
  TOOL_CONFIGS,
93
+ AudioRecapBatchItem,
94
+ BatchItem,
92
95
  DatabaseRoutingError,
93
96
  DatabaseRoutingMode,
97
+ FlashCardBatchItem,
94
98
  PineconeToolConfig,
95
99
  QuestionType,
96
100
  StructuredEmbeddingsNamespace,
101
+ TestQuestionBatchItem,
97
102
  TestQuestionInput,
98
103
  ToolConfig,
99
104
  ToolDatabaseConfig,
100
105
  ToolMetadata,
106
+ TopicBatchItem,
107
+ TopicMetadata,
101
108
  TurboPufferToolConfig,
102
109
  build_storage_config,
103
110
  get_content_type,
@@ -159,7 +166,7 @@ from vector_sdk.types import (
159
166
  validate_model,
160
167
  )
161
168
 
162
- __version__ = "0.2.0"
169
+ __version__ = "0.2.2"
163
170
 
164
171
  __all__ = [
165
172
  # Clients (New API)
@@ -236,12 +243,21 @@ __all__ = [
236
243
  "FlashCardData",
237
244
  "QuestionData",
238
245
  "AudioRecapSectionData",
246
+ "TopicData",
239
247
  "MultipleChoiceOption",
240
248
  "AnswerObject",
241
249
  # Structured Embeddings
242
250
  "StructuredEmbeddingsNamespace",
243
251
  "ToolMetadata",
252
+ "TopicMetadata",
244
253
  "TestQuestionInput",
254
+ # Batch types
255
+ "BatchItem",
256
+ "FlashCardBatchItem",
257
+ "TestQuestionBatchItem",
258
+ "AudioRecapBatchItem",
259
+ "TopicBatchItem",
260
+ # Tool configuration
245
261
  "ToolConfig",
246
262
  "ToolDatabaseConfig",
247
263
  "TurboPufferToolConfig",
@@ -2,7 +2,7 @@
2
2
  # Generated by the protocol buffer compiler. DO NOT EDIT!
3
3
  # NO CHECKED-IN PROTOBUF GENCODE
4
4
  # source: embedding_pipeline/content_types/v1/content_types.proto
5
- # Protobuf Python Version: 6.33.4
5
+ # Protobuf Python Version: 6.33.5
6
6
  """Generated protocol buffer code."""
7
7
  from google.protobuf import descriptor as _descriptor
8
8
  from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
13
13
  _runtime_version.Domain.PUBLIC,
14
14
  6,
15
15
  33,
16
- 4,
16
+ 5,
17
17
  '',
18
18
  'embedding_pipeline/content_types/v1/content_types.proto'
19
19
  )
@@ -2,7 +2,7 @@
2
2
  # Generated by the protocol buffer compiler. DO NOT EDIT!
3
3
  # NO CHECKED-IN PROTOBUF GENCODE
4
4
  # source: embedding_pipeline/db/vectors/v1/vectors.proto
5
- # Protobuf Python Version: 6.33.4
5
+ # Protobuf Python Version: 6.33.5
6
6
  """Generated protocol buffer code."""
7
7
  from google.protobuf import descriptor as _descriptor
8
8
  from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
13
13
  _runtime_version.Domain.PUBLIC,
14
14
  6,
15
15
  33,
16
- 4,
16
+ 5,
17
17
  '',
18
18
  'embedding_pipeline/db/vectors/v1/vectors.proto'
19
19
  )
@@ -2,7 +2,7 @@
2
2
  # Generated by the protocol buffer compiler. DO NOT EDIT!
3
3
  # NO CHECKED-IN PROTOBUF GENCODE
4
4
  # source: embedding_pipeline/query/v1/query.proto
5
- # Protobuf Python Version: 6.33.4
5
+ # Protobuf Python Version: 6.33.5
6
6
  """Generated protocol buffer code."""
7
7
  from google.protobuf import descriptor as _descriptor
8
8
  from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
13
13
  _runtime_version.Domain.PUBLIC,
14
14
  6,
15
15
  33,
16
- 4,
16
+ 5,
17
17
  '',
18
18
  'embedding_pipeline/query/v1/query.proto'
19
19
  )
@@ -2,7 +2,7 @@
2
2
  # Generated by the protocol buffer compiler. DO NOT EDIT!
3
3
  # NO CHECKED-IN PROTOBUF GENCODE
4
4
  # source: embedding_pipeline/tools/v1/tools.proto
5
- # Protobuf Python Version: 6.33.4
5
+ # Protobuf Python Version: 6.33.5
6
6
  """Generated protocol buffer code."""
7
7
  from google.protobuf import descriptor as _descriptor
8
8
  from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
13
13
  _runtime_version.Domain.PUBLIC,
14
14
  6,
15
15
  33,
16
- 4,
16
+ 5,
17
17
  '',
18
18
  'embedding_pipeline/tools/v1/tools.proto'
19
19
  )
@@ -24,7 +24,7 @@ _sym_db = _symbol_database.Default()
24
24
 
25
25
 
26
26
 
27
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/tools/v1/tools.proto\x12\x1b\x65mbedding_pipeline.tools.v1*\xc9\x01\n\x0eToolCollection\x12\x1f\n\x1bTOOL_COLLECTION_UNSPECIFIED\x10\x00\x12\x1d\n\x19TOOL_COLLECTION_FLASHCARD\x10\x01\x12!\n\x1dTOOL_COLLECTION_TEST_QUESTION\x10\x02\x12(\n$TOOL_COLLECTION_SPACED_TEST_QUESTION\x10\x03\x12*\n&TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION\x10\x04*\xb2\x01\n\rFlashCardType\x12\x1f\n\x1b\x46LASH_CARD_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15\x46LASH_CARD_TYPE_BASIC\x10\x01\x12\x19\n\x15\x46LASH_CARD_TYPE_CLOZE\x10\x02\x12%\n!FLASH_CARD_TYPE_FILL_IN_THE_BLANK\x10\x03\x12#\n\x1f\x46LASH_CARD_TYPE_MULTIPLE_CHOICE\x10\x04\x42gZegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1b\x06proto3')
27
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/tools/v1/tools.proto\x12\x1b\x65mbedding_pipeline.tools.v1\"\xed\x01\n\rTopicMetadata\x12\x1c\n\x07user_id\x18\x01 \x01(\tH\x00R\x06userId\x88\x01\x01\x12\x1e\n\x08topic_id\x18\x02 \x01(\tH\x01R\x07topicId\x88\x01\x01\x12K\n\x05\x65xtra\x18\x03 \x03(\x0b\x32\x35.embedding_pipeline.tools.v1.TopicMetadata.ExtraEntryR\x05\x65xtra\x1a\x38\n\nExtraEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08_user_idB\x0b\n\t_topic_id*\xe4\x01\n\x0eToolCollection\x12\x1f\n\x1bTOOL_COLLECTION_UNSPECIFIED\x10\x00\x12\x1d\n\x19TOOL_COLLECTION_FLASHCARD\x10\x01\x12!\n\x1dTOOL_COLLECTION_TEST_QUESTION\x10\x02\x12(\n$TOOL_COLLECTION_SPACED_TEST_QUESTION\x10\x03\x12*\n&TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION\x10\x04\x12\x19\n\x15TOOL_COLLECTION_TOPIC\x10\x05*\xb2\x01\n\rFlashCardType\x12\x1f\n\x1b\x46LASH_CARD_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15\x46LASH_CARD_TYPE_BASIC\x10\x01\x12\x19\n\x15\x46LASH_CARD_TYPE_CLOZE\x10\x02\x12%\n!FLASH_CARD_TYPE_FILL_IN_THE_BLANK\x10\x03\x12#\n\x1f\x46LASH_CARD_TYPE_MULTIPLE_CHOICE\x10\x04\x42gZegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1b\x06proto3')
28
28
 
29
29
  _globals = globals()
30
30
  _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -32,8 +32,14 @@ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'embedding_pipeline.tools.v1
32
32
  if not _descriptor._USE_C_DESCRIPTORS:
33
33
  _globals['DESCRIPTOR']._loaded_options = None
34
34
  _globals['DESCRIPTOR']._serialized_options = b'Zegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1'
35
- _globals['_TOOLCOLLECTION']._serialized_start=73
36
- _globals['_TOOLCOLLECTION']._serialized_end=274
37
- _globals['_FLASHCARDTYPE']._serialized_start=277
38
- _globals['_FLASHCARDTYPE']._serialized_end=455
35
+ _globals['_TOPICMETADATA_EXTRAENTRY']._loaded_options = None
36
+ _globals['_TOPICMETADATA_EXTRAENTRY']._serialized_options = b'8\001'
37
+ _globals['_TOOLCOLLECTION']._serialized_start=313
38
+ _globals['_TOOLCOLLECTION']._serialized_end=541
39
+ _globals['_FLASHCARDTYPE']._serialized_start=544
40
+ _globals['_FLASHCARDTYPE']._serialized_end=722
41
+ _globals['_TOPICMETADATA']._serialized_start=73
42
+ _globals['_TOPICMETADATA']._serialized_end=310
43
+ _globals['_TOPICMETADATA_EXTRAENTRY']._serialized_start=229
44
+ _globals['_TOPICMETADATA_EXTRAENTRY']._serialized_end=285
39
45
  # @@protoc_insertion_point(module_scope)
@@ -1,6 +1,9 @@
1
+ from google.protobuf.internal import containers as _containers
1
2
  from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
2
3
  from google.protobuf import descriptor as _descriptor
3
- from typing import ClassVar as _ClassVar
4
+ from google.protobuf import message as _message
5
+ from collections.abc import Mapping as _Mapping
6
+ from typing import ClassVar as _ClassVar, Optional as _Optional
4
7
 
5
8
  DESCRIPTOR: _descriptor.FileDescriptor
6
9
 
@@ -11,6 +14,7 @@ class ToolCollection(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
11
14
  TOOL_COLLECTION_TEST_QUESTION: _ClassVar[ToolCollection]
12
15
  TOOL_COLLECTION_SPACED_TEST_QUESTION: _ClassVar[ToolCollection]
13
16
  TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: _ClassVar[ToolCollection]
17
+ TOOL_COLLECTION_TOPIC: _ClassVar[ToolCollection]
14
18
 
15
19
  class FlashCardType(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
16
20
  __slots__ = ()
@@ -24,8 +28,26 @@ TOOL_COLLECTION_FLASHCARD: ToolCollection
24
28
  TOOL_COLLECTION_TEST_QUESTION: ToolCollection
25
29
  TOOL_COLLECTION_SPACED_TEST_QUESTION: ToolCollection
26
30
  TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: ToolCollection
31
+ TOOL_COLLECTION_TOPIC: ToolCollection
27
32
  FLASH_CARD_TYPE_UNSPECIFIED: FlashCardType
28
33
  FLASH_CARD_TYPE_BASIC: FlashCardType
29
34
  FLASH_CARD_TYPE_CLOZE: FlashCardType
30
35
  FLASH_CARD_TYPE_FILL_IN_THE_BLANK: FlashCardType
31
36
  FLASH_CARD_TYPE_MULTIPLE_CHOICE: FlashCardType
37
+
38
+ class TopicMetadata(_message.Message):
39
+ __slots__ = ("user_id", "topic_id", "extra")
40
+ class ExtraEntry(_message.Message):
41
+ __slots__ = ("key", "value")
42
+ KEY_FIELD_NUMBER: _ClassVar[int]
43
+ VALUE_FIELD_NUMBER: _ClassVar[int]
44
+ key: str
45
+ value: str
46
+ def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
47
+ USER_ID_FIELD_NUMBER: _ClassVar[int]
48
+ TOPIC_ID_FIELD_NUMBER: _ClassVar[int]
49
+ EXTRA_FIELD_NUMBER: _ClassVar[int]
50
+ user_id: str
51
+ topic_id: str
52
+ extra: _containers.ScalarMap[str, str]
53
+ def __init__(self, user_id: _Optional[str] = ..., topic_id: _Optional[str] = ..., extra: _Optional[_Mapping[str, str]] = ...) -> None: ...
@@ -16,6 +16,7 @@ from .types import (
16
16
  MultipleChoiceOption,
17
17
  QuestionData,
18
18
  ToolCollection,
19
+ TopicData,
19
20
  )
20
21
 
21
22
  __all__ = [
@@ -26,6 +27,7 @@ __all__ = [
26
27
  "FlashCardData",
27
28
  "QuestionData",
28
29
  "AudioRecapSectionData",
30
+ "TopicData",
29
31
  "MultipleChoiceOption",
30
32
  "AnswerObject",
31
33
  ]
vector_sdk/hash/hasher.py CHANGED
@@ -15,6 +15,7 @@ from .types import (
15
15
  MultipleChoiceOption,
16
16
  QuestionData,
17
17
  ToolCollection,
18
+ TopicData,
18
19
  )
19
20
 
20
21
  # Hash length in hex characters (128 bits = 32 hex chars)
@@ -23,7 +24,7 @@ HASH_LENGTH = 32
23
24
 
24
25
  def compute_content_hash(
25
26
  tool_collection: ToolCollection,
26
- data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
27
+ data: Union[FlashCardData, QuestionData, AudioRecapSectionData, TopicData, dict],
27
28
  ) -> str:
28
29
  """
29
30
  Compute a deterministic content hash for a learning tool.
@@ -52,7 +53,7 @@ def compute_content_hash(
52
53
 
53
54
  def extract_tool_text(
54
55
  tool_collection: ToolCollection,
55
- data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
56
+ data: Union[FlashCardData, QuestionData, AudioRecapSectionData, TopicData, dict],
56
57
  ) -> str:
57
58
  """
58
59
  Extract the text content from a learning tool for embedding.
@@ -94,6 +95,8 @@ def extract_tool_text(
94
95
  return _extract_question_text(data_dict)
95
96
  elif tool_collection == "AudioRecapV2Section":
96
97
  return _extract_audio_recap_text(data_dict)
98
+ elif tool_collection == "Topic":
99
+ return _extract_topic_text(data_dict)
97
100
  else:
98
101
  return ""
99
102
 
@@ -183,6 +186,29 @@ def _extract_audio_recap_text(data: dict) -> str:
183
186
  return ""
184
187
 
185
188
 
189
+ def _extract_topic_text(data: dict) -> str:
190
+ """
191
+ Extract text from Topic.
192
+
193
+ Format: "Topic: {topic}. Description: {description}."
194
+ """
195
+ parts: list[str] = []
196
+
197
+ topic = data.get("topic")
198
+ if topic:
199
+ trimmed = topic.strip()
200
+ if trimmed:
201
+ parts.append(f"Topic: {trimmed}.")
202
+
203
+ description = data.get("description")
204
+ if description:
205
+ trimmed = description.strip()
206
+ if trimmed:
207
+ parts.append(f"Description: {trimmed}.")
208
+
209
+ return " ".join(parts)
210
+
211
+
186
212
  def _strip_flashcard_syntax(text: str) -> str:
187
213
  """
188
214
  Strip {{...}} markers from cloze/fill-in-blank text.
vector_sdk/hash/types.py CHANGED
@@ -9,7 +9,7 @@ from typing import Literal, Optional, Union
9
9
  from pydantic import BaseModel, ConfigDict, Field
10
10
 
11
11
  # Tool collection types
12
- ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section"]
12
+ ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section", "Topic"]
13
13
 
14
14
  # FlashCard type variants
15
15
  FlashCardType = Literal["BASIC", "CLOZE", "FILL_IN_THE_BLANK", "MULTIPLE_CHOICE"]
@@ -65,3 +65,12 @@ class AudioRecapSectionData(BaseModel):
65
65
  model_config = ConfigDict(extra="allow")
66
66
 
67
67
  script: Optional[str] = None
68
+
69
+
70
class TopicData(BaseModel):
    """Topic data for content hashing.

    Both declared fields are optional and default to ``None``. The model is
    configured with ``extra="allow"``, so keys beyond the declared ones are
    accepted rather than rejected.
    """

    # Permit additional, undeclared fields on the payload.
    model_config = ConfigDict(extra="allow")

    # Topic text (e.g. "Photosynthesis").
    topic: Optional[str] = None
    # Descriptive text accompanying the topic.
    description: Optional[str] = None
@@ -14,9 +14,15 @@ from .router import (
14
14
  validate_database_routing,
15
15
  )
16
16
  from .structured_embeddings import (
17
+ AudioRecapBatchItem,
18
+ BatchItem,
19
+ FlashCardBatchItem,
17
20
  StructuredEmbeddingsNamespace,
21
+ TestQuestionBatchItem,
18
22
  TestQuestionInput,
19
23
  ToolMetadata,
24
+ TopicBatchItem,
25
+ TopicMetadata,
20
26
  )
21
27
  from .tool_config import (
22
28
  TOOL_CONFIGS,
@@ -37,7 +43,14 @@ __all__ = [
37
43
  "StructuredEmbeddingsNamespace",
38
44
  # Types
39
45
  "ToolMetadata",
46
+ "TopicMetadata",
40
47
  "TestQuestionInput",
48
+ # Batch types
49
+ "BatchItem",
50
+ "FlashCardBatchItem",
51
+ "TestQuestionBatchItem",
52
+ "AudioRecapBatchItem",
53
+ "TopicBatchItem",
41
54
  # Tool configuration
42
55
  "ToolConfig",
43
56
  "ToolDatabaseConfig",
@@ -12,6 +12,7 @@ from ..hash import (
12
12
  AudioRecapSectionData,
13
13
  FlashCardData,
14
14
  ToolCollection,
15
+ TopicData,
15
16
  compute_content_hash,
16
17
  extract_tool_text,
17
18
  )
@@ -47,6 +48,29 @@ class ToolMetadata:
47
48
  return result
48
49
 
49
50
 
51
@dataclass
class TopicMetadata:
    """
    Metadata attached to topic embeddings.

    Every field is optional: unlike ToolMetadata, a topic carries no toolId,
    so an empty ``TopicMetadata()`` is valid.
    """

    user_id: Optional[str] = None
    topic_id: Optional[str] = None
    extra: Optional[dict[str, Any]] = None

    def to_dict(self) -> dict[str, Any]:
        """Return the camelCase dictionary used for document storage.

        Unset or empty identifiers are omitted. Entries from ``extra`` are
        merged last, so they take precedence over ``userId`` / ``topicId``
        when the keys collide.
        """
        identifiers = (("userId", self.user_id), ("topicId", self.topic_id))
        document: dict[str, Any] = {
            key: value for key, value in identifiers if value
        }
        if self.extra:
            document.update(self.extra)
        return document
72
+
73
+
50
74
  @dataclass
51
75
  class TestQuestionInput:
52
76
  """Extended question data with question type."""
@@ -67,6 +91,46 @@ class TestQuestionInput:
67
91
  return result
68
92
 
69
93
 
94
@dataclass
class BatchItem:
    """Batch item for embedding multiple items of the same type.

    Generic variant whose payload is a plain dictionary.
    """

    # Payload to embed, as a plain dictionary.
    data: dict[str, Any]
    # Tool metadata associated with this payload.
    metadata: ToolMetadata
100
+
101
+
102
@dataclass
class FlashCardBatchItem:
    """Batch item for FlashCard embeddings."""

    # Flashcard payload to embed.
    data: FlashCardData
    # Tool metadata associated with this flashcard.
    metadata: ToolMetadata
108
+
109
+
110
@dataclass
class TestQuestionBatchItem:
    """Batch item for TestQuestion embeddings."""

    # Question payload, including its question type.
    data: TestQuestionInput
    # Tool metadata associated with this question.
    metadata: ToolMetadata
116
+
117
+
118
@dataclass
class AudioRecapBatchItem:
    """Batch item for AudioRecap embeddings."""

    # Audio recap section payload to embed.
    data: AudioRecapSectionData
    # Tool metadata associated with this section.
    metadata: ToolMetadata
124
+
125
+
126
@dataclass
class TopicBatchItem:
    """Batch item for Topic embeddings."""

    # Topic payload to embed.
    # NOTE(review): the package README constructs this with a plain dict for
    # `data`; confirm whether dicts are officially supported alongside TopicData.
    data: TopicData
    # Topic metadata; every field on it is optional.
    metadata: TopicMetadata
132
+
133
+
70
134
  # ============================================================================
71
135
  # StructuredEmbeddingsNamespace
72
136
  # ============================================================================
@@ -150,6 +214,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
150
214
  card_type = data.get("type")
151
215
  return self._embed_tool_and_wait("FlashCard", data, metadata, card_type, timeout)
152
216
 
217
def embed_flashcard_batch(
    self,
    items: list[FlashCardBatchItem],
) -> str:
    """
    Embed a batch of flashcards and return the request ID.
    All flashcards in the batch should have the same type for proper namespace routing.

    The card type forwarded as ``sub_type`` is read from ``data["type"]``
    when the payload is a dict and from the ``type`` attribute otherwise, so
    model-style payloads are routed by type as well.

    Args:
        items: List of FlashCardBatchItem objects

    Returns:
        The request ID
    """
    return self._embed_tool_batch(
        "FlashCard",
        [
            {
                "data": item.data,
                "metadata": item.metadata,
                # Non-dict payloads expose the card type as an attribute;
                # fall back to None only when it is genuinely absent.
                "sub_type": (
                    item.data.get("type")
                    if isinstance(item.data, dict)
                    else getattr(item.data, "type", None)
                ),
            }
            for item in items
        ],
    )
242
+
243
def embed_flashcard_batch_and_wait(
    self,
    items: list[FlashCardBatchItem],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Embed a batch of flashcards and wait for the result.
    All flashcards in the batch should have the same type for proper namespace routing.

    The card type forwarded as ``sub_type`` is read from ``data["type"]``
    when the payload is a dict and from the ``type`` attribute otherwise, so
    model-style payloads are routed by type as well.

    Args:
        items: List of FlashCardBatchItem objects
        timeout: Timeout in seconds (default: 60)

    Returns:
        The embedding result
    """
    return self._embed_tool_batch_and_wait(
        "FlashCard",
        [
            {
                "data": item.data,
                "metadata": item.metadata,
                # Non-dict payloads expose the card type as an attribute;
                # fall back to None only when it is genuinely absent.
                "sub_type": (
                    item.data.get("type")
                    if isinstance(item.data, dict)
                    else getattr(item.data, "type", None)
                ),
            }
            for item in items
        ],
        timeout,
    )
271
+
153
272
  # ==========================================================================
154
273
  # TestQuestion Methods
155
274
  # ==========================================================================
@@ -201,6 +320,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
201
320
  timeout,
202
321
  )
203
322
 
323
def embed_test_question_batch(
    self,
    items: list[TestQuestionBatchItem],
) -> str:
    """
    Submit a batch of test questions for embedding.

    Each item is converted to a request entry holding its question data,
    its metadata, and its question_type as the sub-type. All questions in
    the batch should have the same question_type for proper namespace routing.

    Args:
        items: List of TestQuestionBatchItem objects

    Returns:
        The request ID
    """
    entries = []
    for batch_item in items:
        question = batch_item.data
        entries.append(
            {
                "data": question.to_question_data(),
                "metadata": batch_item.metadata,
                "sub_type": question.question_type,
            }
        )
    return self._embed_tool_batch("TestQuestion", entries)
348
+
349
def embed_test_question_batch_and_wait(
    self,
    items: list[TestQuestionBatchItem],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Submit a batch of test questions and block until the result arrives.

    Each item is converted to a request entry holding its question data,
    its metadata, and its question_type as the sub-type. All questions in
    the batch should have the same question_type for proper namespace routing.

    Args:
        items: List of TestQuestionBatchItem objects
        timeout: Timeout in seconds (default: 60)

    Returns:
        The embedding result
    """
    entries = []
    for batch_item in items:
        question = batch_item.data
        entries.append(
            {
                "data": question.to_question_data(),
                "metadata": batch_item.metadata,
                "sub_type": question.question_type,
            }
        )
    return self._embed_tool_batch_and_wait("TestQuestion", entries, timeout)
377
+
204
378
  # ==========================================================================
205
379
  # SpacedTestQuestion Methods
206
380
  # ==========================================================================
@@ -252,6 +426,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
252
426
  timeout,
253
427
  )
254
428
 
429
def embed_spaced_test_question_batch(
    self,
    items: list[TestQuestionBatchItem],
) -> str:
    """
    Submit a batch of spaced test questions for embedding.

    Each item is converted to a request entry holding its question data,
    its metadata, and its question_type as the sub-type. All questions in
    the batch should have the same question_type for proper namespace routing.

    Args:
        items: List of TestQuestionBatchItem objects

    Returns:
        The request ID
    """

    def _as_entry(batch_item):
        # Translate one batch item into the dict shape the batch call expects.
        return {
            "data": batch_item.data.to_question_data(),
            "metadata": batch_item.metadata,
            "sub_type": batch_item.data.question_type,
        }

    return self._embed_tool_batch(
        "SpacedTestQuestion",
        [_as_entry(batch_item) for batch_item in items],
    )
454
+
455
def embed_spaced_test_question_batch_and_wait(
    self,
    items: list[TestQuestionBatchItem],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Submit a batch of spaced test questions and block until the result arrives.

    Each item is converted to a request entry holding its question data,
    its metadata, and its question_type as the sub-type. All questions in
    the batch should have the same question_type for proper namespace routing.

    Args:
        items: List of TestQuestionBatchItem objects
        timeout: Timeout in seconds (default: 60)

    Returns:
        The embedding result
    """

    def _as_entry(batch_item):
        # Translate one batch item into the dict shape the batch call expects.
        return {
            "data": batch_item.data.to_question_data(),
            "metadata": batch_item.metadata,
            "sub_type": batch_item.data.question_type,
        }

    return self._embed_tool_batch_and_wait(
        "SpacedTestQuestion",
        [_as_entry(batch_item) for batch_item in items],
        timeout,
    )
483
+
255
484
  # ==========================================================================
256
485
  # AudioRecap Methods
257
486
  # ==========================================================================
@@ -298,6 +527,401 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
298
527
  timeout,
299
528
  )
300
529
 
530
def embed_audio_recap_batch(
    self,
    items: list[AudioRecapBatchItem],
) -> str:
    """
    Submit a batch of audio recap sections for embedding.

    Items are forwarded to ``_embed_tool_batch`` under the
    ``"AudioRecapV2Section"`` tool collection; ``sub_type`` is always
    ``None`` for this collection.

    Args:
        items: List of AudioRecapBatchItem objects

    Returns:
        The request ID
    """
    payloads = []
    for entry in items:
        payloads.append(
            {
                "data": entry.data,
                "metadata": entry.metadata,
                "sub_type": None,
            }
        )
    return self._embed_tool_batch("AudioRecapV2Section", payloads)
554
+
555
def embed_audio_recap_batch_and_wait(
    self,
    items: list[AudioRecapBatchItem],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Submit a batch of audio recap sections and block until the result arrives.

    Items are forwarded to ``_embed_tool_batch_and_wait`` under the
    ``"AudioRecapV2Section"`` tool collection; ``sub_type`` is always
    ``None`` for this collection.

    Args:
        items: List of AudioRecapBatchItem objects
        timeout: Timeout in seconds (default: 60)

    Returns:
        The embedding result
    """
    payloads = []
    for entry in items:
        payloads.append(
            {
                "data": entry.data,
                "metadata": entry.metadata,
                "sub_type": None,
            }
        )
    return self._embed_tool_batch_and_wait("AudioRecapV2Section", payloads, timeout)
582
+
583
+ # ==========================================================================
584
+ # Topic Methods
585
+ # ==========================================================================
586
+
587
def embed_topic(
    self,
    data: TopicData,
    metadata: TopicMetadata,
) -> str:
    """
    Submit a single topic for embedding.

    Delegates to ``_embed_topic_internal`` with the ``"Topic"`` tool
    collection and no sub-type.

    Args:
        data: Topic data (topic, description)
        metadata: Topic metadata (all fields optional)

    Returns:
        The request ID
    """
    request_id = self._embed_topic_internal(
        tool_collection="Topic",
        data=data,
        metadata=metadata,
        sub_type=None,
    )
    return request_id
603
+
604
def embed_topic_and_wait(
    self,
    data: TopicData,
    metadata: TopicMetadata,
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Submit a single topic for embedding and block until the result arrives.

    Delegates to ``_embed_topic_internal_and_wait`` with the ``"Topic"``
    tool collection and no sub-type.

    Args:
        data: Topic data
        metadata: Topic metadata (all fields optional)
        timeout: Timeout in seconds (default: 60)

    Returns:
        The embedding result
    """
    result = self._embed_topic_internal_and_wait(
        tool_collection="Topic",
        data=data,
        metadata=metadata,
        sub_type=None,
        timeout=timeout,
    )
    return result
622
+
623
def embed_topic_batch(
    self,
    items: list[TopicBatchItem],
) -> str:
    """
    Submit a batch of topics for embedding.

    Public entry point; the work is done by ``_embed_topic_batch_internal``.

    Args:
        items: List of TopicBatchItem objects

    Returns:
        The request ID
    """
    request_id = self._embed_topic_batch_internal(items=items)
    return request_id
637
+
638
def embed_topic_batch_and_wait(
    self,
    items: list[TopicBatchItem],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Submit a batch of topics for embedding and block until the result arrives.

    Public entry point; the work is done by
    ``_embed_topic_batch_internal_and_wait``.

    Args:
        items: List of TopicBatchItem objects
        timeout: Timeout in seconds (default: 60)

    Returns:
        The embedding result
    """
    result = self._embed_topic_batch_internal_and_wait(items=items, timeout=timeout)
    return result
654
+
655
+ # ==========================================================================
656
+ # Internal Topic Methods (using TopicMetadata)
657
+ # ==========================================================================
658
+
659
def _embed_topic_internal(
    self,
    tool_collection: ToolCollection,
    data: TopicData,
    metadata: TopicMetadata,
    sub_type: Optional[str],
) -> str:
    """
    Build and submit a single-topic embedding request.

    Args:
        tool_collection: Tool collection the topic is embedded under
        data: Topic data to extract text from and hash
        metadata: Topic metadata merged into the stored document
        sub_type: Sub-type forwarded to ``build_storage_config``

    Returns:
        The request ID returned by ``self._embeddings.create``

    Raises:
        ValueError: If text extraction or content hashing yields an empty value
    """
    # Text to embed, derived from the tool payload.
    embed_text = extract_tool_text(dict(toolCollection=tool_collection, data=data))
    if not embed_text:
        raise ValueError(
            f"Failed to extract text from {tool_collection} - empty content"
        )

    # The content hash serves as both the document id and the stored contentHash.
    digest = compute_content_hash(dict(toolCollection=tool_collection, data=data))
    if not digest:
        raise ValueError(
            f"Failed to compute content hash for {tool_collection} - empty content"
        )

    config = get_tool_config(tool_collection)

    # Caller metadata first, then the fields this method owns, so the latter
    # win on key collisions (TopicMetadata doesn't have toolId).
    doc = dict(metadata.to_dict())
    doc["toolCollection"] = tool_collection
    doc["contentHash"] = digest

    storage = build_storage_config(
        tool_collection=tool_collection,
        sub_type=sub_type,
        content_hash=digest,
        document_fields=doc,
    )

    texts = [{"id": digest, "text": embed_text, "document": doc}]
    request_metadata = {
        "toolCollection": tool_collection,
        "contentHash": digest,
    }

    return self._embeddings.create(
        texts=texts,
        content_type=get_content_type(tool_collection),
        priority=config.default_priority,
        storage=storage,
        metadata=request_metadata,
        embedding_model=config.model,
        embedding_dimensions=config.dimensions,
    )
721
+
722
def _embed_topic_internal_and_wait(
    self,
    tool_collection: ToolCollection,
    data: TopicData,
    metadata: TopicMetadata,
    sub_type: Optional[str],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Build and submit a single-topic embedding request, then wait for the result.

    Args:
        tool_collection: Tool collection the topic is embedded under
        data: Topic data to extract text from and hash
        metadata: Topic metadata merged into the stored document
        sub_type: Sub-type forwarded to ``build_storage_config``
        timeout: Timeout in seconds (default: 60)

    Returns:
        The result returned by ``self._embeddings.create_and_wait``

    Raises:
        ValueError: If text extraction or content hashing yields an empty value
    """
    # Text to embed, derived from the tool payload.
    embed_text = extract_tool_text(dict(toolCollection=tool_collection, data=data))
    if not embed_text:
        raise ValueError(
            f"Failed to extract text from {tool_collection} - empty content"
        )

    # The content hash serves as both the document id and the stored contentHash.
    digest = compute_content_hash(dict(toolCollection=tool_collection, data=data))
    if not digest:
        raise ValueError(
            f"Failed to compute content hash for {tool_collection} - empty content"
        )

    config = get_tool_config(tool_collection)

    # Caller metadata first, then the fields this method owns, so the latter
    # win on key collisions.
    doc = dict(metadata.to_dict())
    doc["toolCollection"] = tool_collection
    doc["contentHash"] = digest

    storage = build_storage_config(
        tool_collection=tool_collection,
        sub_type=sub_type,
        content_hash=digest,
        document_fields=doc,
    )

    texts = [{"id": digest, "text": embed_text, "document": doc}]
    request_metadata = {
        "toolCollection": tool_collection,
        "contentHash": digest,
    }

    return self._embeddings.create_and_wait(
        texts=texts,
        content_type=get_content_type(tool_collection),
        priority=config.default_priority,
        storage=storage,
        metadata=request_metadata,
        embedding_model=config.model,
        embedding_dimensions=config.dimensions,
        timeout=timeout,
    )
786
+
787
def _embed_topic_batch_internal(
    self,
    items: list[TopicBatchItem],
) -> str:
    """
    Build and submit one embedding request covering a batch of topics.

    Args:
        items: List of TopicBatchItem objects; must not be empty

    Returns:
        The request ID returned by ``self._embeddings.create``

    Raises:
        ValueError: If the batch is empty, or if text extraction or content
            hashing yields an empty value for any item
    """
    if not items:
        raise ValueError("Batch cannot be empty")

    tool_collection: ToolCollection = "Topic"
    config = get_tool_config(tool_collection)

    texts = []
    for entry in items:
        topic_data = entry.data
        topic_meta = entry.metadata

        embed_text = extract_tool_text(
            dict(toolCollection=tool_collection, data=topic_data)
        )
        if not embed_text:
            raise ValueError(
                f"Failed to extract text from {tool_collection} - empty content"
            )

        digest = compute_content_hash(
            dict(toolCollection=tool_collection, data=topic_data)
        )
        if not digest:
            raise ValueError(
                f"Failed to compute content hash for {tool_collection} - empty content"
            )

        # Caller metadata first so the fields set here win on key collisions
        # (TopicMetadata doesn't have toolId).
        doc = dict(topic_meta.to_dict())
        doc["toolCollection"] = tool_collection
        doc["contentHash"] = digest

        texts.append({"id": digest, "text": embed_text, "document": doc})

    # The storage config is derived from the first processed item only.
    head = texts[0]
    storage = build_storage_config(
        tool_collection=tool_collection,
        sub_type=None,
        content_hash=head["id"],
        document_fields=head["document"],
    )

    request_metadata = {
        "toolCollection": tool_collection,
        "batchSize": len(items),
    }

    return self._embeddings.create(
        texts=texts,
        content_type=get_content_type(tool_collection),
        priority=config.default_priority,
        storage=storage,
        metadata=request_metadata,
        embedding_model=config.model,
        embedding_dimensions=config.dimensions,
    )
854
+
855
def _embed_topic_batch_internal_and_wait(
    self,
    items: list[TopicBatchItem],
    timeout: int = 60,
) -> EmbeddingResult:
    """
    Build and submit one embedding request for a batch of topics, then wait.

    Args:
        items: List of TopicBatchItem objects; must not be empty
        timeout: Timeout in seconds (default: 60)

    Returns:
        The result returned by ``self._embeddings.create_and_wait``

    Raises:
        ValueError: If the batch is empty, or if text extraction or content
            hashing yields an empty value for any item
    """
    if not items:
        raise ValueError("Batch cannot be empty")

    tool_collection: ToolCollection = "Topic"
    config = get_tool_config(tool_collection)

    texts = []
    for entry in items:
        topic_data = entry.data
        topic_meta = entry.metadata

        embed_text = extract_tool_text(
            dict(toolCollection=tool_collection, data=topic_data)
        )
        if not embed_text:
            raise ValueError(
                f"Failed to extract text from {tool_collection} - empty content"
            )

        digest = compute_content_hash(
            dict(toolCollection=tool_collection, data=topic_data)
        )
        if not digest:
            raise ValueError(
                f"Failed to compute content hash for {tool_collection} - empty content"
            )

        # Caller metadata first so the fields set here win on key collisions.
        doc = dict(topic_meta.to_dict())
        doc["toolCollection"] = tool_collection
        doc["contentHash"] = digest

        texts.append({"id": digest, "text": embed_text, "document": doc})

    # The storage config is derived from the first processed item only.
    head = texts[0]
    storage = build_storage_config(
        tool_collection=tool_collection,
        sub_type=None,
        content_hash=head["id"],
        document_fields=head["document"],
    )

    request_metadata = {
        "toolCollection": tool_collection,
        "batchSize": len(items),
    }

    return self._embeddings.create_and_wait(
        texts=texts,
        content_type=get_content_type(tool_collection),
        priority=config.default_priority,
        storage=storage,
        metadata=request_metadata,
        embedding_model=config.model,
        embedding_dimensions=config.dimensions,
        timeout=timeout,
    )
924
+
301
925
  # ==========================================================================
302
926
  # Internal Methods
303
927
  # ==========================================================================
@@ -429,3 +1053,164 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
429
1053
  embedding_dimensions=tool_config.dimensions,
430
1054
  timeout=timeout,
431
1055
  )
1056
+
1057
+ def _embed_tool_batch(
1058
+ self,
1059
+ tool_collection: ToolCollection,
1060
+ items: list[dict[str, Any]],
1061
+ ) -> str:
1062
+ """
1063
+ Internal method to embed a batch of items of the same tool type.
1064
+
1065
+ Args:
1066
+ tool_collection: The tool collection type
1067
+ items: List of dicts with 'data', 'metadata', and optional 'sub_type' keys
1068
+
1069
+ Returns:
1070
+ The request ID
1071
+ """
1072
+ if not items:
1073
+ raise ValueError("Batch cannot be empty")
1074
+
1075
+ # Get tool config (same for all items)
1076
+ tool_config = get_tool_config(tool_collection)
1077
+
1078
+ # Process each item
1079
+ text_inputs = []
1080
+ for item in items:
1081
+ data = item["data"]
1082
+ metadata = item["metadata"]
1083
+
1084
+ # Extract text
1085
+ text = extract_tool_text({"toolCollection": tool_collection, "data": data})
1086
+ if not text:
1087
+ raise ValueError(
1088
+ f"Failed to extract text from {tool_collection} - empty content"
1089
+ )
1090
+
1091
+ # Compute content hash
1092
+ content_hash = compute_content_hash(
1093
+ {"toolCollection": tool_collection, "data": data}
1094
+ )
1095
+ if not content_hash:
1096
+ raise ValueError(
1097
+ f"Failed to compute content hash for {tool_collection} - empty content"
1098
+ )
1099
+
1100
+ # Build document with metadata
1101
+ document = {
1102
+ **metadata.to_dict(),
1103
+ "toolCollection": tool_collection,
1104
+ "contentHash": content_hash,
1105
+ }
1106
+
1107
+ text_inputs.append({
1108
+ "id": content_hash,
1109
+ "text": text,
1110
+ "document": document,
1111
+ })
1112
+
1113
+ # Build storage config using first item's sub_type
1114
+ first_item = items[0]
1115
+ storage_config = build_storage_config(
1116
+ tool_collection=tool_collection,
1117
+ sub_type=first_item.get("sub_type"),
1118
+ content_hash=text_inputs[0]["id"],
1119
+ document_fields=text_inputs[0]["document"],
1120
+ )
1121
+
1122
+ # Submit batch to embeddings namespace
1123
+ return self._embeddings.create(
1124
+ texts=text_inputs,
1125
+ content_type=get_content_type(tool_collection),
1126
+ priority=tool_config.default_priority,
1127
+ storage=storage_config,
1128
+ metadata={
1129
+ "toolCollection": tool_collection,
1130
+ "batchSize": len(items),
1131
+ },
1132
+ embedding_model=tool_config.model,
1133
+ embedding_dimensions=tool_config.dimensions,
1134
+ )
1135
+
1136
+ def _embed_tool_batch_and_wait(
1137
+ self,
1138
+ tool_collection: ToolCollection,
1139
+ items: list[dict[str, Any]],
1140
+ timeout: int = 60,
1141
+ ) -> EmbeddingResult:
1142
+ """
1143
+ Internal method to embed a batch of items and wait for result.
1144
+
1145
+ Args:
1146
+ tool_collection: The tool collection type
1147
+ items: List of dicts with 'data', 'metadata', and optional 'sub_type' keys
1148
+ timeout: Timeout in seconds (default: 60)
1149
+
1150
+ Returns:
1151
+ The embedding result
1152
+ """
1153
+ if not items:
1154
+ raise ValueError("Batch cannot be empty")
1155
+
1156
+ # Get tool config (same for all items)
1157
+ tool_config = get_tool_config(tool_collection)
1158
+
1159
+ # Process each item
1160
+ text_inputs = []
1161
+ for item in items:
1162
+ data = item["data"]
1163
+ metadata = item["metadata"]
1164
+
1165
+ # Extract text
1166
+ text = extract_tool_text({"toolCollection": tool_collection, "data": data})
1167
+ if not text:
1168
+ raise ValueError(
1169
+ f"Failed to extract text from {tool_collection} - empty content"
1170
+ )
1171
+
1172
+ # Compute content hash
1173
+ content_hash = compute_content_hash(
1174
+ {"toolCollection": tool_collection, "data": data}
1175
+ )
1176
+ if not content_hash:
1177
+ raise ValueError(
1178
+ f"Failed to compute content hash for {tool_collection} - empty content"
1179
+ )
1180
+
1181
+ # Build document with metadata
1182
+ document = {
1183
+ **metadata.to_dict(),
1184
+ "toolCollection": tool_collection,
1185
+ "contentHash": content_hash,
1186
+ }
1187
+
1188
+ text_inputs.append({
1189
+ "id": content_hash,
1190
+ "text": text,
1191
+ "document": document,
1192
+ })
1193
+
1194
+ # Build storage config using first item's sub_type
1195
+ first_item = items[0]
1196
+ storage_config = build_storage_config(
1197
+ tool_collection=tool_collection,
1198
+ sub_type=first_item.get("sub_type"),
1199
+ content_hash=text_inputs[0]["id"],
1200
+ document_fields=text_inputs[0]["document"],
1201
+ )
1202
+
1203
+ # Submit batch and wait
1204
+ return self._embeddings.create_and_wait(
1205
+ texts=text_inputs,
1206
+ content_type=get_content_type(tool_collection),
1207
+ priority=tool_config.default_priority,
1208
+ storage=storage_config,
1209
+ metadata={
1210
+ "toolCollection": tool_collection,
1211
+ "batchSize": len(items),
1212
+ },
1213
+ embedding_model=tool_config.model,
1214
+ embedding_dimensions=tool_config.dimensions,
1215
+ timeout=timeout,
1216
+ )
@@ -144,6 +144,25 @@ TOOL_CONFIGS: dict[ToolCollection, ToolConfig] = {
144
144
  namespace_pattern="audiorecapv2section",
145
145
  ),
146
146
  ),
147
+ "Topic": ToolConfig(
148
+ tool_collection="Topic",
149
+ model="gemini-embedding-001",
150
+ dimensions=3072,
151
+ default_priority=PRIORITY_NORMAL,
152
+ turbopuffer=TurboPufferToolConfig(
153
+ enabled=True,
154
+ id_field="contentHash",
155
+ metadata_fields=_DEFAULT_METADATA_FIELDS,
156
+ namespace_pattern="topic_vectors",
157
+ ),
158
+ pinecone=PineconeToolConfig(
159
+ enabled=False,
160
+ index_name="tool-vectors",
161
+ id_field="contentHash",
162
+ metadata_fields=_DEFAULT_METADATA_FIELDS,
163
+ namespace_pattern="topic_vectors",
164
+ ),
165
+ ),
147
166
  }
148
167
 
149
168
 
@@ -198,8 +217,8 @@ def get_turbopuffer_namespace(
198
217
  config = TOOL_CONFIGS[tool_collection]
199
218
  pattern = config.turbopuffer.namespace_pattern
200
219
 
201
- # AudioRecapV2Section doesn't have sub-types
202
- if tool_collection == "AudioRecapV2Section":
220
+ # AudioRecapV2Section and Topic don't have sub-types
221
+ if tool_collection in ("AudioRecapV2Section", "Topic"):
203
222
  return pattern
204
223
 
205
224
  # Derive the type suffix
@@ -228,8 +247,8 @@ def get_pinecone_namespace(
228
247
  config = TOOL_CONFIGS[tool_collection]
229
248
  pattern = config.pinecone.namespace_pattern
230
249
 
231
- # AudioRecapV2Section doesn't have sub-types
232
- if tool_collection == "AudioRecapV2Section":
250
+ # AudioRecapV2Section and Topic don't have sub-types
251
+ if tool_collection in ("AudioRecapV2Section", "Topic"):
233
252
  return pattern
234
253
 
235
254
  # Derive the type suffix