sf-vector-sdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.2.dist-info}/METADATA +38 -4
- {sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.2.dist-info}/RECORD +15 -15
- vector_sdk/__init__.py +17 -1
- vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py +2 -2
- vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py +2 -2
- vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py +2 -2
- vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py +13 -7
- vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi +23 -1
- vector_sdk/hash/__init__.py +2 -0
- vector_sdk/hash/hasher.py +28 -2
- vector_sdk/hash/types.py +10 -1
- vector_sdk/structured/__init__.py +13 -0
- vector_sdk/structured/structured_embeddings.py +785 -0
- vector_sdk/structured/tool_config.py +23 -4
- {sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.2.dist-info}/WHEEL +0 -0

{sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sf-vector-sdk
-Version: 0.2.0
+Version: 0.2.2
 Summary: Python SDK for the Vector Gateway service (embeddings and vector search)
 Requires-Python: >=3.11
 Requires-Dist: redis>=5.0.0
@@ -220,25 +220,42 @@ Type-safe embedding for known tool types (FlashCard, TestQuestion, etc.) with au
 |--------|-------------|
 | `embed_flashcard(data, metadata)` | Embed a flashcard, return request ID |
 | `embed_flashcard_and_wait(data, metadata, timeout)` | Embed and wait for result |
+| `embed_flashcard_batch(items)` | Embed batch of flashcards, return request ID |
+| `embed_flashcard_batch_and_wait(items, timeout)` | Embed batch and wait for result |
 | `embed_test_question(data, metadata)` | Embed a test question, return request ID |
 | `embed_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
+| `embed_test_question_batch(items)` | Embed batch of test questions, return request ID |
+| `embed_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
 | `embed_spaced_test_question(data, metadata)` | Embed a spaced test question, return request ID |
 | `embed_spaced_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
+| `embed_spaced_test_question_batch(items)` | Embed batch of spaced test questions, return request ID |
+| `embed_spaced_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
 | `embed_audio_recap(data, metadata)` | Embed an audio recap section, return request ID |
 | `embed_audio_recap_and_wait(data, metadata, timeout)` | Embed and wait for result |
+| `embed_audio_recap_batch(items)` | Embed batch of audio recaps, return request ID |
+| `embed_audio_recap_batch_and_wait(items, timeout)` | Embed batch and wait for result |
+| `embed_topic(data, metadata)` | Embed a topic (uses `TopicMetadata`), return request ID |
+| `embed_topic_and_wait(data, metadata, timeout)` | Embed and wait for result (uses `TopicMetadata`) |
+| `embed_topic_batch(items)` | Embed batch of topics (uses `TopicMetadata`), return request ID |
+| `embed_topic_batch_and_wait(items, timeout)` | Embed batch and wait for result (uses `TopicMetadata`) |
+
+**Metadata Types:**
+
+- `ToolMetadata` - For tools (FlashCard, TestQuestion, etc.) - requires `tool_id`
+- `TopicMetadata` - For topics only - all fields optional (`user_id`, `topic_id`)
 
 ```python
-from vector_sdk import VectorClient, ToolMetadata, TestQuestionInput
+from vector_sdk import VectorClient, ToolMetadata, TopicMetadata, TestQuestionInput
 
 client = VectorClient(redis_url="redis://localhost:6379")
 
-# Embed a flashcard -
+# Embed a flashcard - uses ToolMetadata (tool_id required)
 result = client.structured_embeddings.embed_flashcard_and_wait(
     data={"type": "BASIC", "term": "Mitochondria", "definition": "The powerhouse of the cell"},
     metadata=ToolMetadata(tool_id="tool123", user_id="user456", topic_id="topic789"),
 )
 
-# Embed a test question
+# Embed a test question - uses ToolMetadata (tool_id required)
 result = client.structured_embeddings.embed_test_question_and_wait(
     data=TestQuestionInput(
         question="What is the capital?",
@@ -247,6 +264,23 @@ result = client.structured_embeddings.embed_test_question_and_wait(
     ),
     metadata=ToolMetadata(tool_id="tool456"),
 )
+
+# Embed a topic - uses TopicMetadata (all fields optional)
+result = client.structured_embeddings.embed_topic_and_wait(
+    data={"topic": "Photosynthesis", "description": "The process by which plants convert sunlight to energy"},
+    metadata=TopicMetadata(user_id="user123", topic_id="topic456"),  # No tool_id needed
+)
+
+# Batch embedding - embed multiple topics in a single request
+from vector_sdk import TopicBatchItem
+
+batch_result = client.structured_embeddings.embed_topic_batch_and_wait(
+    items=[
+        TopicBatchItem(data={"topic": "Topic 1", "description": "Description 1"}, metadata=TopicMetadata(user_id="user1")),
+        TopicBatchItem(data={"topic": "Topic 2", "description": "Description 2"}, metadata=TopicMetadata(topic_id="topic2")),
+        TopicBatchItem(data={"topic": "Topic 3", "description": "Description 3"}, metadata=TopicMetadata()),  # All optional
+    ],
+)
 ```
 
 **Database Routing:**
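As a companion to the README snippet above, here is a minimal sketch of the new batch API for flashcards; the identifiers come from this diff, while the Redis URL and sample card values are illustrative assumptions.

```python
# Sketch only: batch counterpart of the single-flashcard example above.
# Assumes a Vector Gateway worker is consuming from this Redis instance.
from vector_sdk import FlashCardBatchItem, ToolMetadata, VectorClient

client = VectorClient(redis_url="redis://localhost:6379")

items = [
    FlashCardBatchItem(
        data={"type": "BASIC", "term": "Ribosome", "definition": "Site of protein synthesis"},
        metadata=ToolMetadata(tool_id="tool124", user_id="user456", topic_id="topic789"),
    ),
    FlashCardBatchItem(
        data={"type": "BASIC", "term": "Nucleus", "definition": "Contains the cell's DNA"},
        metadata=ToolMetadata(tool_id="tool125", user_id="user456", topic_id="topic789"),
    ),
]

# All items share the same flashcard type ("BASIC") so they route to one namespace.
result = client.structured_embeddings.embed_flashcard_batch_and_wait(items=items, timeout=60)
```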

{sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.2.dist-info}/RECORD
CHANGED
@@ -1,27 +1,27 @@
-vector_sdk/__init__.py,sha256=
+vector_sdk/__init__.py,sha256=3VdEG4tOuwTAWVvx9J-rOTuVY5RM-7tHzdL-ZLxRCYI,6979
 vector_sdk/client.py,sha256=NQFGHyR1aM0UToRFy6e9Xm_v6mk0opqzKN8UlHu97n0,17186
 vector_sdk/content_types.py,sha256=krvFOR58iUZPfYlEVsk0sXD6_ANAFbxEBQGNpt1YPDU,7381
 vector_sdk/types.py,sha256=rQgA2z3ls21vY-DRPZgfmm8gYFkWJk1dQaJI-nbc0no,25514
-vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py,sha256=
+vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py,sha256=5dW14j_DyIPKCaFI2cxCKKtQoLMGtRqV3aiRZ8Utxw4,5962
 vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.pyi,sha256=fOw6liHkiXSEyvEZ_QKexDUgFNhbemuGuk52hwQ5pnQ,6738
-vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py,sha256=
+vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py,sha256=nFmjLnJJh5H-t25FJ8oP7jLH-mAcuEw-EK0U-dYlgDI,7057
 vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.pyi,sha256=lxZ27fReDhHv2mKDCiPvKpicXuJObZX2zpfVYuGTk3I,8068
-vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py,sha256=
+vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py,sha256=X0EIUHMCt9a0L5iVjQXkdi2zKi7xKwPbEUeU8gNPeTk,5579
 vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.pyi,sha256=a-rWfFQVAdZM5jK1qHB0bUiuSo6brcB-zUIHQezi0I8,5598
-vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py,sha256=
-vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi,sha256=
-vector_sdk/hash/__init__.py,sha256=
-vector_sdk/hash/hasher.py,sha256=
-vector_sdk/hash/types.py,sha256=
+vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py,sha256=cf4PCZK-OtfLMyCuac0XqpZ6MQxk2XH4cy3QRu1_i8I,3094
+vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi,sha256=WKj_iRAuhXMNH3a2tf5j-ERYE5HLKamJTcQXm88JjDo,2451
+vector_sdk/hash/__init__.py,sha256=if-8tGOPyGUZy0_joGH66moE0e5zzwSzfUeMqP_8QsU,723
+vector_sdk/hash/hasher.py,sha256=k5VSQB-T0TtBM5ipaVE_TQu_uiaiWNjOWSbByxjriwQ,8618
+vector_sdk/hash/types.py,sha256=RHDM-ob9cOHPGMI7tXqiN_ZRowTPSc3GYHf8terrd8U,1983
 vector_sdk/namespaces/__init__.py,sha256=S9dJfB39s2zjYOpFn9Fvf8bk7mLKcXk5aPatKOA-xO0,374
 vector_sdk/namespaces/base.py,sha256=lioZBcd43mijnN0JwTMMEpQ6whiAjaueTDAAIZS1JM0,1156
 vector_sdk/namespaces/db.py,sha256=a5sEHrfy1xAjRjyM9qfZxr3IznZVA8BnY5W1Hq5jr4I,7230
 vector_sdk/namespaces/embeddings.py,sha256=7hH0hvBAeDf-ypTtOzUAqzc3W6wci_dbt_ZPavcRVyU,8950
 vector_sdk/namespaces/search.py,sha256=bwtZ_rTiP6q-dg8oOM5YA6taDHSphO88aq7RSuzc-tQ,8894
-vector_sdk/structured/__init__.py,sha256=
+vector_sdk/structured/__init__.py,sha256=ZUhrH_l7bX5vA78DSKqDucWhfhYmkDX-W_MPzo5J9JU,1758
 vector_sdk/structured/router.py,sha256=F3O1TYtbVFCPqVWCCYCt5QcRffX5WPlPQ7K3KlayooQ,5792
-vector_sdk/structured/structured_embeddings.py,sha256=
-vector_sdk/structured/tool_config.py,sha256=
-sf_vector_sdk-0.2.
-sf_vector_sdk-0.2.
-sf_vector_sdk-0.2.
+vector_sdk/structured/structured_embeddings.py,sha256=Z0enOHx4vdhxAs0sbk9B6XHtRjZSfeYbNbtbq9f8Hh8,37147
+vector_sdk/structured/tool_config.py,sha256=YJp-S2_mwoODHWaWJHnGJRaKXuuqbbm2dYHTum2BuG4,8138
+sf_vector_sdk-0.2.2.dist-info/METADATA,sha256=JTf4o16e5REDLegscjWMbJcvdLVxDUCrwdCEAcH4fgk,15915
+sf_vector_sdk-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sf_vector_sdk-0.2.2.dist-info/RECORD,,
vector_sdk/__init__.py
CHANGED
@@ -70,6 +70,7 @@ from vector_sdk.hash import (
     MultipleChoiceOption,
     QuestionData,
     ToolCollection,
+    TopicData,
     compute_content_hash,
     extract_tool_text,
 )
@@ -89,15 +90,21 @@ from vector_sdk.namespaces import (
 # ============================================================================
 from vector_sdk.structured import (
     TOOL_CONFIGS,
+    AudioRecapBatchItem,
+    BatchItem,
     DatabaseRoutingError,
     DatabaseRoutingMode,
+    FlashCardBatchItem,
     PineconeToolConfig,
     QuestionType,
     StructuredEmbeddingsNamespace,
+    TestQuestionBatchItem,
     TestQuestionInput,
     ToolConfig,
     ToolDatabaseConfig,
     ToolMetadata,
+    TopicBatchItem,
+    TopicMetadata,
     TurboPufferToolConfig,
     build_storage_config,
     get_content_type,
@@ -159,7 +166,7 @@ from vector_sdk.types import (
     validate_model,
 )
 
-__version__ = "0.2.0"
+__version__ = "0.2.2"
 
 __all__ = [
     # Clients (New API)
@@ -236,12 +243,21 @@ __all__ = [
     "FlashCardData",
     "QuestionData",
     "AudioRecapSectionData",
+    "TopicData",
     "MultipleChoiceOption",
     "AnswerObject",
     # Structured Embeddings
     "StructuredEmbeddingsNamespace",
     "ToolMetadata",
+    "TopicMetadata",
     "TestQuestionInput",
+    # Batch types
+    "BatchItem",
+    "FlashCardBatchItem",
+    "TestQuestionBatchItem",
+    "AudioRecapBatchItem",
+    "TopicBatchItem",
+    # Tool configuration
     "ToolConfig",
     "ToolDatabaseConfig",
     "TurboPufferToolConfig",

vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py
CHANGED
@@ -2,7 +2,7 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # NO CHECKED-IN PROTOBUF GENCODE
 # source: embedding_pipeline/content_types/v1/content_types.proto
-# Protobuf Python Version: 6.33.
+# Protobuf Python Version: 6.33.5
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
     _runtime_version.Domain.PUBLIC,
     6,
     33,
-
+    5,
     '',
     'embedding_pipeline/content_types/v1/content_types.proto'
 )

vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py
CHANGED
@@ -2,7 +2,7 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # NO CHECKED-IN PROTOBUF GENCODE
 # source: embedding_pipeline/db/vectors/v1/vectors.proto
-# Protobuf Python Version: 6.33.
+# Protobuf Python Version: 6.33.5
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
     _runtime_version.Domain.PUBLIC,
     6,
     33,
-
+    5,
     '',
     'embedding_pipeline/db/vectors/v1/vectors.proto'
 )

vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py
CHANGED
@@ -2,7 +2,7 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # NO CHECKED-IN PROTOBUF GENCODE
 # source: embedding_pipeline/query/v1/query.proto
-# Protobuf Python Version: 6.33.
+# Protobuf Python Version: 6.33.5
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
     _runtime_version.Domain.PUBLIC,
     6,
     33,
-
+    5,
     '',
     'embedding_pipeline/query/v1/query.proto'
 )

vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py
CHANGED
@@ -2,7 +2,7 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # NO CHECKED-IN PROTOBUF GENCODE
 # source: embedding_pipeline/tools/v1/tools.proto
-# Protobuf Python Version: 6.33.
+# Protobuf Python Version: 6.33.5
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
     _runtime_version.Domain.PUBLIC,
     6,
     33,
-
+    5,
     '',
     'embedding_pipeline/tools/v1/tools.proto'
 )
@@ -24,7 +24,7 @@ _sym_db = _symbol_database.Default()



-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/tools/v1/tools.proto\x12\x1b\x65mbedding_pipeline.tools.v1*\
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/tools/v1/tools.proto\x12\x1b\x65mbedding_pipeline.tools.v1\"\xed\x01\n\rTopicMetadata\x12\x1c\n\x07user_id\x18\x01 \x01(\tH\x00R\x06userId\x88\x01\x01\x12\x1e\n\x08topic_id\x18\x02 \x01(\tH\x01R\x07topicId\x88\x01\x01\x12K\n\x05\x65xtra\x18\x03 \x03(\x0b\x32\x35.embedding_pipeline.tools.v1.TopicMetadata.ExtraEntryR\x05\x65xtra\x1a\x38\n\nExtraEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08_user_idB\x0b\n\t_topic_id*\xe4\x01\n\x0eToolCollection\x12\x1f\n\x1bTOOL_COLLECTION_UNSPECIFIED\x10\x00\x12\x1d\n\x19TOOL_COLLECTION_FLASHCARD\x10\x01\x12!\n\x1dTOOL_COLLECTION_TEST_QUESTION\x10\x02\x12(\n$TOOL_COLLECTION_SPACED_TEST_QUESTION\x10\x03\x12*\n&TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION\x10\x04\x12\x19\n\x15TOOL_COLLECTION_TOPIC\x10\x05*\xb2\x01\n\rFlashCardType\x12\x1f\n\x1b\x46LASH_CARD_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15\x46LASH_CARD_TYPE_BASIC\x10\x01\x12\x19\n\x15\x46LASH_CARD_TYPE_CLOZE\x10\x02\x12%\n!FLASH_CARD_TYPE_FILL_IN_THE_BLANK\x10\x03\x12#\n\x1f\x46LASH_CARD_TYPE_MULTIPLE_CHOICE\x10\x04\x42gZegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1b\x06proto3')

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -32,8 +32,14 @@ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'embedding_pipeline.tools.v1
 if not _descriptor._USE_C_DESCRIPTORS:
   _globals['DESCRIPTOR']._loaded_options = None
   _globals['DESCRIPTOR']._serialized_options = b'Zegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1'
-  _globals['
-  _globals['
-  _globals['
-  _globals['
+  _globals['_TOPICMETADATA_EXTRAENTRY']._loaded_options = None
+  _globals['_TOPICMETADATA_EXTRAENTRY']._serialized_options = b'8\001'
+  _globals['_TOOLCOLLECTION']._serialized_start=313
+  _globals['_TOOLCOLLECTION']._serialized_end=541
+  _globals['_FLASHCARDTYPE']._serialized_start=544
+  _globals['_FLASHCARDTYPE']._serialized_end=722
+  _globals['_TOPICMETADATA']._serialized_start=73
+  _globals['_TOPICMETADATA']._serialized_end=310
+  _globals['_TOPICMETADATA_EXTRAENTRY']._serialized_start=229
+  _globals['_TOPICMETADATA_EXTRAENTRY']._serialized_end=285
 # @@protoc_insertion_point(module_scope)
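The serialized descriptor above adds a `TopicMetadata` message (optional `user_id` and `topic_id` plus a `map<string, string> extra` field) alongside the new `TOOL_COLLECTION_TOPIC` enum value. A hedged round-trip sketch using the standard protobuf message API, with the import path taken from the RECORD listing:

```python
# Sketch: construct and round-trip the new TopicMetadata proto message.
# Field names come from the descriptor above (user_id, topic_id, extra map).
from vector_sdk.generated.embedding_pipeline.tools.v1 import tools_pb2

meta = tools_pb2.TopicMetadata(user_id="user123", topic_id="topic456")
meta.extra["source"] = "import-job"   # map<string, string> field

payload = meta.SerializeToString()            # wire-format bytes
decoded = tools_pb2.TopicMetadata.FromString(payload)

assert decoded.topic_id == "topic456"
assert decoded.extra["source"] == "import-job"
```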

vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi
CHANGED
@@ -1,6 +1,9 @@
+from google.protobuf.internal import containers as _containers
 from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
 from google.protobuf import descriptor as _descriptor
-from
+from google.protobuf import message as _message
+from collections.abc import Mapping as _Mapping
+from typing import ClassVar as _ClassVar, Optional as _Optional
 
 DESCRIPTOR: _descriptor.FileDescriptor
 
@@ -11,6 +14,7 @@ class ToolCollection(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
     TOOL_COLLECTION_TEST_QUESTION: _ClassVar[ToolCollection]
     TOOL_COLLECTION_SPACED_TEST_QUESTION: _ClassVar[ToolCollection]
     TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: _ClassVar[ToolCollection]
+    TOOL_COLLECTION_TOPIC: _ClassVar[ToolCollection]
 
 class FlashCardType(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
     __slots__ = ()
@@ -24,8 +28,26 @@ TOOL_COLLECTION_FLASHCARD: ToolCollection
 TOOL_COLLECTION_TEST_QUESTION: ToolCollection
 TOOL_COLLECTION_SPACED_TEST_QUESTION: ToolCollection
 TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: ToolCollection
+TOOL_COLLECTION_TOPIC: ToolCollection
 FLASH_CARD_TYPE_UNSPECIFIED: FlashCardType
 FLASH_CARD_TYPE_BASIC: FlashCardType
 FLASH_CARD_TYPE_CLOZE: FlashCardType
 FLASH_CARD_TYPE_FILL_IN_THE_BLANK: FlashCardType
 FLASH_CARD_TYPE_MULTIPLE_CHOICE: FlashCardType
+
+class TopicMetadata(_message.Message):
+    __slots__ = ("user_id", "topic_id", "extra")
+    class ExtraEntry(_message.Message):
+        __slots__ = ("key", "value")
+        KEY_FIELD_NUMBER: _ClassVar[int]
+        VALUE_FIELD_NUMBER: _ClassVar[int]
+        key: str
+        value: str
+        def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
+    USER_ID_FIELD_NUMBER: _ClassVar[int]
+    TOPIC_ID_FIELD_NUMBER: _ClassVar[int]
+    EXTRA_FIELD_NUMBER: _ClassVar[int]
+    user_id: str
+    topic_id: str
+    extra: _containers.ScalarMap[str, str]
+    def __init__(self, user_id: _Optional[str] = ..., topic_id: _Optional[str] = ..., extra: _Optional[_Mapping[str, str]] = ...) -> None: ...
vector_sdk/hash/__init__.py
CHANGED
@@ -16,6 +16,7 @@ from .types import (
     MultipleChoiceOption,
     QuestionData,
     ToolCollection,
+    TopicData,
 )
 
 __all__ = [
@@ -26,6 +27,7 @@ __all__ = [
     "FlashCardData",
     "QuestionData",
     "AudioRecapSectionData",
+    "TopicData",
     "MultipleChoiceOption",
     "AnswerObject",
 ]
vector_sdk/hash/hasher.py
CHANGED
@@ -15,6 +15,7 @@ from .types import (
     MultipleChoiceOption,
     QuestionData,
     ToolCollection,
+    TopicData,
 )
 
 # Hash length in hex characters (128 bits = 32 hex chars)
@@ -23,7 +24,7 @@ HASH_LENGTH = 32
 
 def compute_content_hash(
     tool_collection: ToolCollection,
-    data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
+    data: Union[FlashCardData, QuestionData, AudioRecapSectionData, TopicData, dict],
 ) -> str:
     """
     Compute a deterministic content hash for a learning tool.
@@ -52,7 +53,7 @@ def compute_content_hash(
 
 def extract_tool_text(
     tool_collection: ToolCollection,
-    data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
+    data: Union[FlashCardData, QuestionData, AudioRecapSectionData, TopicData, dict],
 ) -> str:
     """
     Extract the text content from a learning tool for embedding.
@@ -94,6 +95,8 @@ def extract_tool_text(
         return _extract_question_text(data_dict)
     elif tool_collection == "AudioRecapV2Section":
         return _extract_audio_recap_text(data_dict)
+    elif tool_collection == "Topic":
+        return _extract_topic_text(data_dict)
     else:
         return ""
 
@@ -183,6 +186,29 @@ def _extract_audio_recap_text(data: dict) -> str:
     return ""
 
 
+def _extract_topic_text(data: dict) -> str:
+    """
+    Extract text from Topic.
+
+    Format: "Topic: {topic}. Description: {description}."
+    """
+    parts: list[str] = []
+
+    topic = data.get("topic")
+    if topic:
+        trimmed = topic.strip()
+        if trimmed:
+            parts.append(f"Topic: {trimmed}.")
+
+    description = data.get("description")
+    if description:
+        trimmed = description.strip()
+        if trimmed:
+            parts.append(f"Description: {trimmed}.")
+
+    return " ".join(parts)
+
+
 def _strip_flashcard_syntax(text: str) -> str:
     """
     Strip {{...}} markers from cloze/fill-in-blank text.
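The `_extract_topic_text` helper above feeds both public functions in this module. A short sketch of the documented behaviour through the public signatures shown in this hunk (sample values are illustrative):

```python
# Sketch: the "Topic" branch added to hasher.py, via the public functions.
from vector_sdk.hash import compute_content_hash, extract_tool_text

topic = {"topic": "Photosynthesis", "description": "How plants convert sunlight to energy"}

text = extract_tool_text("Topic", topic)
# -> "Topic: Photosynthesis. Description: How plants convert sunlight to energy."

digest = compute_content_hash("Topic", topic)
print(len(digest))  # 32 hex chars (128 bits), per HASH_LENGTH above
```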
vector_sdk/hash/types.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Literal, Optional, Union
 from pydantic import BaseModel, ConfigDict, Field
 
 # Tool collection types
-ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section"]
+ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section", "Topic"]
 
 # FlashCard type variants
 FlashCardType = Literal["BASIC", "CLOZE", "FILL_IN_THE_BLANK", "MULTIPLE_CHOICE"]
@@ -65,3 +65,12 @@ class AudioRecapSectionData(BaseModel):
     model_config = ConfigDict(extra="allow")
 
     script: Optional[str] = None
+
+
+class TopicData(BaseModel):
+    """Topic data for content hashing."""
+
+    model_config = ConfigDict(extra="allow")
+
+    topic: Optional[str] = None
+    description: Optional[str] = None
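`TopicData` mirrors the other hashing models here: both declared fields are optional and `extra="allow"` lets unknown keys pass through. A brief sketch, assuming pydantic v2 semantics as implied by `ConfigDict`:

```python
# Sketch: TopicData accepts the two declared fields plus arbitrary extras.
from vector_sdk.hash import TopicData

td = TopicData(topic="Photosynthesis", description="Light to chemical energy", source="import-job")
print(td.topic)  # "Photosynthesis"
print(td.model_dump(exclude_none=True))
# {'topic': 'Photosynthesis', 'description': 'Light to chemical energy', 'source': 'import-job'}
```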

vector_sdk/structured/__init__.py
CHANGED
@@ -14,9 +14,15 @@ from .router import (
     validate_database_routing,
 )
 from .structured_embeddings import (
+    AudioRecapBatchItem,
+    BatchItem,
+    FlashCardBatchItem,
     StructuredEmbeddingsNamespace,
+    TestQuestionBatchItem,
     TestQuestionInput,
     ToolMetadata,
+    TopicBatchItem,
+    TopicMetadata,
 )
 from .tool_config import (
     TOOL_CONFIGS,
@@ -37,7 +43,14 @@ __all__ = [
     "StructuredEmbeddingsNamespace",
     # Types
     "ToolMetadata",
+    "TopicMetadata",
     "TestQuestionInput",
+    # Batch types
+    "BatchItem",
+    "FlashCardBatchItem",
+    "TestQuestionBatchItem",
+    "AudioRecapBatchItem",
+    "TopicBatchItem",
     # Tool configuration
     "ToolConfig",
     "ToolDatabaseConfig",

vector_sdk/structured/structured_embeddings.py
CHANGED
@@ -12,6 +12,7 @@ from ..hash import (
     AudioRecapSectionData,
     FlashCardData,
     ToolCollection,
+    TopicData,
     compute_content_hash,
     extract_tool_text,
 )
@@ -47,6 +48,29 @@ class ToolMetadata:
         return result
 
 
+@dataclass
+class TopicMetadata:
+    """
+    Metadata for topic embeddings.
+    Unlike ToolMetadata, all fields are optional since topics don't have a toolId.
+    """
+
+    user_id: Optional[str] = None
+    topic_id: Optional[str] = None
+    extra: Optional[dict[str, Any]] = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for document storage."""
+        result: dict[str, Any] = {}
+        if self.user_id:
+            result["userId"] = self.user_id
+        if self.topic_id:
+            result["topicId"] = self.topic_id
+        if self.extra:
+            result.update(self.extra)
+        return result
+
+
 @dataclass
 class TestQuestionInput:
     """Extended question data with question type."""
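Note from the `to_dict` implementations above: both metadata classes emit camelCase keys for document storage, and `TopicMetadata` simply omits anything unset instead of requiring a `tool_id`. An illustrative sketch:

```python
# Sketch: to_dict() output for the new TopicMetadata (keys per the code above).
from vector_sdk import TopicMetadata

print(TopicMetadata(user_id="user123", topic_id="topic456").to_dict())
# {'userId': 'user123', 'topicId': 'topic456'}

print(TopicMetadata(extra={"cohort": "2024"}).to_dict())
# {'cohort': '2024'}   # extras merge in; unset ids are simply left out

print(TopicMetadata().to_dict())
# {}                   # valid: topics carry no required identifiers
```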
@@ -67,6 +91,46 @@ class TestQuestionInput:
         return result
 
 
+@dataclass
+class BatchItem:
+    """Batch item for embedding multiple items of the same type."""
+
+    data: dict[str, Any]
+    metadata: ToolMetadata
+
+
+@dataclass
+class FlashCardBatchItem:
+    """Batch item for FlashCard embeddings."""
+
+    data: FlashCardData
+    metadata: ToolMetadata
+
+
+@dataclass
+class TestQuestionBatchItem:
+    """Batch item for TestQuestion embeddings."""
+
+    data: TestQuestionInput
+    metadata: ToolMetadata
+
+
+@dataclass
+class AudioRecapBatchItem:
+    """Batch item for AudioRecap embeddings."""
+
+    data: AudioRecapSectionData
+    metadata: ToolMetadata
+
+
+@dataclass
+class TopicBatchItem:
+    """Batch item for Topic embeddings."""
+
+    data: TopicData
+    metadata: TopicMetadata
+
+
 # ============================================================================
 # StructuredEmbeddingsNamespace
 # ============================================================================
@@ -150,6 +214,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
         card_type = data.get("type")
         return self._embed_tool_and_wait("FlashCard", data, metadata, card_type, timeout)
 
+    def embed_flashcard_batch(
+        self,
+        items: list[FlashCardBatchItem],
+    ) -> str:
+        """
+        Embed a batch of flashcards and return the request ID.
+        All flashcards in the batch should have the same type for proper namespace routing.
+
+        Args:
+            items: List of FlashCardBatchItem objects
+
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "FlashCard",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": item.data.get("type") if isinstance(item.data, dict) else None,
+                }
+                for item in items
+            ],
+        )
+
+    def embed_flashcard_batch_and_wait(
+        self,
+        items: list[FlashCardBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of flashcards and wait for the result.
+        All flashcards in the batch should have the same type for proper namespace routing.
+
+        Args:
+            items: List of FlashCardBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "FlashCard",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": item.data.get("type") if isinstance(item.data, dict) else None,
+                }
+                for item in items
+            ],
+            timeout,
+        )
+
     # ==========================================================================
     # TestQuestion Methods
     # ==========================================================================
@@ -201,6 +320,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             timeout,
         )
 
+    def embed_test_question_batch(
+        self,
+        items: list[TestQuestionBatchItem],
+    ) -> str:
+        """
+        Embed a batch of test questions and return the request ID.
+        All questions in the batch should have the same question_type for proper namespace routing.
+
+        Args:
+            items: List of TestQuestionBatchItem objects
+
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "TestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+        )
+
+    def embed_test_question_batch_and_wait(
+        self,
+        items: list[TestQuestionBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of test questions and wait for the result.
+        All questions in the batch should have the same question_type for proper namespace routing.
+
+        Args:
+            items: List of TestQuestionBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "TestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+            timeout,
+        )
+
     # ==========================================================================
     # SpacedTestQuestion Methods
     # ==========================================================================
@@ -252,6 +426,61 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             timeout,
         )
 
+    def embed_spaced_test_question_batch(
+        self,
+        items: list[TestQuestionBatchItem],
+    ) -> str:
+        """
+        Embed a batch of spaced test questions and return the request ID.
+        All questions in the batch should have the same question_type for proper namespace routing.
+
+        Args:
+            items: List of TestQuestionBatchItem objects
+
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "SpacedTestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+        )
+
+    def embed_spaced_test_question_batch_and_wait(
+        self,
+        items: list[TestQuestionBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of spaced test questions and wait for the result.
+        All questions in the batch should have the same question_type for proper namespace routing.
+
+        Args:
+            items: List of TestQuestionBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "SpacedTestQuestion",
+            [
+                {
+                    "data": item.data.to_question_data(),
+                    "metadata": item.metadata,
+                    "sub_type": item.data.question_type,
+                }
+                for item in items
+            ],
+            timeout,
+        )
+
     # ==========================================================================
     # AudioRecap Methods
     # ==========================================================================
@@ -298,6 +527,401 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             timeout,
         )
 
+    def embed_audio_recap_batch(
+        self,
+        items: list[AudioRecapBatchItem],
+    ) -> str:
+        """
+        Embed a batch of audio recap sections and return the request ID.
+
+        Args:
+            items: List of AudioRecapBatchItem objects
+
+        Returns:
+            The request ID
+        """
+        return self._embed_tool_batch(
+            "AudioRecapV2Section",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": None,
+                }
+                for item in items
+            ],
+        )
+
+    def embed_audio_recap_batch_and_wait(
+        self,
+        items: list[AudioRecapBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of audio recap sections and wait for the result.
+
+        Args:
+            items: List of AudioRecapBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        return self._embed_tool_batch_and_wait(
+            "AudioRecapV2Section",
+            [
+                {
+                    "data": item.data,
+                    "metadata": item.metadata,
+                    "sub_type": None,
+                }
+                for item in items
+            ],
+            timeout,
+        )
+
+    # ==========================================================================
+    # Topic Methods
+    # ==========================================================================
+
+    def embed_topic(
+        self,
+        data: TopicData,
+        metadata: TopicMetadata,
+    ) -> str:
+        """
+        Embed a topic and return the request ID.
+
+        Args:
+            data: Topic data (topic, description)
+            metadata: Topic metadata (all fields optional)
+
+        Returns:
+            The request ID
+        """
+        return self._embed_topic_internal("Topic", data, metadata, None)
+
+    def embed_topic_and_wait(
+        self,
+        data: TopicData,
+        metadata: TopicMetadata,
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a topic and wait for the result.
+
+        Args:
+            data: Topic data
+            metadata: Topic metadata (all fields optional)
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        return self._embed_topic_internal_and_wait("Topic", data, metadata, None, timeout)
+
+    def embed_topic_batch(
+        self,
+        items: list[TopicBatchItem],
+    ) -> str:
+        """
+        Embed a batch of topics and return the request ID.
+
+        Args:
+            items: List of TopicBatchItem objects
+
+        Returns:
+            The request ID
+        """
+        return self._embed_topic_batch_internal(items)
+
+    def embed_topic_batch_and_wait(
+        self,
+        items: list[TopicBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Embed a batch of topics and wait for the result.
+
+        Args:
+            items: List of TopicBatchItem objects
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        return self._embed_topic_batch_internal_and_wait(items, timeout)
+
+    # ==========================================================================
+    # Internal Topic Methods (using TopicMetadata)
+    # ==========================================================================
+
+    def _embed_topic_internal(
+        self,
+        tool_collection: ToolCollection,
+        data: TopicData,
+        metadata: TopicMetadata,
+        sub_type: Optional[str],
+    ) -> str:
+        """Internal method to embed a topic with TopicMetadata."""
+        # 1. Extract text using the spec
+        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+        if not text:
+            raise ValueError(
+                f"Failed to extract text from {tool_collection} - empty content"
+            )
+
+        # 2. Compute content hash
+        content_hash = compute_content_hash(
+            {"toolCollection": tool_collection, "data": data}
+        )
+        if not content_hash:
+            raise ValueError(
+                f"Failed to compute content hash for {tool_collection} - empty content"
+            )
+
+        # 3. Get tool config
+        tool_config = get_tool_config(tool_collection)
+
+        # 4. Build document with metadata (TopicMetadata doesn't have toolId)
+        document = {
+            **metadata.to_dict(),
+            "toolCollection": tool_collection,
+            "contentHash": content_hash,
+        }
+
+        # 5. Build storage config using router
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=sub_type,
+            content_hash=content_hash,
+            document_fields=document,
+        )
+
+        # 6. Build text input
+        text_input = {
+            "id": content_hash,
+            "text": text,
+            "document": document,
+        }
+
+        # 7. Submit using embeddings namespace
+        return self._embeddings.create(
+            texts=[text_input],
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+
+    def _embed_topic_internal_and_wait(
+        self,
+        tool_collection: ToolCollection,
+        data: TopicData,
+        metadata: TopicMetadata,
+        sub_type: Optional[str],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """Internal method to embed a topic and wait for result."""
+        # 1. Extract text using the spec
+        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+        if not text:
+            raise ValueError(
+                f"Failed to extract text from {tool_collection} - empty content"
+            )
+
+        # 2. Compute content hash
+        content_hash = compute_content_hash(
+            {"toolCollection": tool_collection, "data": data}
+        )
+        if not content_hash:
+            raise ValueError(
+                f"Failed to compute content hash for {tool_collection} - empty content"
+            )
+
+        # 3. Get tool config
+        tool_config = get_tool_config(tool_collection)
+
+        # 4. Build document with metadata
+        document = {
+            **metadata.to_dict(),
+            "toolCollection": tool_collection,
+            "contentHash": content_hash,
+        }
+
+        # 5. Build storage config using router
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=sub_type,
+            content_hash=content_hash,
+            document_fields=document,
+        )
+
+        # 6. Build text input
+        text_input = {
+            "id": content_hash,
+            "text": text,
+            "document": document,
+        }
+
+        # 7. Submit and wait using embeddings namespace
+        return self._embeddings.create_and_wait(
+            texts=[text_input],
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )
+
+    def _embed_topic_batch_internal(
+        self,
+        items: list[TopicBatchItem],
+    ) -> str:
+        """Internal method to embed a batch of topics."""
+        if not items:
+            raise ValueError("Batch cannot be empty")
+
+        tool_collection: ToolCollection = "Topic"
+        tool_config = get_tool_config(tool_collection)
+
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item.data
+            metadata = item.metadata
+
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+
+            # Build document with metadata (TopicMetadata doesn't have toolId)
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+
+        # Build storage config using first item
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=None,
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+
+        # Submit batch to embeddings namespace
+        return self._embeddings.create(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+
+    def _embed_topic_batch_internal_and_wait(
+        self,
+        items: list[TopicBatchItem],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """Internal method to embed a batch of topics and wait for result."""
+        if not items:
+            raise ValueError("Batch cannot be empty")
+
+        tool_collection: ToolCollection = "Topic"
+        tool_config = get_tool_config(tool_collection)
+
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item.data
+            metadata = item.metadata
+
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+
+            # Build document with metadata
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+
+        # Build storage config using first item
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=None,
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+
+        # Submit batch and wait
+        return self._embeddings.create_and_wait(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )
+
     # ==========================================================================
     # Internal Methods
     # ==========================================================================
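The internal topic methods above all walk the same numbered pipeline (extract text, hash, tool config, document, storage config, text input, submit). The hypothetical helper below condenses steps 1–6 for orientation only; it is not SDK code, and it calls the hashing helpers with the two-argument signature documented in hasher.py, whereas the methods above pass a single spec dict.

```python
# Hypothetical condensation of _embed_topic_internal's steps; not SDK code.
from vector_sdk import TopicMetadata
from vector_sdk.hash import compute_content_hash, extract_tool_text


def preview_topic_submission(data: dict, metadata: TopicMetadata) -> dict:
    """Return the per-item payload the SDK builds before queueing (steps 1-6)."""
    text = extract_tool_text("Topic", data)             # 1. text to embed
    content_hash = compute_content_hash("Topic", data)  # 2. deterministic ID
    document = {                                        # 4. stored document
        **metadata.to_dict(),                           #    camelCase metadata
        "toolCollection": "Topic",
        "contentHash": content_hash,
    }
    return {"id": content_hash, "text": text, "document": document}  # 6. text input


payload = preview_topic_submission(
    {"topic": "Photosynthesis", "description": "Light to chemical energy"},
    TopicMetadata(user_id="user123"),
)
print(payload["document"])  # steps 3, 5 and 7 (config, storage routing, submit) stay SDK-internal
```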
@@ -429,3 +1053,164 @@ class StructuredEmbeddingsNamespace(BaseNamespace):
             embedding_dimensions=tool_config.dimensions,
             timeout=timeout,
         )
+
+    def _embed_tool_batch(
+        self,
+        tool_collection: ToolCollection,
+        items: list[dict[str, Any]],
+    ) -> str:
+        """
+        Internal method to embed a batch of items of the same tool type.
+
+        Args:
+            tool_collection: The tool collection type
+            items: List of dicts with 'data', 'metadata', and optional 'sub_type' keys
+
+        Returns:
+            The request ID
+        """
+        if not items:
+            raise ValueError("Batch cannot be empty")
+
+        # Get tool config (same for all items)
+        tool_config = get_tool_config(tool_collection)
+
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item["data"]
+            metadata = item["metadata"]
+
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+
+            # Build document with metadata
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+
+        # Build storage config using first item's sub_type
+        first_item = items[0]
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=first_item.get("sub_type"),
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+
+        # Submit batch to embeddings namespace
+        return self._embeddings.create(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+        )
+
+    def _embed_tool_batch_and_wait(
+        self,
+        tool_collection: ToolCollection,
+        items: list[dict[str, Any]],
+        timeout: int = 60,
+    ) -> EmbeddingResult:
+        """
+        Internal method to embed a batch of items and wait for result.
+
+        Args:
+            tool_collection: The tool collection type
+            items: List of dicts with 'data', 'metadata', and optional 'sub_type' keys
+            timeout: Timeout in seconds (default: 60)
+
+        Returns:
+            The embedding result
+        """
+        if not items:
+            raise ValueError("Batch cannot be empty")
+
+        # Get tool config (same for all items)
+        tool_config = get_tool_config(tool_collection)
+
+        # Process each item
+        text_inputs = []
+        for item in items:
+            data = item["data"]
+            metadata = item["metadata"]
+
+            # Extract text
+            text = extract_tool_text({"toolCollection": tool_collection, "data": data})
+            if not text:
+                raise ValueError(
+                    f"Failed to extract text from {tool_collection} - empty content"
+                )
+
+            # Compute content hash
+            content_hash = compute_content_hash(
+                {"toolCollection": tool_collection, "data": data}
+            )
+            if not content_hash:
+                raise ValueError(
+                    f"Failed to compute content hash for {tool_collection} - empty content"
+                )
+
+            # Build document with metadata
+            document = {
+                **metadata.to_dict(),
+                "toolCollection": tool_collection,
+                "contentHash": content_hash,
+            }
+
+            text_inputs.append({
+                "id": content_hash,
+                "text": text,
+                "document": document,
+            })
+
+        # Build storage config using first item's sub_type
+        first_item = items[0]
+        storage_config = build_storage_config(
+            tool_collection=tool_collection,
+            sub_type=first_item.get("sub_type"),
+            content_hash=text_inputs[0]["id"],
+            document_fields=text_inputs[0]["document"],
+        )
+
+        # Submit batch and wait
+        return self._embeddings.create_and_wait(
+            texts=text_inputs,
+            content_type=get_content_type(tool_collection),
+            priority=tool_config.default_priority,
+            storage=storage_config,
+            metadata={
+                "toolCollection": tool_collection,
+                "batchSize": len(items),
+            },
+            embedding_model=tool_config.model,
+            embedding_dimensions=tool_config.dimensions,
+            timeout=timeout,
+        )
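One behaviour of `_embed_tool_batch` worth noting: the storage config is derived from the first item's `sub_type` only, which is why the public batch docstrings ask for a single type per batch. A hedged sketch of the grouping a caller might do before submitting (sample cards are illustrative):

```python
# Sketch: group flashcards by type before batching, since _embed_tool_batch
# derives the storage namespace from the first item's sub_type only.
from collections import defaultdict

from vector_sdk import FlashCardBatchItem, ToolMetadata

cards = [
    FlashCardBatchItem(data={"type": "BASIC", "term": "ATP", "definition": "Energy currency of the cell"},
                       metadata=ToolMetadata(tool_id="t1")),
    FlashCardBatchItem(data={"type": "CLOZE", "term": "The {{c1::mitochondria}} produces ATP", "definition": "Cell biology"},
                       metadata=ToolMetadata(tool_id="t2")),
]

by_type: dict[str, list[FlashCardBatchItem]] = defaultdict(list)
for card in cards:
    by_type[card.data.get("type", "")].append(card)

# One request per flashcard type keeps namespace routing consistent:
# for items in by_type.values():
#     client.structured_embeddings.embed_flashcard_batch(items=items)
```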

vector_sdk/structured/tool_config.py
CHANGED
@@ -144,6 +144,25 @@ TOOL_CONFIGS: dict[ToolCollection, ToolConfig] = {
             namespace_pattern="audiorecapv2section",
         ),
     ),
+    "Topic": ToolConfig(
+        tool_collection="Topic",
+        model="gemini-embedding-001",
+        dimensions=3072,
+        default_priority=PRIORITY_NORMAL,
+        turbopuffer=TurboPufferToolConfig(
+            enabled=True,
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="topic_vectors",
+        ),
+        pinecone=PineconeToolConfig(
+            enabled=False,
+            index_name="tool-vectors",
+            id_field="contentHash",
+            metadata_fields=_DEFAULT_METADATA_FIELDS,
+            namespace_pattern="topic_vectors",
+        ),
+    ),
 }
 
 
@@ -198,8 +217,8 @@ def get_turbopuffer_namespace(
     config = TOOL_CONFIGS[tool_collection]
     pattern = config.turbopuffer.namespace_pattern
 
-    # AudioRecapV2Section
-    if tool_collection
+    # AudioRecapV2Section and Topic don't have sub-types
+    if tool_collection in ("AudioRecapV2Section", "Topic"):
         return pattern
 
     # Derive the type suffix
@@ -228,8 +247,8 @@ def get_pinecone_namespace(
     config = TOOL_CONFIGS[tool_collection]
     pattern = config.pinecone.namespace_pattern
 
-    # AudioRecapV2Section
-    if tool_collection
+    # AudioRecapV2Section and Topic don't have sub-types
+    if tool_collection in ("AudioRecapV2Section", "Topic"):
         return pattern
 
     # Derive the type suffix
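With the `"Topic"` entry added to `TOOL_CONFIGS`, topic embeddings use `gemini-embedding-001` at 3072 dimensions, route to a fixed `topic_vectors` TurboPuffer namespace (the namespace helpers above return the pattern unchanged for collections without sub-types), and leave Pinecone disabled. A small sketch reading that configuration back, with the import path taken from the RECORD listing:

```python
# Sketch: what the new "Topic" entry in TOOL_CONFIGS declares (fields per the hunk above).
from vector_sdk.structured.tool_config import TOOL_CONFIGS

topic_cfg = TOOL_CONFIGS["Topic"]
print(topic_cfg.model, topic_cfg.dimensions)                                   # gemini-embedding-001 3072
print(topic_cfg.turbopuffer.enabled, topic_cfg.turbopuffer.namespace_pattern)  # True topic_vectors
print(topic_cfg.pinecone.enabled)                                              # False (Pinecone disabled for topics)
```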

{sf_vector_sdk-0.2.0.dist-info → sf_vector_sdk-0.2.2.dist-info}/WHEEL
File without changes