sf-vector-sdk 0.2.0__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/PKG-INFO +38 -4
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/README.md +37 -3
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/__init__.py +17 -1
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py +2 -2
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py +2 -2
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py +2 -2
- sf_vector_sdk-0.2.3/vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py +45 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi +23 -1
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/hash/__init__.py +2 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/hash/hasher.py +28 -2
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/hash/types.py +10 -1
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/namespaces/embeddings.py +31 -57
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/namespaces/search.py +38 -60
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/structured/__init__.py +13 -0
- sf_vector_sdk-0.2.3/vector_sdk/structured/structured_embeddings.py +1216 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/structured/tool_config.py +23 -4
- sf_vector_sdk-0.2.0/vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py +0 -39
- sf_vector_sdk-0.2.0/vector_sdk/structured/structured_embeddings.py +0 -431
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/.gitignore +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/pyproject.toml +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/tests/__init__.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/tests/test_content_hash.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/client.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/content_types.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.pyi +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.pyi +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.pyi +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/namespaces/__init__.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/namespaces/base.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/namespaces/db.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/structured/router.py +0 -0
- {sf_vector_sdk-0.2.0 → sf_vector_sdk-0.2.3}/vector_sdk/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sf-vector-sdk
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Python SDK for the Vector Gateway service (embeddings and vector search)
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Requires-Dist: redis>=5.0.0
|
|
@@ -220,25 +220,42 @@ Type-safe embedding for known tool types (FlashCard, TestQuestion, etc.) with au
|
|
|
220
220
|
|--------|-------------|
|
|
221
221
|
| `embed_flashcard(data, metadata)` | Embed a flashcard, return request ID |
|
|
222
222
|
| `embed_flashcard_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
223
|
+
| `embed_flashcard_batch(items)` | Embed batch of flashcards, return request ID |
|
|
224
|
+
| `embed_flashcard_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
223
225
|
| `embed_test_question(data, metadata)` | Embed a test question, return request ID |
|
|
224
226
|
| `embed_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
227
|
+
| `embed_test_question_batch(items)` | Embed batch of test questions, return request ID |
|
|
228
|
+
| `embed_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
225
229
|
| `embed_spaced_test_question(data, metadata)` | Embed a spaced test question, return request ID |
|
|
226
230
|
| `embed_spaced_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
231
|
+
| `embed_spaced_test_question_batch(items)` | Embed batch of spaced test questions, return request ID |
|
|
232
|
+
| `embed_spaced_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
227
233
|
| `embed_audio_recap(data, metadata)` | Embed an audio recap section, return request ID |
|
|
228
234
|
| `embed_audio_recap_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
235
|
+
| `embed_audio_recap_batch(items)` | Embed batch of audio recaps, return request ID |
|
|
236
|
+
| `embed_audio_recap_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
237
|
+
| `embed_topic(data, metadata)` | Embed a topic (uses `TopicMetadata`), return request ID |
|
|
238
|
+
| `embed_topic_and_wait(data, metadata, timeout)` | Embed and wait for result (uses `TopicMetadata`) |
|
|
239
|
+
| `embed_topic_batch(items)` | Embed batch of topics (uses `TopicMetadata`), return request ID |
|
|
240
|
+
| `embed_topic_batch_and_wait(items, timeout)` | Embed batch and wait for result (uses `TopicMetadata`) |
|
|
241
|
+
|
|
242
|
+
**Metadata Types:**
|
|
243
|
+
|
|
244
|
+
- `ToolMetadata` - For tools (FlashCard, TestQuestion, etc.) - requires `tool_id`
|
|
245
|
+
- `TopicMetadata` - For topics only - all fields optional (`user_id`, `topic_id`)
|
|
229
246
|
|
|
230
247
|
```python
|
|
231
|
-
from vector_sdk import VectorClient, ToolMetadata, TestQuestionInput
|
|
248
|
+
from vector_sdk import VectorClient, ToolMetadata, TopicMetadata, TestQuestionInput
|
|
232
249
|
|
|
233
250
|
client = VectorClient(redis_url="redis://localhost:6379")
|
|
234
251
|
|
|
235
|
-
# Embed a flashcard -
|
|
252
|
+
# Embed a flashcard - uses ToolMetadata (tool_id required)
|
|
236
253
|
result = client.structured_embeddings.embed_flashcard_and_wait(
|
|
237
254
|
data={"type": "BASIC", "term": "Mitochondria", "definition": "The powerhouse of the cell"},
|
|
238
255
|
metadata=ToolMetadata(tool_id="tool123", user_id="user456", topic_id="topic789"),
|
|
239
256
|
)
|
|
240
257
|
|
|
241
|
-
# Embed a test question
|
|
258
|
+
# Embed a test question - uses ToolMetadata (tool_id required)
|
|
242
259
|
result = client.structured_embeddings.embed_test_question_and_wait(
|
|
243
260
|
data=TestQuestionInput(
|
|
244
261
|
question="What is the capital?",
|
|
@@ -247,6 +264,23 @@ result = client.structured_embeddings.embed_test_question_and_wait(
|
|
|
247
264
|
),
|
|
248
265
|
metadata=ToolMetadata(tool_id="tool456"),
|
|
249
266
|
)
|
|
267
|
+
|
|
268
|
+
# Embed a topic - uses TopicMetadata (all fields optional)
|
|
269
|
+
result = client.structured_embeddings.embed_topic_and_wait(
|
|
270
|
+
data={"topic": "Photosynthesis", "description": "The process by which plants convert sunlight to energy"},
|
|
271
|
+
metadata=TopicMetadata(user_id="user123", topic_id="topic456"), # No tool_id needed
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
# Batch embedding - embed multiple topics in a single request
|
|
275
|
+
from vector_sdk import TopicBatchItem
|
|
276
|
+
|
|
277
|
+
batch_result = client.structured_embeddings.embed_topic_batch_and_wait(
|
|
278
|
+
items=[
|
|
279
|
+
TopicBatchItem(data={"topic": "Topic 1", "description": "Description 1"}, metadata=TopicMetadata(user_id="user1")),
|
|
280
|
+
TopicBatchItem(data={"topic": "Topic 2", "description": "Description 2"}, metadata=TopicMetadata(topic_id="topic2")),
|
|
281
|
+
TopicBatchItem(data={"topic": "Topic 3", "description": "Description 3"}, metadata=TopicMetadata()), # All optional
|
|
282
|
+
],
|
|
283
|
+
)
|
|
250
284
|
```
|
|
251
285
|
|
|
252
286
|
**Database Routing:**
|
|
@@ -208,25 +208,42 @@ Type-safe embedding for known tool types (FlashCard, TestQuestion, etc.) with au
|
|
|
208
208
|
|--------|-------------|
|
|
209
209
|
| `embed_flashcard(data, metadata)` | Embed a flashcard, return request ID |
|
|
210
210
|
| `embed_flashcard_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
211
|
+
| `embed_flashcard_batch(items)` | Embed batch of flashcards, return request ID |
|
|
212
|
+
| `embed_flashcard_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
211
213
|
| `embed_test_question(data, metadata)` | Embed a test question, return request ID |
|
|
212
214
|
| `embed_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
215
|
+
| `embed_test_question_batch(items)` | Embed batch of test questions, return request ID |
|
|
216
|
+
| `embed_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
213
217
|
| `embed_spaced_test_question(data, metadata)` | Embed a spaced test question, return request ID |
|
|
214
218
|
| `embed_spaced_test_question_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
219
|
+
| `embed_spaced_test_question_batch(items)` | Embed batch of spaced test questions, return request ID |
|
|
220
|
+
| `embed_spaced_test_question_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
215
221
|
| `embed_audio_recap(data, metadata)` | Embed an audio recap section, return request ID |
|
|
216
222
|
| `embed_audio_recap_and_wait(data, metadata, timeout)` | Embed and wait for result |
|
|
223
|
+
| `embed_audio_recap_batch(items)` | Embed batch of audio recaps, return request ID |
|
|
224
|
+
| `embed_audio_recap_batch_and_wait(items, timeout)` | Embed batch and wait for result |
|
|
225
|
+
| `embed_topic(data, metadata)` | Embed a topic (uses `TopicMetadata`), return request ID |
|
|
226
|
+
| `embed_topic_and_wait(data, metadata, timeout)` | Embed and wait for result (uses `TopicMetadata`) |
|
|
227
|
+
| `embed_topic_batch(items)` | Embed batch of topics (uses `TopicMetadata`), return request ID |
|
|
228
|
+
| `embed_topic_batch_and_wait(items, timeout)` | Embed batch and wait for result (uses `TopicMetadata`) |
|
|
229
|
+
|
|
230
|
+
**Metadata Types:**
|
|
231
|
+
|
|
232
|
+
- `ToolMetadata` - For tools (FlashCard, TestQuestion, etc.) - requires `tool_id`
|
|
233
|
+
- `TopicMetadata` - For topics only - all fields optional (`user_id`, `topic_id`)
|
|
217
234
|
|
|
218
235
|
```python
|
|
219
|
-
from vector_sdk import VectorClient, ToolMetadata, TestQuestionInput
|
|
236
|
+
from vector_sdk import VectorClient, ToolMetadata, TopicMetadata, TestQuestionInput
|
|
220
237
|
|
|
221
238
|
client = VectorClient(redis_url="redis://localhost:6379")
|
|
222
239
|
|
|
223
|
-
# Embed a flashcard -
|
|
240
|
+
# Embed a flashcard - uses ToolMetadata (tool_id required)
|
|
224
241
|
result = client.structured_embeddings.embed_flashcard_and_wait(
|
|
225
242
|
data={"type": "BASIC", "term": "Mitochondria", "definition": "The powerhouse of the cell"},
|
|
226
243
|
metadata=ToolMetadata(tool_id="tool123", user_id="user456", topic_id="topic789"),
|
|
227
244
|
)
|
|
228
245
|
|
|
229
|
-
# Embed a test question
|
|
246
|
+
# Embed a test question - uses ToolMetadata (tool_id required)
|
|
230
247
|
result = client.structured_embeddings.embed_test_question_and_wait(
|
|
231
248
|
data=TestQuestionInput(
|
|
232
249
|
question="What is the capital?",
|
|
@@ -235,6 +252,23 @@ result = client.structured_embeddings.embed_test_question_and_wait(
|
|
|
235
252
|
),
|
|
236
253
|
metadata=ToolMetadata(tool_id="tool456"),
|
|
237
254
|
)
|
|
255
|
+
|
|
256
|
+
# Embed a topic - uses TopicMetadata (all fields optional)
|
|
257
|
+
result = client.structured_embeddings.embed_topic_and_wait(
|
|
258
|
+
data={"topic": "Photosynthesis", "description": "The process by which plants convert sunlight to energy"},
|
|
259
|
+
metadata=TopicMetadata(user_id="user123", topic_id="topic456"), # No tool_id needed
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
# Batch embedding - embed multiple topics in a single request
|
|
263
|
+
from vector_sdk import TopicBatchItem
|
|
264
|
+
|
|
265
|
+
batch_result = client.structured_embeddings.embed_topic_batch_and_wait(
|
|
266
|
+
items=[
|
|
267
|
+
TopicBatchItem(data={"topic": "Topic 1", "description": "Description 1"}, metadata=TopicMetadata(user_id="user1")),
|
|
268
|
+
TopicBatchItem(data={"topic": "Topic 2", "description": "Description 2"}, metadata=TopicMetadata(topic_id="topic2")),
|
|
269
|
+
TopicBatchItem(data={"topic": "Topic 3", "description": "Description 3"}, metadata=TopicMetadata()), # All optional
|
|
270
|
+
],
|
|
271
|
+
)
|
|
238
272
|
```
|
|
239
273
|
|
|
240
274
|
**Database Routing:**
|
|
@@ -70,6 +70,7 @@ from vector_sdk.hash import (
|
|
|
70
70
|
MultipleChoiceOption,
|
|
71
71
|
QuestionData,
|
|
72
72
|
ToolCollection,
|
|
73
|
+
TopicData,
|
|
73
74
|
compute_content_hash,
|
|
74
75
|
extract_tool_text,
|
|
75
76
|
)
|
|
@@ -89,15 +90,21 @@ from vector_sdk.namespaces import (
|
|
|
89
90
|
# ============================================================================
|
|
90
91
|
from vector_sdk.structured import (
|
|
91
92
|
TOOL_CONFIGS,
|
|
93
|
+
AudioRecapBatchItem,
|
|
94
|
+
BatchItem,
|
|
92
95
|
DatabaseRoutingError,
|
|
93
96
|
DatabaseRoutingMode,
|
|
97
|
+
FlashCardBatchItem,
|
|
94
98
|
PineconeToolConfig,
|
|
95
99
|
QuestionType,
|
|
96
100
|
StructuredEmbeddingsNamespace,
|
|
101
|
+
TestQuestionBatchItem,
|
|
97
102
|
TestQuestionInput,
|
|
98
103
|
ToolConfig,
|
|
99
104
|
ToolDatabaseConfig,
|
|
100
105
|
ToolMetadata,
|
|
106
|
+
TopicBatchItem,
|
|
107
|
+
TopicMetadata,
|
|
101
108
|
TurboPufferToolConfig,
|
|
102
109
|
build_storage_config,
|
|
103
110
|
get_content_type,
|
|
@@ -159,7 +166,7 @@ from vector_sdk.types import (
|
|
|
159
166
|
validate_model,
|
|
160
167
|
)
|
|
161
168
|
|
|
162
|
-
__version__ = "0.2.0"
|
|
169
|
+
__version__ = "0.2.3"
|
|
163
170
|
|
|
164
171
|
__all__ = [
|
|
165
172
|
# Clients (New API)
|
|
@@ -236,12 +243,21 @@ __all__ = [
|
|
|
236
243
|
"FlashCardData",
|
|
237
244
|
"QuestionData",
|
|
238
245
|
"AudioRecapSectionData",
|
|
246
|
+
"TopicData",
|
|
239
247
|
"MultipleChoiceOption",
|
|
240
248
|
"AnswerObject",
|
|
241
249
|
# Structured Embeddings
|
|
242
250
|
"StructuredEmbeddingsNamespace",
|
|
243
251
|
"ToolMetadata",
|
|
252
|
+
"TopicMetadata",
|
|
244
253
|
"TestQuestionInput",
|
|
254
|
+
# Batch types
|
|
255
|
+
"BatchItem",
|
|
256
|
+
"FlashCardBatchItem",
|
|
257
|
+
"TestQuestionBatchItem",
|
|
258
|
+
"AudioRecapBatchItem",
|
|
259
|
+
"TopicBatchItem",
|
|
260
|
+
# Tool configuration
|
|
245
261
|
"ToolConfig",
|
|
246
262
|
"ToolDatabaseConfig",
|
|
247
263
|
"TurboPufferToolConfig",
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
|
4
4
|
# source: embedding_pipeline/content_types/v1/content_types.proto
|
|
5
|
-
# Protobuf Python Version: 6.33.
|
|
5
|
+
# Protobuf Python Version: 6.33.5
|
|
6
6
|
"""Generated protocol buffer code."""
|
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
|
|
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
|
14
14
|
6,
|
|
15
15
|
33,
|
|
16
|
-
|
|
16
|
+
5,
|
|
17
17
|
'',
|
|
18
18
|
'embedding_pipeline/content_types/v1/content_types.proto'
|
|
19
19
|
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
|
4
4
|
# source: embedding_pipeline/db/vectors/v1/vectors.proto
|
|
5
|
-
# Protobuf Python Version: 6.33.
|
|
5
|
+
# Protobuf Python Version: 6.33.5
|
|
6
6
|
"""Generated protocol buffer code."""
|
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
|
|
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
|
14
14
|
6,
|
|
15
15
|
33,
|
|
16
|
-
|
|
16
|
+
5,
|
|
17
17
|
'',
|
|
18
18
|
'embedding_pipeline/db/vectors/v1/vectors.proto'
|
|
19
19
|
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
3
|
# NO CHECKED-IN PROTOBUF GENCODE
|
|
4
4
|
# source: embedding_pipeline/query/v1/query.proto
|
|
5
|
-
# Protobuf Python Version: 6.33.
|
|
5
|
+
# Protobuf Python Version: 6.33.5
|
|
6
6
|
"""Generated protocol buffer code."""
|
|
7
7
|
from google.protobuf import descriptor as _descriptor
|
|
8
8
|
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
@@ -13,7 +13,7 @@ _runtime_version.ValidateProtobufRuntimeVersion(
|
|
|
13
13
|
_runtime_version.Domain.PUBLIC,
|
|
14
14
|
6,
|
|
15
15
|
33,
|
|
16
|
-
|
|
16
|
+
5,
|
|
17
17
|
'',
|
|
18
18
|
'embedding_pipeline/query/v1/query.proto'
|
|
19
19
|
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
|
+
# NO CHECKED-IN PROTOBUF GENCODE
|
|
4
|
+
# source: embedding_pipeline/tools/v1/tools.proto
|
|
5
|
+
# Protobuf Python Version: 6.33.5
|
|
6
|
+
"""Generated protocol buffer code."""
|
|
7
|
+
from google.protobuf import descriptor as _descriptor
|
|
8
|
+
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
9
|
+
from google.protobuf import runtime_version as _runtime_version
|
|
10
|
+
from google.protobuf import symbol_database as _symbol_database
|
|
11
|
+
from google.protobuf.internal import builder as _builder
|
|
12
|
+
_runtime_version.ValidateProtobufRuntimeVersion(
|
|
13
|
+
_runtime_version.Domain.PUBLIC,
|
|
14
|
+
6,
|
|
15
|
+
33,
|
|
16
|
+
5,
|
|
17
|
+
'',
|
|
18
|
+
'embedding_pipeline/tools/v1/tools.proto'
|
|
19
|
+
)
|
|
20
|
+
# @@protoc_insertion_point(imports)
|
|
21
|
+
|
|
22
|
+
_sym_db = _symbol_database.Default()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/tools/v1/tools.proto\x12\x1b\x65mbedding_pipeline.tools.v1\"\xed\x01\n\rTopicMetadata\x12\x1c\n\x07user_id\x18\x01 \x01(\tH\x00R\x06userId\x88\x01\x01\x12\x1e\n\x08topic_id\x18\x02 \x01(\tH\x01R\x07topicId\x88\x01\x01\x12K\n\x05\x65xtra\x18\x03 \x03(\x0b\x32\x35.embedding_pipeline.tools.v1.TopicMetadata.ExtraEntryR\x05\x65xtra\x1a\x38\n\nExtraEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\n\n\x08_user_idB\x0b\n\t_topic_id*\xe4\x01\n\x0eToolCollection\x12\x1f\n\x1bTOOL_COLLECTION_UNSPECIFIED\x10\x00\x12\x1d\n\x19TOOL_COLLECTION_FLASHCARD\x10\x01\x12!\n\x1dTOOL_COLLECTION_TEST_QUESTION\x10\x02\x12(\n$TOOL_COLLECTION_SPACED_TEST_QUESTION\x10\x03\x12*\n&TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION\x10\x04\x12\x19\n\x15TOOL_COLLECTION_TOPIC\x10\x05*\xb2\x01\n\rFlashCardType\x12\x1f\n\x1b\x46LASH_CARD_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15\x46LASH_CARD_TYPE_BASIC\x10\x01\x12\x19\n\x15\x46LASH_CARD_TYPE_CLOZE\x10\x02\x12%\n!FLASH_CARD_TYPE_FILL_IN_THE_BLANK\x10\x03\x12#\n\x1f\x46LASH_CARD_TYPE_MULTIPLE_CHOICE\x10\x04\x42gZegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1b\x06proto3')
|
|
28
|
+
|
|
29
|
+
_globals = globals()
|
|
30
|
+
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
|
31
|
+
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'embedding_pipeline.tools.v1.tools_pb2', _globals)
|
|
32
|
+
if not _descriptor._USE_C_DESCRIPTORS:
|
|
33
|
+
_globals['DESCRIPTOR']._loaded_options = None
|
|
34
|
+
_globals['DESCRIPTOR']._serialized_options = b'Zegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1'
|
|
35
|
+
_globals['_TOPICMETADATA_EXTRAENTRY']._loaded_options = None
|
|
36
|
+
_globals['_TOPICMETADATA_EXTRAENTRY']._serialized_options = b'8\001'
|
|
37
|
+
_globals['_TOOLCOLLECTION']._serialized_start=313
|
|
38
|
+
_globals['_TOOLCOLLECTION']._serialized_end=541
|
|
39
|
+
_globals['_FLASHCARDTYPE']._serialized_start=544
|
|
40
|
+
_globals['_FLASHCARDTYPE']._serialized_end=722
|
|
41
|
+
_globals['_TOPICMETADATA']._serialized_start=73
|
|
42
|
+
_globals['_TOPICMETADATA']._serialized_end=310
|
|
43
|
+
_globals['_TOPICMETADATA_EXTRAENTRY']._serialized_start=229
|
|
44
|
+
_globals['_TOPICMETADATA_EXTRAENTRY']._serialized_end=285
|
|
45
|
+
# @@protoc_insertion_point(module_scope)
|
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
from google.protobuf.internal import containers as _containers
|
|
1
2
|
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
|
2
3
|
from google.protobuf import descriptor as _descriptor
|
|
3
|
-
from typing import ClassVar as _ClassVar
|
|
4
|
+
from google.protobuf import message as _message
|
|
5
|
+
from collections.abc import Mapping as _Mapping
|
|
6
|
+
from typing import ClassVar as _ClassVar, Optional as _Optional
|
|
4
7
|
|
|
5
8
|
DESCRIPTOR: _descriptor.FileDescriptor
|
|
6
9
|
|
|
@@ -11,6 +14,7 @@ class ToolCollection(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
|
|
11
14
|
TOOL_COLLECTION_TEST_QUESTION: _ClassVar[ToolCollection]
|
|
12
15
|
TOOL_COLLECTION_SPACED_TEST_QUESTION: _ClassVar[ToolCollection]
|
|
13
16
|
TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: _ClassVar[ToolCollection]
|
|
17
|
+
TOOL_COLLECTION_TOPIC: _ClassVar[ToolCollection]
|
|
14
18
|
|
|
15
19
|
class FlashCardType(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
|
16
20
|
__slots__ = ()
|
|
@@ -24,8 +28,26 @@ TOOL_COLLECTION_FLASHCARD: ToolCollection
|
|
|
24
28
|
TOOL_COLLECTION_TEST_QUESTION: ToolCollection
|
|
25
29
|
TOOL_COLLECTION_SPACED_TEST_QUESTION: ToolCollection
|
|
26
30
|
TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: ToolCollection
|
|
31
|
+
TOOL_COLLECTION_TOPIC: ToolCollection
|
|
27
32
|
FLASH_CARD_TYPE_UNSPECIFIED: FlashCardType
|
|
28
33
|
FLASH_CARD_TYPE_BASIC: FlashCardType
|
|
29
34
|
FLASH_CARD_TYPE_CLOZE: FlashCardType
|
|
30
35
|
FLASH_CARD_TYPE_FILL_IN_THE_BLANK: FlashCardType
|
|
31
36
|
FLASH_CARD_TYPE_MULTIPLE_CHOICE: FlashCardType
|
|
37
|
+
|
|
38
|
+
class TopicMetadata(_message.Message):
|
|
39
|
+
__slots__ = ("user_id", "topic_id", "extra")
|
|
40
|
+
class ExtraEntry(_message.Message):
|
|
41
|
+
__slots__ = ("key", "value")
|
|
42
|
+
KEY_FIELD_NUMBER: _ClassVar[int]
|
|
43
|
+
VALUE_FIELD_NUMBER: _ClassVar[int]
|
|
44
|
+
key: str
|
|
45
|
+
value: str
|
|
46
|
+
def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
|
|
47
|
+
USER_ID_FIELD_NUMBER: _ClassVar[int]
|
|
48
|
+
TOPIC_ID_FIELD_NUMBER: _ClassVar[int]
|
|
49
|
+
EXTRA_FIELD_NUMBER: _ClassVar[int]
|
|
50
|
+
user_id: str
|
|
51
|
+
topic_id: str
|
|
52
|
+
extra: _containers.ScalarMap[str, str]
|
|
53
|
+
def __init__(self, user_id: _Optional[str] = ..., topic_id: _Optional[str] = ..., extra: _Optional[_Mapping[str, str]] = ...) -> None: ...
|
|
@@ -16,6 +16,7 @@ from .types import (
|
|
|
16
16
|
MultipleChoiceOption,
|
|
17
17
|
QuestionData,
|
|
18
18
|
ToolCollection,
|
|
19
|
+
TopicData,
|
|
19
20
|
)
|
|
20
21
|
|
|
21
22
|
__all__ = [
|
|
@@ -26,6 +27,7 @@ __all__ = [
|
|
|
26
27
|
"FlashCardData",
|
|
27
28
|
"QuestionData",
|
|
28
29
|
"AudioRecapSectionData",
|
|
30
|
+
"TopicData",
|
|
29
31
|
"MultipleChoiceOption",
|
|
30
32
|
"AnswerObject",
|
|
31
33
|
]
|
|
@@ -15,6 +15,7 @@ from .types import (
|
|
|
15
15
|
MultipleChoiceOption,
|
|
16
16
|
QuestionData,
|
|
17
17
|
ToolCollection,
|
|
18
|
+
TopicData,
|
|
18
19
|
)
|
|
19
20
|
|
|
20
21
|
# Hash length in hex characters (128 bits = 32 hex chars)
|
|
@@ -23,7 +24,7 @@ HASH_LENGTH = 32
|
|
|
23
24
|
|
|
24
25
|
def compute_content_hash(
|
|
25
26
|
tool_collection: ToolCollection,
|
|
26
|
-
data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
|
|
27
|
+
data: Union[FlashCardData, QuestionData, AudioRecapSectionData, TopicData, dict],
|
|
27
28
|
) -> str:
|
|
28
29
|
"""
|
|
29
30
|
Compute a deterministic content hash for a learning tool.
|
|
@@ -52,7 +53,7 @@ def compute_content_hash(
|
|
|
52
53
|
|
|
53
54
|
def extract_tool_text(
|
|
54
55
|
tool_collection: ToolCollection,
|
|
55
|
-
data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
|
|
56
|
+
data: Union[FlashCardData, QuestionData, AudioRecapSectionData, TopicData, dict],
|
|
56
57
|
) -> str:
|
|
57
58
|
"""
|
|
58
59
|
Extract the text content from a learning tool for embedding.
|
|
@@ -94,6 +95,8 @@ def extract_tool_text(
|
|
|
94
95
|
return _extract_question_text(data_dict)
|
|
95
96
|
elif tool_collection == "AudioRecapV2Section":
|
|
96
97
|
return _extract_audio_recap_text(data_dict)
|
|
98
|
+
elif tool_collection == "Topic":
|
|
99
|
+
return _extract_topic_text(data_dict)
|
|
97
100
|
else:
|
|
98
101
|
return ""
|
|
99
102
|
|
|
@@ -183,6 +186,29 @@ def _extract_audio_recap_text(data: dict) -> str:
|
|
|
183
186
|
return ""
|
|
184
187
|
|
|
185
188
|
|
|
189
|
+
def _extract_topic_text(data: dict) -> str:
|
|
190
|
+
"""
|
|
191
|
+
Extract text from Topic.
|
|
192
|
+
|
|
193
|
+
Format: "Topic: {topic}. Description: {description}."
|
|
194
|
+
"""
|
|
195
|
+
parts: list[str] = []
|
|
196
|
+
|
|
197
|
+
topic = data.get("topic")
|
|
198
|
+
if topic:
|
|
199
|
+
trimmed = topic.strip()
|
|
200
|
+
if trimmed:
|
|
201
|
+
parts.append(f"Topic: {trimmed}.")
|
|
202
|
+
|
|
203
|
+
description = data.get("description")
|
|
204
|
+
if description:
|
|
205
|
+
trimmed = description.strip()
|
|
206
|
+
if trimmed:
|
|
207
|
+
parts.append(f"Description: {trimmed}.")
|
|
208
|
+
|
|
209
|
+
return " ".join(parts)
|
|
210
|
+
|
|
211
|
+
|
|
186
212
|
def _strip_flashcard_syntax(text: str) -> str:
|
|
187
213
|
"""
|
|
188
214
|
Strip {{...}} markers from cloze/fill-in-blank text.
|
|
@@ -9,7 +9,7 @@ from typing import Literal, Optional, Union
|
|
|
9
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
10
10
|
|
|
11
11
|
# Tool collection types
|
|
12
|
-
ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section"]
|
|
12
|
+
ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section", "Topic"]
|
|
13
13
|
|
|
14
14
|
# FlashCard type variants
|
|
15
15
|
FlashCardType = Literal["BASIC", "CLOZE", "FILL_IN_THE_BLANK", "MULTIPLE_CHOICE"]
|
|
@@ -65,3 +65,12 @@ class AudioRecapSectionData(BaseModel):
|
|
|
65
65
|
model_config = ConfigDict(extra="allow")
|
|
66
66
|
|
|
67
67
|
script: Optional[str] = None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TopicData(BaseModel):
|
|
71
|
+
"""Topic data for content hashing."""
|
|
72
|
+
|
|
73
|
+
model_config = ConfigDict(extra="allow")
|
|
74
|
+
|
|
75
|
+
topic: Optional[str] = None
|
|
76
|
+
description: Optional[str] = None
|
|
@@ -154,26 +154,21 @@ class EmbeddingsNamespace(BaseNamespace):
|
|
|
154
154
|
Raises:
|
|
155
155
|
TimeoutError: If no result is received within the timeout
|
|
156
156
|
"""
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
f"No result received for {request_id} within {timeout}s"
|
|
173
|
-
)
|
|
174
|
-
finally:
|
|
175
|
-
pubsub.unsubscribe(channel)
|
|
176
|
-
pubsub.close()
|
|
157
|
+
list_key = f"embedding:response:{request_id}"
|
|
158
|
+
|
|
159
|
+
# BRPOP blocks until result is available or timeout
|
|
160
|
+
result = self._redis.brpop(list_key, timeout=timeout)
|
|
161
|
+
|
|
162
|
+
if result is None:
|
|
163
|
+
raise TimeoutError(
|
|
164
|
+
f"No result received for {request_id} within {timeout}s"
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# result = (key, value)
|
|
168
|
+
data = json.loads(result[1])
|
|
169
|
+
# Cleanup the response list
|
|
170
|
+
self._redis.delete(list_key)
|
|
171
|
+
return EmbeddingResult.from_dict(data)
|
|
177
172
|
|
|
178
173
|
def create_and_wait(
|
|
179
174
|
self,
|
|
@@ -189,8 +184,8 @@ class EmbeddingsNamespace(BaseNamespace):
|
|
|
189
184
|
"""
|
|
190
185
|
Create embeddings and wait for the result.
|
|
191
186
|
|
|
192
|
-
|
|
193
|
-
|
|
187
|
+
Uses BRPOP for efficient blocking wait - no race condition since the result
|
|
188
|
+
is pushed to a list that persists until consumed.
|
|
194
189
|
|
|
195
190
|
Args:
|
|
196
191
|
texts: List of text inputs
|
|
@@ -205,43 +200,22 @@ class EmbeddingsNamespace(BaseNamespace):
|
|
|
205
200
|
Returns:
|
|
206
201
|
The embedding result
|
|
207
202
|
"""
|
|
208
|
-
# Generate request ID upfront so we can subscribe before submitting
|
|
209
203
|
request_id = str(uuid.uuid4())
|
|
210
|
-
channel = f"embedding:result:{request_id}"
|
|
211
|
-
|
|
212
|
-
# Subscribe BEFORE submitting to avoid race condition
|
|
213
|
-
pubsub = self._redis.pubsub()
|
|
214
|
-
pubsub.subscribe(channel)
|
|
215
|
-
|
|
216
|
-
try:
|
|
217
|
-
# Now submit the request (subscription is already active)
|
|
218
|
-
self.create(
|
|
219
|
-
texts=texts,
|
|
220
|
-
content_type=content_type,
|
|
221
|
-
priority=priority,
|
|
222
|
-
storage=storage,
|
|
223
|
-
metadata=metadata,
|
|
224
|
-
request_id=request_id,
|
|
225
|
-
embedding_model=embedding_model,
|
|
226
|
-
embedding_dimensions=embedding_dimensions,
|
|
227
|
-
)
|
|
228
204
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
pubsub.unsubscribe(channel)
|
|
244
|
-
pubsub.close()
|
|
205
|
+
# Submit the request first
|
|
206
|
+
self.create(
|
|
207
|
+
texts=texts,
|
|
208
|
+
content_type=content_type,
|
|
209
|
+
priority=priority,
|
|
210
|
+
storage=storage,
|
|
211
|
+
metadata=metadata,
|
|
212
|
+
request_id=request_id,
|
|
213
|
+
embedding_model=embedding_model,
|
|
214
|
+
embedding_dimensions=embedding_dimensions,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Wait for result via BRPOP
|
|
218
|
+
return self.wait_for(request_id, timeout)
|
|
245
219
|
|
|
246
220
|
def get_queue_depth(self) -> dict[str, int]:
|
|
247
221
|
"""
|