sf-vector-sdk 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,431 @@
1
+ """
2
+ Structured Embeddings Namespace.
3
+
4
+ Provides type-safe methods for embedding known tool types (FlashCard, TestQuestion, etc.)
5
+ with automatic text extraction, content hash computation, and database routing.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any, Optional
10
+
11
+ from ..hash import (
12
+ AudioRecapSectionData,
13
+ FlashCardData,
14
+ ToolCollection,
15
+ compute_content_hash,
16
+ extract_tool_text,
17
+ )
18
+ from ..namespaces.base import BaseNamespace
19
+ from ..namespaces.embeddings import EmbeddingsNamespace
20
+ from ..types import EmbeddingResult
21
+ from .router import build_storage_config, get_content_type
22
+ from .tool_config import QuestionType, get_tool_config
23
+
24
+ # ============================================================================
25
+ # Types
26
+ # ============================================================================
27
+
28
+
29
@dataclass
class ToolMetadata:
    """Metadata to store alongside the embedding.

    Attributes:
        tool_id: Identifier of the tool; always emitted as ``toolId``.
        user_id: Optional user identifier; emitted as ``userId`` when truthy.
        topic_id: Optional topic identifier; emitted as ``topicId`` when truthy.
        extra: Optional additional fields merged into the document.
    """

    tool_id: str
    user_id: Optional[str] = None
    topic_id: Optional[str] = None
    extra: Optional[dict[str, Any]] = None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for document storage.

        Returns:
            A new dict holding ``toolId``, the optional ``userId`` /
            ``topicId`` (only when set to a truthy value), and every entry
            of ``extra`` whose key is not already present.
        """
        result: dict[str, Any] = {"toolId": self.tool_id}
        if self.user_id:
            result["userId"] = self.user_id
        if self.topic_id:
            result["topicId"] = self.topic_id
        if self.extra:
            # The explicit fields are authoritative: an ``extra`` entry must
            # not be able to replace the tool/user/topic identifiers, so only
            # keys that are still missing are filled in.
            for key, value in self.extra.items():
                result.setdefault(key, value)
        return result
48
+
49
+
50
@dataclass
class TestQuestionInput:
    """Question payload that additionally carries the question type.

    Attributes:
        question: The question text.
        answers: Answer entries, passed through unchanged.
        question_type: Optional question sub-type; not part of the dict
            produced by ``to_question_data``.
        explanation: Optional explanation text.
    """

    question: str
    answers: list[dict[str, Any]]
    question_type: Optional[QuestionType] = None
    explanation: Optional[str] = None

    def to_question_data(self) -> dict[str, Any]:
        """Build the QuestionData-compatible dict for this question.

        Returns:
            A dict with ``question`` and ``answers``; ``explanation`` is
            included only when it is truthy.
        """
        payload: dict[str, Any] = {"question": self.question, "answers": self.answers}
        if not self.explanation:
            return payload
        payload["explanation"] = self.explanation
        return payload
68
+
69
+
70
+ # ============================================================================
71
+ # StructuredEmbeddingsNamespace
72
+ # ============================================================================
73
+
74
+
75
class StructuredEmbeddingsNamespace(BaseNamespace):
    """
    Namespace for structured tool embeddings.

    Provides type-safe methods for embedding known tool types with automatic:
    - Text extraction per tool spec
    - Content hash computation
    - Namespace derivation based on tool type and sub-type
    - Database routing based on environment configuration

    Example:
        ```python
        client = VectorClient("redis://localhost:6379")

        # Embed a flashcard
        result = client.structured_embeddings.embed_flashcard_and_wait(
            data={"type": "BASIC", "term": "Mitochondria", "definition": "..."},
            metadata=ToolMetadata(tool_id="tool123", user_id="user456"),
        )

        # SDK automatically extracts text, computes hash, and routes to correct database
        ```
    """

    def __init__(self, redis: Any, embeddings: EmbeddingsNamespace, http_url: Optional[str] = None):
        """
        Initialize the namespace.

        Args:
            redis: Redis client instance
            embeddings: EmbeddingsNamespace instance for submitting requests
            http_url: Optional HTTP URL for query-gateway
        """
        super().__init__(redis, http_url)
        self._embeddings = embeddings

    # ==========================================================================
    # FlashCard Methods
    # ==========================================================================

    def embed_flashcard(
        self,
        data: FlashCardData,
        metadata: ToolMetadata,
    ) -> str:
        """
        Embed a flashcard and return the request ID.

        Args:
            data: FlashCard data (type, term, definition, multiple_choice_options)
            metadata: Tool metadata (tool_id, user_id, topic_id, etc.)

        Returns:
            The request ID

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        # The card's own "type" entry doubles as the routing sub-type.
        card_type = data.get("type")
        return self._embed_tool("FlashCard", data, metadata, card_type)

    def embed_flashcard_and_wait(
        self,
        data: FlashCardData,
        metadata: ToolMetadata,
        timeout: int = 60,
    ) -> EmbeddingResult:
        """
        Embed a flashcard and wait for the result.

        Args:
            data: FlashCard data
            metadata: Tool metadata
            timeout: Timeout in seconds (default: 60)

        Returns:
            The embedding result

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        card_type = data.get("type")
        return self._embed_tool_and_wait("FlashCard", data, metadata, card_type, timeout)

    # ==========================================================================
    # TestQuestion Methods
    # ==========================================================================

    def embed_test_question(
        self,
        data: TestQuestionInput,
        metadata: ToolMetadata,
    ) -> str:
        """
        Embed a test question and return the request ID.

        Args:
            data: Question data (question, answers, explanation, question_type)
            metadata: Tool metadata

        Returns:
            The request ID

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        return self._embed_tool(
            "TestQuestion",
            data.to_question_data(),
            metadata,
            data.question_type,
        )

    def embed_test_question_and_wait(
        self,
        data: TestQuestionInput,
        metadata: ToolMetadata,
        timeout: int = 60,
    ) -> EmbeddingResult:
        """
        Embed a test question and wait for the result.

        Args:
            data: Question data
            metadata: Tool metadata
            timeout: Timeout in seconds (default: 60)

        Returns:
            The embedding result

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        return self._embed_tool_and_wait(
            "TestQuestion",
            data.to_question_data(),
            metadata,
            data.question_type,
            timeout,
        )

    # ==========================================================================
    # SpacedTestQuestion Methods
    # ==========================================================================

    def embed_spaced_test_question(
        self,
        data: TestQuestionInput,
        metadata: ToolMetadata,
    ) -> str:
        """
        Embed a spaced test question and return the request ID.

        Args:
            data: Question data
            metadata: Tool metadata

        Returns:
            The request ID

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        return self._embed_tool(
            "SpacedTestQuestion",
            data.to_question_data(),
            metadata,
            data.question_type,
        )

    def embed_spaced_test_question_and_wait(
        self,
        data: TestQuestionInput,
        metadata: ToolMetadata,
        timeout: int = 60,
    ) -> EmbeddingResult:
        """
        Embed a spaced test question and wait for the result.

        Args:
            data: Question data
            metadata: Tool metadata
            timeout: Timeout in seconds (default: 60)

        Returns:
            The embedding result

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        return self._embed_tool_and_wait(
            "SpacedTestQuestion",
            data.to_question_data(),
            metadata,
            data.question_type,
            timeout,
        )

    # ==========================================================================
    # AudioRecap Methods
    # ==========================================================================

    def embed_audio_recap(
        self,
        data: AudioRecapSectionData,
        metadata: ToolMetadata,
    ) -> str:
        """
        Embed an audio recap section and return the request ID.

        Args:
            data: Audio recap data (script)
            metadata: Tool metadata

        Returns:
            The request ID

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        # Audio recap sections are submitted without a sub-type.
        return self._embed_tool("AudioRecapV2Section", data, metadata, None)

    def embed_audio_recap_and_wait(
        self,
        data: AudioRecapSectionData,
        metadata: ToolMetadata,
        timeout: int = 60,
    ) -> EmbeddingResult:
        """
        Embed an audio recap section and wait for the result.

        Args:
            data: Audio recap data
            metadata: Tool metadata
            timeout: Timeout in seconds (default: 60)

        Returns:
            The embedding result

        Raises:
            ValueError: If no text or no content hash can be derived from ``data``.
        """
        return self._embed_tool_and_wait(
            "AudioRecapV2Section",
            data,
            metadata,
            None,
            timeout,
        )

    # ==========================================================================
    # Internal Methods
    # ==========================================================================

    def _build_embedding_request(
        self,
        tool_collection: ToolCollection,
        data: dict[str, Any],
        metadata: ToolMetadata,
        sub_type: Optional[str],
    ) -> dict[str, Any]:
        """
        Build the keyword arguments shared by the submit and submit-and-wait paths.

        Args:
            tool_collection: The tool collection type being embedded
            data: The tool payload handed to text extraction and hashing
            metadata: Tool metadata merged into the stored document
            sub_type: Sub-type forwarded to the storage router (may be None)

        Returns:
            Keyword arguments accepted by both ``create`` and
            ``create_and_wait`` on the embeddings namespace (without ``timeout``).

        Raises:
            ValueError: If the extracted text or the content hash is empty.
        """
        # 1. Extract text using the spec
        text = extract_tool_text({"toolCollection": tool_collection, "data": data})
        if not text:
            raise ValueError(
                f"Failed to extract text from {tool_collection} - empty content"
            )

        # 2. Compute content hash
        content_hash = compute_content_hash(
            {"toolCollection": tool_collection, "data": data}
        )
        if not content_hash:
            raise ValueError(
                f"Failed to compute content hash for {tool_collection} - empty content"
            )

        # 3. Get tool config
        tool_config = get_tool_config(tool_collection)

        # 4. Build document with metadata. The two computed keys come last so
        #    caller-supplied metadata can never replace them.
        document = {
            **metadata.to_dict(),
            "toolCollection": tool_collection,
            "contentHash": content_hash,
        }

        # 5. Build storage config using router
        storage_config = build_storage_config(
            tool_collection=tool_collection,
            sub_type=sub_type,
            content_hash=content_hash,
            document_fields=document,
        )

        # 6. Build text input; the content hash is used as the text ID
        text_input = {
            "id": content_hash,
            "text": text,
            "document": document,
        }

        # 7. Assemble the request for the embeddings namespace
        return {
            "texts": [text_input],
            "content_type": get_content_type(tool_collection),
            "priority": tool_config.default_priority,
            "storage": storage_config,
            "metadata": {
                "toolCollection": tool_collection,
                "contentHash": content_hash,
            },
            "embedding_model": tool_config.model,
            "embedding_dimensions": tool_config.dimensions,
        }

    def _embed_tool(
        self,
        tool_collection: ToolCollection,
        data: dict[str, Any],
        metadata: ToolMetadata,
        sub_type: Optional[str],
    ) -> str:
        """
        Internal method to embed any tool type.

        Returns:
            The value returned by ``create`` on the embeddings namespace
            (the request ID).

        Raises:
            ValueError: If the extracted text or the content hash is empty.
        """
        request = self._build_embedding_request(
            tool_collection, data, metadata, sub_type
        )
        return self._embeddings.create(**request)

    def _embed_tool_and_wait(
        self,
        tool_collection: ToolCollection,
        data: dict[str, Any],
        metadata: ToolMetadata,
        sub_type: Optional[str],
        timeout: int = 60,
    ) -> EmbeddingResult:
        """
        Internal method to embed any tool type and wait for result.

        Returns:
            The value returned by ``create_and_wait`` on the embeddings
            namespace (the embedding result).

        Raises:
            ValueError: If the extracted text or the content hash is empty.
        """
        request = self._build_embedding_request(
            tool_collection, data, metadata, sub_type
        )
        return self._embeddings.create_and_wait(**request, timeout=timeout)
@@ -0,0 +1,254 @@
1
+ """
2
+ Tool Configuration for Structured Embeddings.
3
+
4
+ Defines the configuration for each tool type including:
5
+ - Embedding model and dimensions
6
+ - TurboPuffer namespace patterns
7
+ - Pinecone index and namespace patterns
8
+ - Enabled/disabled status for each database
9
+ """
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Literal, Optional
13
+
14
+ from ..hash import FlashCardType, ToolCollection
15
+ from ..types import PRIORITY_HIGH, PRIORITY_NORMAL
16
+
17
+ # ============================================================================
18
+ # Types
19
+ # ============================================================================
20
+
21
+
22
@dataclass(frozen=True)
class ToolDatabaseConfig:
    """Settings shared by every vector-database backend of a tool type.

    Attributes:
        enabled: Whether this database is enabled for the tool.
        id_field: Name of the document field used as the record ID.
        metadata_fields: Names of the document fields kept as metadata.
    """

    enabled: bool
    id_field: str
    metadata_fields: tuple[str, ...]
29
+
30
+
31
@dataclass(frozen=True)
class TurboPufferToolConfig(ToolDatabaseConfig):
    """TurboPuffer settings for a tool type.

    Attributes:
        namespace_pattern: Namespace template; may contain a ``{type}``
            placeholder.
    """

    namespace_pattern: str
36
+
37
+
38
@dataclass(frozen=True)
class PineconeToolConfig(ToolDatabaseConfig):
    """Pinecone settings for a tool type.

    Attributes:
        index_name: Name of the Pinecone index.
        namespace_pattern: Namespace template; may contain a ``{type}``
            placeholder.
    """

    index_name: str
    namespace_pattern: str
44
+
45
+
46
@dataclass(frozen=True)
class ToolConfig:
    """Complete embedding configuration for one tool collection type.

    Attributes:
        tool_collection: The tool collection this configuration belongs to.
        model: Embedding model name.
        dimensions: Embedding dimensions.
        default_priority: Priority used when submitting embedding requests.
        turbopuffer: TurboPuffer-specific settings.
        pinecone: Pinecone-specific settings.
    """

    tool_collection: ToolCollection
    model: str
    dimensions: int
    default_priority: str
    turbopuffer: TurboPufferToolConfig
    pinecone: PineconeToolConfig
56
+
57
+
58
+ # ============================================================================
59
+ # Tool Configurations
60
+ # ============================================================================
61
+
62
# Document fields stored as metadata for every tool type and database.
_DEFAULT_METADATA_FIELDS = (
    "toolId",
    "toolCollection",
    "topicId",
    "userId",
    "contentHash",
)


def _make_tool_config(
    tool_collection: ToolCollection,
    default_priority: str,
    turbopuffer_pattern: str,
    pinecone_pattern: str,
) -> ToolConfig:
    """Build one ToolConfig from the values that differ between tool types.

    Model, dimensions, ID field, metadata fields, Pinecone index and the
    enabled flags are identical for every tool type and are fixed here.
    """
    turbopuffer = TurboPufferToolConfig(
        enabled=True,
        id_field="contentHash",
        metadata_fields=_DEFAULT_METADATA_FIELDS,
        namespace_pattern=turbopuffer_pattern,
    )
    pinecone = PineconeToolConfig(
        enabled=False,
        index_name="tool-vectors",
        id_field="contentHash",
        metadata_fields=_DEFAULT_METADATA_FIELDS,
        namespace_pattern=pinecone_pattern,
    )
    return ToolConfig(
        tool_collection=tool_collection,
        model="gemini-embedding-001",
        dimensions=3072,
        default_priority=default_priority,
        turbopuffer=turbopuffer,
        pinecone=pinecone,
    )


# Keyed by tool collection name; entry order is FlashCard, TestQuestion,
# SpacedTestQuestion, AudioRecapV2Section.
TOOL_CONFIGS: dict[ToolCollection, ToolConfig] = {
    entry.tool_collection: entry
    for entry in (
        _make_tool_config(
            "FlashCard",
            PRIORITY_HIGH,
            "flashcard_{type}_tool_embedding",
            "flashcard_{type}",
        ),
        _make_tool_config(
            "TestQuestion",
            PRIORITY_HIGH,
            "testquestion_{type}_tool_embedding",
            "testquestion_{type}",
        ),
        _make_tool_config(
            "SpacedTestQuestion",
            PRIORITY_NORMAL,
            "spacedtestquestion_{type}_tool_embedding",
            "spacedtestquestion_{type}",
        ),
        # No "{type}" placeholder: audio recap sections have no sub-types.
        _make_tool_config(
            "AudioRecapV2Section",
            PRIORITY_NORMAL,
            "audiorecapv2section_tool_embedding",
            "audiorecapv2section",
        ),
    )
}
148
+
149
+
150
+ # ============================================================================
151
+ # Sub-type Mappings
152
+ # ============================================================================
153
+
154
# Accepted question sub-types; each value is used verbatim as the namespace suffix.
QuestionType = Literal["multiplechoice", "truefalse", "shortanswer", "fillinblank", "frq"]
161
+
162
+
163
def get_flashcard_namespace_suffix(card_type: Optional[FlashCardType]) -> str:
    """Return the namespace suffix for a flashcard type.

    Args:
        card_type: The flashcard type; a falsy value is treated as ``"BASIC"``.

    Returns:
        The lowercase suffix for the type, or ``"basic"`` when the type is
        not one of the known flashcard types.
    """
    suffix_by_type = {
        "BASIC": "basic",
        "CLOZE": "cloze",
        "FILL_IN_THE_BLANK": "fillintheblank",
        "MULTIPLE_CHOICE": "multiplechoice",
    }
    lookup_key = card_type if card_type else "BASIC"
    if lookup_key in suffix_by_type:
        return suffix_by_type[lookup_key]
    # Unknown types fall back to the basic namespace.
    return "basic"
172
+
173
+
174
def get_question_namespace_suffix(question_type: Optional[QuestionType]) -> str:
    """Return the namespace suffix for a question type.

    Args:
        question_type: The question type; used verbatim when truthy.

    Returns:
        ``question_type`` itself, or ``"multiplechoice"`` when it is falsy.
    """
    if question_type:
        return question_type
    return "multiplechoice"
177
+
178
+
179
+ # ============================================================================
180
+ # Namespace Derivation
181
+ # ============================================================================
182
+
183
+
184
def _derive_namespace(
    tool_collection: ToolCollection,
    pattern: str,
    sub_type: Optional[str],
) -> str:
    """Fill the ``{type}`` placeholder of a namespace pattern.

    Shared by the TurboPuffer and Pinecone derivations so both databases
    always resolve sub-types the same way.

    Args:
        tool_collection: The tool collection type
        pattern: The namespace pattern taken from the tool configuration
        sub_type: The sub-type (FlashCardType or QuestionType)

    Returns:
        ``pattern`` unchanged when it has no ``{type}`` placeholder,
        otherwise ``pattern`` with the placeholder replaced by the suffix.
    """
    # Tools without sub-types (AudioRecapV2Section in the current table)
    # use a pattern with no placeholder, so there is nothing to substitute.
    if "{type}" not in pattern:
        return pattern

    # FlashCard has its own type vocabulary; every other tool collection
    # is resolved through the question-type mapping.
    if tool_collection == "FlashCard":
        type_suffix = get_flashcard_namespace_suffix(sub_type)  # type: ignore
    else:
        type_suffix = get_question_namespace_suffix(sub_type)  # type: ignore

    return pattern.replace("{type}", type_suffix)


def get_turbopuffer_namespace(
    tool_collection: ToolCollection,
    sub_type: Optional[str] = None,
) -> str:
    """
    Derive the TurboPuffer namespace for a tool.

    Args:
        tool_collection: The tool collection type
        sub_type: The sub-type (FlashCardType or QuestionType)

    Returns:
        The derived namespace string

    Raises:
        KeyError: If ``tool_collection`` has no entry in ``TOOL_CONFIGS``.
    """
    config = TOOL_CONFIGS[tool_collection]
    return _derive_namespace(
        tool_collection, config.turbopuffer.namespace_pattern, sub_type
    )


def get_pinecone_namespace(
    tool_collection: ToolCollection,
    sub_type: Optional[str] = None,
) -> str:
    """
    Derive the Pinecone namespace for a tool.

    Args:
        tool_collection: The tool collection type
        sub_type: The sub-type (FlashCardType or QuestionType)

    Returns:
        The derived namespace string

    Raises:
        KeyError: If ``tool_collection`` has no entry in ``TOOL_CONFIGS``.
    """
    config = TOOL_CONFIGS[tool_collection]
    return _derive_namespace(
        tool_collection, config.pinecone.namespace_pattern, sub_type
    )
242
+
243
+
244
def get_tool_config(tool_collection: ToolCollection) -> ToolConfig:
    """
    Look up the configuration registered for a tool collection.

    Args:
        tool_collection: The tool collection type

    Returns:
        The matching entry of ``TOOL_CONFIGS``

    Raises:
        KeyError: If ``tool_collection`` has no entry in ``TOOL_CONFIGS``.
    """
    selected = TOOL_CONFIGS[tool_collection]
    return selected