sf-vector-sdk 0.2.0 (py3-none-any wheel)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sf_vector_sdk-0.2.0.dist-info/METADATA +476 -0
- sf_vector_sdk-0.2.0.dist-info/RECORD +27 -0
- sf_vector_sdk-0.2.0.dist-info/WHEEL +4 -0
- vector_sdk/__init__.py +262 -0
- vector_sdk/client.py +538 -0
- vector_sdk/content_types.py +233 -0
- vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.py +57 -0
- vector_sdk/generated/embedding_pipeline/content_types/v1/content_types_pb2.pyi +141 -0
- vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.py +58 -0
- vector_sdk/generated/embedding_pipeline/db/vectors/v1/vectors_pb2.pyi +145 -0
- vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py +58 -0
- vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.pyi +109 -0
- vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py +39 -0
- vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi +31 -0
- vector_sdk/hash/__init__.py +31 -0
- vector_sdk/hash/hasher.py +259 -0
- vector_sdk/hash/types.py +67 -0
- vector_sdk/namespaces/__init__.py +13 -0
- vector_sdk/namespaces/base.py +45 -0
- vector_sdk/namespaces/db.py +230 -0
- vector_sdk/namespaces/embeddings.py +268 -0
- vector_sdk/namespaces/search.py +258 -0
- vector_sdk/structured/__init__.py +60 -0
- vector_sdk/structured/router.py +190 -0
- vector_sdk/structured/structured_embeddings.py +431 -0
- vector_sdk/structured/tool_config.py +254 -0
- vector_sdk/types.py +864 -0
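
For orientation, here is a minimal usage sketch of the SDK surface implied by this file layout. It is a hedged sketch only: the `VectorClient` re-export and its constructor arguments are assumptions inferred from `vector_sdk/client.py` and from `BaseNamespace(redis, http_url)` shown later in this diff, not a documented API.

# Hedged sketch; VectorClient's signature is assumed, not confirmed by this diff.
from redis import Redis
from vector_sdk import VectorClient  # assumed re-export from vector_sdk/__init__.py

redis = Redis(host="localhost", port=6379)
client = VectorClient(redis=redis, http_url="http://localhost:8080")  # http_url assumed optional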
vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.py
ADDED
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: embedding_pipeline/query/v1/query.proto
# Protobuf Python Version: 6.33.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    33,
    4,
    '',
    'embedding_pipeline/query/v1/query.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


from embedding_pipeline.content_types.v1 import content_types_pb2 as embedding__pipeline_dot_content__types_dot_v1_dot_content__types__pb2


DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/query/v1/query.proto\x12\x1b\x65mbedding_pipeline.query.v1\x1a\x37\x65mbedding_pipeline/content_types/v1/content_types.proto\"\xc7\x04\n\x0cQueryRequest\x12\x1d\n\nrequest_id\x18\x01 \x01(\tR\trequestId\x12\x1d\n\nquery_text\x18\x02 \x01(\tR\tqueryText\x12O\n\x08\x64\x61tabase\x18\x03 \x01(\x0e\x32\x33.embedding_pipeline.content_types.v1.VectorDatabaseR\x08\x64\x61tabase\x12_\n\x10\x65mbedding_config\x18\x04 \x01(\x0b\x32\x34.embedding_pipeline.content_types.v1.EmbeddingConfigR\x0f\x65mbeddingConfig\x12K\n\x0cquery_config\x18\x05 \x01(\x0b\x32(.embedding_pipeline.query.v1.QueryConfigR\x0bqueryConfig\x12I\n\x08priority\x18\x06 \x01(\x0e\x32-.embedding_pipeline.content_types.v1.PriorityR\x08priority\x12S\n\x08metadata\x18\x07 \x03(\x0b\x32\x37.embedding_pipeline.query.v1.QueryRequest.MetadataEntryR\x08metadata\x12\x1d\n\ncreated_at\x18\x08 \x01(\tR\tcreatedAt\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\"\xfa\x02\n\x0bQueryConfig\x12\x13\n\x05top_k\x18\x01 \x01(\x05R\x04topK\x12\x1b\n\tmin_score\x18\x02 \x01(\x02R\x08minScore\x12O\n\x07\x66ilters\x18\x03 \x03(\x0b\x32\x35.embedding_pipeline.query.v1.QueryConfig.FiltersEntryR\x07\x66ilters\x12\x1c\n\tnamespace\x18\x04 \x01(\tR\tnamespace\x12\x1e\n\ncollection\x18\x05 \x01(\tR\ncollection\x12\x1a\n\x08\x64\x61tabase\x18\x06 \x01(\tR\x08\x64\x61tabase\x12\'\n\x0finclude_vectors\x18\x07 \x01(\x08R\x0eincludeVectors\x12)\n\x10include_metadata\x18\x08 \x01(\x08R\x0fincludeMetadata\x1a:\n\x0c\x46iltersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\"\x83\x02\n\x0bQueryResult\x12\x1d\n\nrequest_id\x18\x01 \x01(\tR\trequestId\x12\x16\n\x06status\x18\x02 \x01(\tR\x06status\x12\x42\n\x07matches\x18\x03 \x03(\x0b\x32(.embedding_pipeline.query.v1.VectorMatchR\x07matches\x12\x14\n\x05\x65rror\x18\x04 \x01(\tR\x05\x65rror\x12@\n\x06timing\x18\x05 \x01(\x0b\x32(.embedding_pipeline.query.v1.QueryTimingR\x06timing\x12!\n\x0c\x63ompleted_at\x18\x06 \x01(\tR\x0b\x63ompletedAt\"\xdc\x01\n\x0bVectorMatch\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x14\n\x05score\x18\x02 \x01(\x02R\x05score\x12R\n\x08metadata\x18\x03 \x03(\x0b\x32\x36.embedding_pipeline.query.v1.VectorMatch.MetadataEntryR\x08metadata\x12\x16\n\x06vector\x18\x04 \x03(\x02R\x06vector\x1a;\n\rMetadataEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\"\x8c\x01\n\x0bQueryTiming\x12\"\n\rqueue_wait_ms\x18\x01 \x01(\x03R\x0bqueueWaitMs\x12!\n\x0c\x65mbedding_ms\x18\x02 \x01(\x03R\x0b\x65mbeddingMs\x12\x1b\n\tsearch_ms\x18\x03 \x01(\x03R\x08searchMs\x12\x19\n\x08total_ms\x18\x04 \x01(\x03R\x07totalMsBgZegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/query/v1b\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'embedding_pipeline.query.v1.query_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  _globals['DESCRIPTOR']._loaded_options = None
  _globals['DESCRIPTOR']._serialized_options = b'Zegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/query/v1'
  _globals['_QUERYREQUEST_METADATAENTRY']._loaded_options = None
  _globals['_QUERYREQUEST_METADATAENTRY']._serialized_options = b'8\001'
  _globals['_QUERYCONFIG_FILTERSENTRY']._loaded_options = None
  _globals['_QUERYCONFIG_FILTERSENTRY']._serialized_options = b'8\001'
  _globals['_VECTORMATCH_METADATAENTRY']._loaded_options = None
  _globals['_VECTORMATCH_METADATAENTRY']._serialized_options = b'8\001'
  _globals['_QUERYREQUEST']._serialized_start=130
  _globals['_QUERYREQUEST']._serialized_end=713
  _globals['_QUERYREQUEST_METADATAENTRY']._serialized_start=654
  _globals['_QUERYREQUEST_METADATAENTRY']._serialized_end=713
  _globals['_QUERYCONFIG']._serialized_start=716
  _globals['_QUERYCONFIG']._serialized_end=1094
  _globals['_QUERYCONFIG_FILTERSENTRY']._serialized_start=1036
  _globals['_QUERYCONFIG_FILTERSENTRY']._serialized_end=1094
  _globals['_QUERYRESULT']._serialized_start=1097
  _globals['_QUERYRESULT']._serialized_end=1356
  _globals['_VECTORMATCH']._serialized_start=1359
  _globals['_VECTORMATCH']._serialized_end=1579
  _globals['_VECTORMATCH_METADATAENTRY']._serialized_start=654
  _globals['_VECTORMATCH_METADATAENTRY']._serialized_end=713
  _globals['_QUERYTIMING']._serialized_start=1582
  _globals['_QUERYTIMING']._serialized_end=1722
# @@protoc_insertion_point(module_scope)
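
A short sketch of building and serializing a request with the generated classes. It assumes `vector_sdk/generated` is importable as a package root (the generated module itself imports `embedding_pipeline.content_types.v1`); how the SDK wires that up is not shown in this diff.

# Sketch: constructing and round-tripping a QueryRequest.
from embedding_pipeline.query.v1 import query_pb2

request = query_pb2.QueryRequest(
    request_id="req-123",
    query_text="photosynthesis overview",
    query_config=query_pb2.QueryConfig(top_k=5, min_score=0.7, include_metadata=True),
)
request.metadata["user_id"] = "user-42"      # map<string, string> field
payload = request.SerializeToString()        # bytes for the wire
decoded = query_pb2.QueryRequest.FromString(payload)
assert decoded.query_config.top_k == 5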

vector_sdk/generated/embedding_pipeline/query/v1/query_pb2.pyi
ADDED
@@ -0,0 +1,109 @@
from embedding_pipeline.content_types.v1 import content_types_pb2 as _content_types_pb2
from google.protobuf.internal import containers as _containers
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union

DESCRIPTOR: _descriptor.FileDescriptor

class QueryRequest(_message.Message):
    __slots__ = ("request_id", "query_text", "database", "embedding_config", "query_config", "priority", "metadata", "created_at")
    class MetadataEntry(_message.Message):
        __slots__ = ("key", "value")
        KEY_FIELD_NUMBER: _ClassVar[int]
        VALUE_FIELD_NUMBER: _ClassVar[int]
        key: str
        value: str
        def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
    REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
    QUERY_TEXT_FIELD_NUMBER: _ClassVar[int]
    DATABASE_FIELD_NUMBER: _ClassVar[int]
    EMBEDDING_CONFIG_FIELD_NUMBER: _ClassVar[int]
    QUERY_CONFIG_FIELD_NUMBER: _ClassVar[int]
    PRIORITY_FIELD_NUMBER: _ClassVar[int]
    METADATA_FIELD_NUMBER: _ClassVar[int]
    CREATED_AT_FIELD_NUMBER: _ClassVar[int]
    request_id: str
    query_text: str
    database: _content_types_pb2.VectorDatabase
    embedding_config: _content_types_pb2.EmbeddingConfig
    query_config: QueryConfig
    priority: _content_types_pb2.Priority
    metadata: _containers.ScalarMap[str, str]
    created_at: str
    def __init__(self, request_id: _Optional[str] = ..., query_text: _Optional[str] = ..., database: _Optional[_Union[_content_types_pb2.VectorDatabase, str]] = ..., embedding_config: _Optional[_Union[_content_types_pb2.EmbeddingConfig, _Mapping]] = ..., query_config: _Optional[_Union[QueryConfig, _Mapping]] = ..., priority: _Optional[_Union[_content_types_pb2.Priority, str]] = ..., metadata: _Optional[_Mapping[str, str]] = ..., created_at: _Optional[str] = ...) -> None: ...

class QueryConfig(_message.Message):
    __slots__ = ("top_k", "min_score", "filters", "namespace", "collection", "database", "include_vectors", "include_metadata")
    class FiltersEntry(_message.Message):
        __slots__ = ("key", "value")
        KEY_FIELD_NUMBER: _ClassVar[int]
        VALUE_FIELD_NUMBER: _ClassVar[int]
        key: str
        value: str
        def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
    TOP_K_FIELD_NUMBER: _ClassVar[int]
    MIN_SCORE_FIELD_NUMBER: _ClassVar[int]
    FILTERS_FIELD_NUMBER: _ClassVar[int]
    NAMESPACE_FIELD_NUMBER: _ClassVar[int]
    COLLECTION_FIELD_NUMBER: _ClassVar[int]
    DATABASE_FIELD_NUMBER: _ClassVar[int]
    INCLUDE_VECTORS_FIELD_NUMBER: _ClassVar[int]
    INCLUDE_METADATA_FIELD_NUMBER: _ClassVar[int]
    top_k: int
    min_score: float
    filters: _containers.ScalarMap[str, str]
    namespace: str
    collection: str
    database: str
    include_vectors: bool
    include_metadata: bool
    def __init__(self, top_k: _Optional[int] = ..., min_score: _Optional[float] = ..., filters: _Optional[_Mapping[str, str]] = ..., namespace: _Optional[str] = ..., collection: _Optional[str] = ..., database: _Optional[str] = ..., include_vectors: _Optional[bool] = ..., include_metadata: _Optional[bool] = ...) -> None: ...

class QueryResult(_message.Message):
    __slots__ = ("request_id", "status", "matches", "error", "timing", "completed_at")
    REQUEST_ID_FIELD_NUMBER: _ClassVar[int]
    STATUS_FIELD_NUMBER: _ClassVar[int]
    MATCHES_FIELD_NUMBER: _ClassVar[int]
    ERROR_FIELD_NUMBER: _ClassVar[int]
    TIMING_FIELD_NUMBER: _ClassVar[int]
    COMPLETED_AT_FIELD_NUMBER: _ClassVar[int]
    request_id: str
    status: str
    matches: _containers.RepeatedCompositeFieldContainer[VectorMatch]
    error: str
    timing: QueryTiming
    completed_at: str
    def __init__(self, request_id: _Optional[str] = ..., status: _Optional[str] = ..., matches: _Optional[_Iterable[_Union[VectorMatch, _Mapping]]] = ..., error: _Optional[str] = ..., timing: _Optional[_Union[QueryTiming, _Mapping]] = ..., completed_at: _Optional[str] = ...) -> None: ...

class VectorMatch(_message.Message):
    __slots__ = ("id", "score", "metadata", "vector")
    class MetadataEntry(_message.Message):
        __slots__ = ("key", "value")
        KEY_FIELD_NUMBER: _ClassVar[int]
        VALUE_FIELD_NUMBER: _ClassVar[int]
        key: str
        value: str
        def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ...
    ID_FIELD_NUMBER: _ClassVar[int]
    SCORE_FIELD_NUMBER: _ClassVar[int]
    METADATA_FIELD_NUMBER: _ClassVar[int]
    VECTOR_FIELD_NUMBER: _ClassVar[int]
    id: str
    score: float
    metadata: _containers.ScalarMap[str, str]
    vector: _containers.RepeatedScalarFieldContainer[float]
    def __init__(self, id: _Optional[str] = ..., score: _Optional[float] = ..., metadata: _Optional[_Mapping[str, str]] = ..., vector: _Optional[_Iterable[float]] = ...) -> None: ...

class QueryTiming(_message.Message):
    __slots__ = ("queue_wait_ms", "embedding_ms", "search_ms", "total_ms")
    QUEUE_WAIT_MS_FIELD_NUMBER: _ClassVar[int]
    EMBEDDING_MS_FIELD_NUMBER: _ClassVar[int]
    SEARCH_MS_FIELD_NUMBER: _ClassVar[int]
    TOTAL_MS_FIELD_NUMBER: _ClassVar[int]
    queue_wait_ms: int
    embedding_ms: int
    search_ms: int
    total_ms: int
    def __init__(self, queue_wait_ms: _Optional[int] = ..., embedding_ms: _Optional[int] = ..., search_ms: _Optional[int] = ..., total_ms: _Optional[int] = ...) -> None: ...
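
The stub above also documents the result side of the query protocol. A small sketch of consuming a QueryResult, under the same import-path assumption as the previous sketch:

# Sketch: reading matches and timing off a QueryResult.
from embedding_pipeline.query.v1 import query_pb2

def top_match_ids(result: query_pb2.QueryResult, min_score: float = 0.0) -> list[str]:
    """Return match ids above a score threshold, best first."""
    matches = sorted(result.matches, key=lambda m: m.score, reverse=True)
    return [m.id for m in matches if m.score >= min_score]

result = query_pb2.QueryResult(request_id="req-123", status="ok")
match = result.matches.add(id="doc-1", score=0.92)   # repeated composite field
match.metadata["source"] = "lecture-3"
result.timing.total_ms = 48                          # auto-creates the timing submessage
print(top_match_ids(result, min_score=0.5))          # ['doc-1']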

vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.py
ADDED
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: embedding_pipeline/tools/v1/tools.proto
# Protobuf Python Version: 6.33.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    33,
    4,
    '',
    'embedding_pipeline/tools/v1/tools.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()




DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\'embedding_pipeline/tools/v1/tools.proto\x12\x1b\x65mbedding_pipeline.tools.v1*\xc9\x01\n\x0eToolCollection\x12\x1f\n\x1bTOOL_COLLECTION_UNSPECIFIED\x10\x00\x12\x1d\n\x19TOOL_COLLECTION_FLASHCARD\x10\x01\x12!\n\x1dTOOL_COLLECTION_TEST_QUESTION\x10\x02\x12(\n$TOOL_COLLECTION_SPACED_TEST_QUESTION\x10\x03\x12*\n&TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION\x10\x04*\xb2\x01\n\rFlashCardType\x12\x1f\n\x1b\x46LASH_CARD_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15\x46LASH_CARD_TYPE_BASIC\x10\x01\x12\x19\n\x15\x46LASH_CARD_TYPE_CLOZE\x10\x02\x12%\n!FLASH_CARD_TYPE_FILL_IN_THE_BLANK\x10\x03\x12#\n\x1f\x46LASH_CARD_TYPE_MULTIPLE_CHOICE\x10\x04\x42gZegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1b\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'embedding_pipeline.tools.v1.tools_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  _globals['DESCRIPTOR']._loaded_options = None
  _globals['DESCRIPTOR']._serialized_options = b'Zegithub.com/GoStudyFetchGo/vector-management-monorepo/packages/go/proto-go/embedding_pipeline/tools/v1'
  _globals['_TOOLCOLLECTION']._serialized_start=73
  _globals['_TOOLCOLLECTION']._serialized_end=274
  _globals['_FLASHCARDTYPE']._serialized_start=277
  _globals['_FLASHCARDTYPE']._serialized_end=455
# @@protoc_insertion_point(module_scope)

vector_sdk/generated/embedding_pipeline/tools/v1/tools_pb2.pyi
ADDED
@@ -0,0 +1,31 @@
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
from google.protobuf import descriptor as _descriptor
from typing import ClassVar as _ClassVar

DESCRIPTOR: _descriptor.FileDescriptor

class ToolCollection(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
    __slots__ = ()
    TOOL_COLLECTION_UNSPECIFIED: _ClassVar[ToolCollection]
    TOOL_COLLECTION_FLASHCARD: _ClassVar[ToolCollection]
    TOOL_COLLECTION_TEST_QUESTION: _ClassVar[ToolCollection]
    TOOL_COLLECTION_SPACED_TEST_QUESTION: _ClassVar[ToolCollection]
    TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: _ClassVar[ToolCollection]

class FlashCardType(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
    __slots__ = ()
    FLASH_CARD_TYPE_UNSPECIFIED: _ClassVar[FlashCardType]
    FLASH_CARD_TYPE_BASIC: _ClassVar[FlashCardType]
    FLASH_CARD_TYPE_CLOZE: _ClassVar[FlashCardType]
    FLASH_CARD_TYPE_FILL_IN_THE_BLANK: _ClassVar[FlashCardType]
    FLASH_CARD_TYPE_MULTIPLE_CHOICE: _ClassVar[FlashCardType]
TOOL_COLLECTION_UNSPECIFIED: ToolCollection
TOOL_COLLECTION_FLASHCARD: ToolCollection
TOOL_COLLECTION_TEST_QUESTION: ToolCollection
TOOL_COLLECTION_SPACED_TEST_QUESTION: ToolCollection
TOOL_COLLECTION_AUDIO_RECAP_V2_SECTION: ToolCollection
FLASH_CARD_TYPE_UNSPECIFIED: FlashCardType
FLASH_CARD_TYPE_BASIC: FlashCardType
FLASH_CARD_TYPE_CLOZE: FlashCardType
FLASH_CARD_TYPE_FILL_IN_THE_BLANK: FlashCardType
FLASH_CARD_TYPE_MULTIPLE_CHOICE: FlashCardType
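
These are standard protobuf enum wrappers, so name/value lookups work as in the sketch below (same import-path assumption as the earlier sketches):

# Sketch: name/value round-trips on the generated enums.
from embedding_pipeline.tools.v1 import tools_pb2

assert tools_pb2.ToolCollection.Value("TOOL_COLLECTION_FLASHCARD") == tools_pb2.TOOL_COLLECTION_FLASHCARD
assert tools_pb2.ToolCollection.Name(tools_pb2.TOOL_COLLECTION_FLASHCARD) == "TOOL_COLLECTION_FLASHCARD"
assert tools_pb2.FlashCardType.Name(tools_pb2.FLASH_CARD_TYPE_CLOZE) == "FLASH_CARD_TYPE_CLOZE"
assert tools_pb2.TOOL_COLLECTION_UNSPECIFIED == 0  # proto3 zero default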

vector_sdk/hash/__init__.py
ADDED
@@ -0,0 +1,31 @@
"""
Content Hash Module.

Generates deterministic content hashes for learning tools.
Mirrors the TypeScript implementation in packages/ts/vector-sdk/src/common/hash.ts

See docs/CONTENT-HASH-SPEC.md for the full specification.
"""

from .hasher import compute_content_hash, extract_tool_text
from .types import (
    AnswerObject,
    AudioRecapSectionData,
    FlashCardData,
    FlashCardType,
    MultipleChoiceOption,
    QuestionData,
    ToolCollection,
)

__all__ = [
    "compute_content_hash",
    "extract_tool_text",
    "ToolCollection",
    "FlashCardType",
    "FlashCardData",
    "QuestionData",
    "AudioRecapSectionData",
    "MultipleChoiceOption",
    "AnswerObject",
]

vector_sdk/hash/hasher.py
ADDED
@@ -0,0 +1,259 @@
"""
Content hash computation.

Generates deterministic content hashes for learning tools.
Mirrors the TypeScript implementation in packages/ts/vector-sdk/src/common/hash.ts
"""

import hashlib
import re
from typing import Union

from .types import (
    AudioRecapSectionData,
    FlashCardData,
    MultipleChoiceOption,
    QuestionData,
    ToolCollection,
)

# Hash length in hex characters (128 bits = 32 hex chars)
HASH_LENGTH = 32


def compute_content_hash(
    tool_collection: ToolCollection,
    data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
) -> str:
    """
    Compute a deterministic content hash for a learning tool.

    The hash is computed by:
    1. Extracting text content based on the tool collection type
    2. Computing SHA-256 hash of the text
    3. Truncating to first 32 hex characters (128 bits)

    For FlashCard, if no type is provided, defaults to "BASIC".
    Other tool collections (TestQuestion, SpacedTestQuestion, AudioRecapV2Section)
    do not have type variants.

    Args:
        tool_collection: The tool type (FlashCard, TestQuestion, etc.)
        data: Tool-specific data (can be Pydantic model or dict)

    Returns:
        The content hash (32 hex chars) or empty string if no content
    """
    text = extract_tool_text(tool_collection, data)
    if not text:
        return ""
    return _compute_hash(text)


def extract_tool_text(
    tool_collection: ToolCollection,
    data: Union[FlashCardData, QuestionData, AudioRecapSectionData, dict],
) -> str:
    """
    Extract the text content from a learning tool for embedding.

    This function extracts the text that will be used for vector embedding
    based on the tool collection type. Use this when you need the raw text
    for embedding rather than the hash.

    Text extraction rules:
    - FlashCard (BASIC): "Term: {term} Definition: {definition}"
    - FlashCard (CLOZE/FILL_IN_THE_BLANK): Same as BASIC, with {{...}} syntax stripped
    - FlashCard (MULTIPLE_CHOICE): "Term: {term} Options: {opt1, opt2, ...}"
    - TestQuestion/SpacedTestQuestion: "Question: {question} Answers: {a1, a2, ...} Explanation: {explanation}"
    - AudioRecapV2Section: "Script: {script}"

    Args:
        tool_collection: The tool type (FlashCard, TestQuestion, etc.)
        data: Tool-specific data (can be Pydantic model or dict)

    Returns:
        The extracted text string or empty string if no content

    Example:
        >>> text = extract_tool_text(
        ...     "FlashCard",
        ...     {"type": "BASIC", "term": "Hello", "definition": "World"}
        ... )
        >>> # Returns: "Term: Hello Definition: World"
    """
    # Convert dict to appropriate model if needed
    if isinstance(data, dict):
        data_dict = data
    else:
        data_dict = data.model_dump(by_alias=True) if hasattr(data, "model_dump") else dict(data)

    if tool_collection == "FlashCard":
        return _extract_flashcard_text(data_dict)
    elif tool_collection in ("TestQuestion", "SpacedTestQuestion"):
        return _extract_question_text(data_dict)
    elif tool_collection == "AudioRecapV2Section":
        return _extract_audio_recap_text(data_dict)
    else:
        return ""


def _compute_hash(text: str) -> str:
    """Compute SHA-256 hash and truncate to HASH_LENGTH characters."""
    if not text:
        return ""
    hash_bytes = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return hash_bytes[:HASH_LENGTH]


def _extract_flashcard_text(data: dict) -> str:
    """
    Extract text from a FlashCard document.

    Handles all variants: BASIC, CLOZE, FILL_IN_THE_BLANK, MULTIPLE_CHOICE
    """
    parts: list[str] = []
    card_type = data.get("type") or "BASIC"

    # Extract term (all types have this)
    term = data.get("term")
    if term:
        clean_term = _strip_flashcard_syntax(term.strip())
        if clean_term:
            parts.append(f"Term: {clean_term}")

    # Handle definition/options based on type
    if card_type == "MULTIPLE_CHOICE":
        # Check both camelCase and snake_case keys
        options = data.get("multipleChoiceOptions") or data.get("multiple_choice_options")
        option_texts = _get_multiple_choice_options(options)
        if option_texts:
            parts.append(f"Options: {', '.join(option_texts)}")
    else:
        # BASIC, CLOZE, FILL_IN_THE_BLANK use definition
        definition = data.get("definition")
        if definition:
            clean_def = _strip_flashcard_syntax(definition.strip())
            if clean_def:
                parts.append(f"Definition: {clean_def}")

    return " ".join(parts)


def _extract_question_text(data: dict) -> str:
    """
    Extract text from TestQuestion or SpacedTestQuestion.

    Format: "Question: {question} Answers: {a1, a2, ...} Explanation: {explanation}"
    """
    parts: list[str] = []

    question = data.get("question")
    if question:
        trimmed = question.strip()
        if trimmed:
            parts.append(f"Question: {trimmed}")

    # Extract answers array
    answers = data.get("answers")
    answer_texts = _get_string_array(answers)
    if answer_texts:
        parts.append(f"Answers: {', '.join(answer_texts)}")

    explanation = data.get("explanation")
    if explanation:
        trimmed = explanation.strip()
        if trimmed:
            parts.append(f"Explanation: {trimmed}")

    return " ".join(parts)


def _extract_audio_recap_text(data: dict) -> str:
    """
    Extract text from AudioRecapV2Section.

    Format: "Script: {script}"
    """
    script = data.get("script")
    if script:
        trimmed = script.strip()
        if trimmed:
            return f"Script: {trimmed}"
    return ""


def _strip_flashcard_syntax(text: str) -> str:
    """
    Strip {{...}} markers from cloze/fill-in-blank text.

    Example: "The {{mitochondria}} is the powerhouse" -> "The mitochondria is the powerhouse"
    """
    if not text:
        return ""
    # Replace {{word}} with just word
    return re.sub(r"\{\{([^}]+)\}\}", r"\1", text)


def _get_multiple_choice_options(options: list | None) -> list[str]:
    """
    Extract option text from multiple choice options array.

    Options can be strings or objects with "text" or "option" fields.
    """
    if not options or not isinstance(options, list):
        return []

    result: list[str] = []
    for opt in options:
        text = _extract_option_text(opt)
        if text:
            result.append(text)
    return result


def _extract_option_text(item: str | dict | MultipleChoiceOption) -> str:
    """Extract text from a single option (string or object)."""
    if isinstance(item, str):
        return item.strip()

    if isinstance(item, dict):
        # Try "text" field first, then "option"
        text = item.get("text")
        if text and isinstance(text, str):
            return text.strip()
        option = item.get("option")
        if option and isinstance(option, str):
            return option.strip()

    if hasattr(item, "text") and item.text:
        return item.text.strip()
    if hasattr(item, "option") and item.option:
        return item.option.strip()

    return ""


def _get_string_array(answers: list | None) -> list[str]:
    """Extract string array from answers (handles both string[] and AnswerObject[])."""
    if not answers or not isinstance(answers, list):
        return []

    result: list[str] = []
    for ans in answers:
        if isinstance(ans, str):
            trimmed = ans.strip()
            if trimmed:
                result.append(trimmed)
        elif isinstance(ans, dict):
            text = ans.get("text")
            if text and isinstance(text, str):
                trimmed = text.strip()
                if trimmed:
                    result.append(trimmed)
        elif hasattr(ans, "text") and ans.text:
            trimmed = ans.text.strip()
            if trimmed:
                result.append(trimmed)

    return result
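
A quick usage sketch that follows directly from the functions above; the flashcard payload is made up for illustration.

# Sketch: the hash is just SHA-256 of the extracted text, truncated to 32 hex chars.
import hashlib

from vector_sdk.hash import compute_content_hash, extract_tool_text

card = {"type": "BASIC", "term": "Mitochondria", "definition": "The powerhouse of the cell"}

text = extract_tool_text("FlashCard", card)
# "Term: Mitochondria Definition: The powerhouse of the cell"

digest = compute_content_hash("FlashCard", card)
assert digest == hashlib.sha256(text.encode("utf-8")).hexdigest()[:32]
assert compute_content_hash("FlashCard", {}) == ""  # no content -> empty string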
vector_sdk/hash/types.py
ADDED
@@ -0,0 +1,67 @@
"""
Type definitions for content hashing.

These types mirror the TypeScript SDK types and are derived from the proto definitions.
"""

from typing import Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field

# Tool collection types
ToolCollection = Literal["FlashCard", "TestQuestion", "SpacedTestQuestion", "AudioRecapV2Section"]

# FlashCard type variants
FlashCardType = Literal["BASIC", "CLOZE", "FILL_IN_THE_BLANK", "MULTIPLE_CHOICE"]


class MultipleChoiceOption(BaseModel):
    """Multiple choice option structure."""

    model_config = ConfigDict(extra="allow")

    text: Optional[str] = None
    option: Optional[str] = None


class AnswerObject(BaseModel):
    """Answer object structure (can be string or object with text field)."""

    model_config = ConfigDict(extra="allow")

    text: Optional[str] = None


class FlashCardData(BaseModel):
    """
    FlashCard data for content hashing.

    If type is not provided, defaults to "BASIC".
    """

    model_config = ConfigDict(extra="allow", populate_by_name=True)

    type: Optional[FlashCardType] = "BASIC"
    term: Optional[str] = None
    definition: Optional[str] = None
    multiple_choice_options: Optional[list[Union[str, MultipleChoiceOption, dict]]] = Field(
        default=None, alias="multipleChoiceOptions"
    )


class QuestionData(BaseModel):
    """Question data for TestQuestion and SpacedTestQuestion."""

    model_config = ConfigDict(extra="allow")

    question: Optional[str] = None
    answers: Optional[list[Union[str, AnswerObject, dict]]] = None
    explanation: Optional[str] = None


class AudioRecapSectionData(BaseModel):
    """AudioRecapV2Section data for content hashing."""

    model_config = ConfigDict(extra="allow")

    script: Optional[str] = None
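
Because the models allow extra fields and accept the `multipleChoiceOptions` alias (with `populate_by_name=True`), camelCase dict payloads and typed models hash the same way. A short sketch:

# Sketch: dict input and Pydantic-model input produce the same content hash.
from vector_sdk.hash import FlashCardData, compute_content_hash

as_dict = {
    "type": "MULTIPLE_CHOICE",
    "term": "Which organelle produces ATP?",
    "multipleChoiceOptions": ["Mitochondria", "Nucleus", {"text": "Ribosome"}],
}
as_model = FlashCardData(**as_dict)  # alias accepted; model_dump(by_alias=True) restores the camelCase key

assert compute_content_hash("FlashCard", as_dict) == compute_content_hash("FlashCard", as_model)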

vector_sdk/namespaces/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""Namespace exports for the Vector SDK."""

from vector_sdk.namespaces.base import BaseNamespace
from vector_sdk.namespaces.db import DBNamespace
from vector_sdk.namespaces.embeddings import EmbeddingsNamespace
from vector_sdk.namespaces.search import SearchNamespace

__all__ = [
    "BaseNamespace",
    "EmbeddingsNamespace",
    "SearchNamespace",
    "DBNamespace",
]

vector_sdk/namespaces/base.py
ADDED
@@ -0,0 +1,45 @@
"""
Base namespace class providing shared context for all namespace implementations.
"""

from typing import Optional

from redis import Redis


class BaseNamespace:
    """
    Base class for all namespace implementations.
    Provides access to shared Redis and HTTP clients.
    """

    def __init__(self, redis: Redis, http_url: Optional[str] = None):
        """
        Initialize the base namespace.

        Args:
            redis: Redis client instance
            http_url: Optional HTTP URL for query-gateway API
        """
        self._redis = redis
        self._http_url = http_url

    def _require_http_url(self, method_name: str) -> str:
        """
        Helper to require http_url for HTTP-based operations.

        Args:
            method_name: Name of the method requiring http_url

        Returns:
            The http_url

        Raises:
            ValueError: If http_url is not configured
        """
        if not self._http_url:
            raise ValueError(
                f"http_url is required for {method_name}. "
                "Set it in VectorClient constructor."
            )
        return self._http_url
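
A hedged sketch of how a namespace might build on this base class; `HealthNamespace` and the `/healthz` route are hypothetical and not part of this package (the real DBNamespace, SearchNamespace, and EmbeddingsNamespace live in the other files listed above).

# Hypothetical subclass for illustration only.
from urllib.request import urlopen

from redis import Redis
from vector_sdk.namespaces.base import BaseNamespace


class HealthNamespace(BaseNamespace):
    """Checks the query-gateway over HTTP; Redis is available as self._redis."""

    def ping(self) -> bool:
        base_url = self._require_http_url("ping")  # raises ValueError if http_url is unset
        with urlopen(f"{base_url}/healthz", timeout=5) as resp:  # hypothetical route
            return resp.status == 200


ns = HealthNamespace(redis=Redis(), http_url="http://localhost:8080")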