zvec 0.2.1b2.dev1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. _zvec.cp310-win_amd64.pyd +0 -0
  2. gflags_nothreads.dll +0 -0
  3. glog.dll +0 -0
  4. lib/zvec_ailego.lib +0 -0
  5. lib/zvec_core.lib +0 -0
  6. lib/zvec_turbo.lib +0 -0
  7. libprotobuf.dll +0 -0
  8. roaring.dll +0 -0
  9. zvec/__init__.py +168 -0
  10. zvec/__init__.pyi +171 -0
  11. zvec/common/__init__.py +18 -0
  12. zvec/common/constants.py +33 -0
  13. zvec/executor/__init__.py +26 -0
  14. zvec/executor/query_executor.py +307 -0
  15. zvec/extension/__init__.py +55 -0
  16. zvec/extension/bm25_embedding_function.py +375 -0
  17. zvec/extension/embedding_function.py +147 -0
  18. zvec/extension/http_embedding_function.py +162 -0
  19. zvec/extension/jina_embedding_function.py +240 -0
  20. zvec/extension/jina_function.py +182 -0
  21. zvec/extension/multi_vector_reranker.py +174 -0
  22. zvec/extension/openai_embedding_function.py +238 -0
  23. zvec/extension/openai_function.py +149 -0
  24. zvec/extension/qwen_embedding_function.py +537 -0
  25. zvec/extension/qwen_function.py +186 -0
  26. zvec/extension/qwen_rerank_function.py +162 -0
  27. zvec/extension/rerank_function.py +69 -0
  28. zvec/extension/sentence_transformer_embedding_function.py +839 -0
  29. zvec/extension/sentence_transformer_function.py +150 -0
  30. zvec/extension/sentence_transformer_rerank_function.py +384 -0
  31. zvec/model/__init__.py +22 -0
  32. zvec/model/collection.py +411 -0
  33. zvec/model/convert.py +54 -0
  34. zvec/model/doc.py +173 -0
  35. zvec/model/param/__init__.py +46 -0
  36. zvec/model/param/__init__.pyi +823 -0
  37. zvec/model/param/vector_query.py +80 -0
  38. zvec/model/schema/__init__.py +21 -0
  39. zvec/model/schema/__init__.pyi +109 -0
  40. zvec/model/schema/collection_schema.py +215 -0
  41. zvec/model/schema/field_schema.py +300 -0
  42. zvec/py.typed +0 -0
  43. zvec/tool/__init__.py +18 -0
  44. zvec/tool/util.py +63 -0
  45. zvec/typing/__init__.py +32 -0
  46. zvec/typing/__init__.pyi +404 -0
  47. zvec/typing/enum.py +62 -0
  48. zvec/zvec.py +226 -0
  49. zvec-0.2.1b2.dev1.dist-info/METADATA +183 -0
  50. zvec-0.2.1b2.dev1.dist-info/RECORD +52 -0
  51. zvec-0.2.1b2.dev1.dist-info/WHEEL +5 -0
  52. zvec-0.2.1b2.dev1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,307 @@
1
+ # Copyright 2025-present the zvec project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from typing import Optional, Union, final
20
+
21
+ import numpy as np
22
+ from _zvec import _Collection
23
+ from _zvec.param import _VectorQuery
24
+
25
+ from ..extension import ReRanker, RrfReRanker, WeightedReRanker
26
+ from ..model.convert import convert_to_py_doc
27
+ from ..model.doc import Doc
28
+ from ..model.param.vector_query import VectorQuery
29
+ from ..model.schema import CollectionSchema
30
+ from ..typing import DataType
31
+
32
+ __all__ = [
33
+ "QueryContext",
34
+ "QueryExecutor",
35
+ "QueryExecutorFactory",
36
+ ]
37
+
38
# Lookup table from a vector ``DataType`` enum *value* to the numpy element
# type used when converting query vectors. Only the four vector types listed
# here have an entry.
DTYPE_MAP = {
    data_type.value: numpy_type
    for data_type, numpy_type in (
        (DataType.VECTOR_FP16, np.float16),
        (DataType.VECTOR_FP32, np.float32),
        (DataType.VECTOR_FP64, np.float64),
        (DataType.VECTOR_INT8, np.int8),
    )
}
44
+
45
+
46
def convert_to_numpy(vec: Union[list, np.ndarray], dtype: np.dtype) -> np.ndarray:
    """Coerce ``vec`` into a one-dimensional numpy array of ``dtype``.

    Args:
        vec: Vector data, either a (possibly nested) list or a numpy array.
        dtype: Target numpy element type.

    Returns:
        ``vec`` itself when it already is a 1-D array of ``dtype``; otherwise
        a converted array, flattened when it is not 1-D.

    Raises:
        TypeError: If numpy cannot convert ``vec`` to ``dtype``. This applies
            to list and ndarray inputs alike, so callers see a single error
            type regardless of the container they passed.
    """
    # Fast path: nothing to convert, hand the caller's array back untouched.
    if isinstance(vec, np.ndarray) and vec.ndim == 1 and vec.dtype == dtype:
        return vec

    try:
        arr = np.asarray(vec, dtype=dtype)
    except (ValueError, TypeError) as e:
        raise TypeError(
            f"Cannot convert input to 1D numpy array with dtype={dtype}: {type(vec)}"
        ) from e

    # ``flatten`` copies, so only call it when the shape really needs changing.
    return arr if arr.ndim == 1 else arr.flatten()
61
+
62
+
63
class QueryContext:
    """Holder for the parameters of one query request.

    Every constructor argument is exposed through a read-only property.
    ``core_vectors`` starts out as an empty list and is the only attribute
    that can be replaced after construction, through its setter.
    """

    def __init__(
        self,
        topk: int,
        filter: Optional[str] = None,
        include_vector: bool = False,
        queries: Optional[list[VectorQuery]] = None,
        output_fields: Optional[list[str]] = None,
        reranker: Optional[ReRanker] = None,
    ):
        # Query parameters. A missing or empty ``queries`` argument is stored
        # as a fresh empty list.
        self._topk = topk
        self._filter = filter
        self._include_vector = include_vector
        self._output_fields = output_fields
        self._queries = queries if queries else []

        # Optional reranker supplied by the caller.
        self._reranker = reranker

        # Low-level vector queries; empty until assigned via the setter.
        self._core_vectors = []

    @property
    def topk(self):
        """The ``topk`` value given at construction."""
        return self._topk

    @property
    def queries(self):
        """The list of vector queries (empty list when none were given)."""
        return self._queries

    @property
    def filter(self):
        """The filter expression, or ``None``."""
        return self._filter

    @property
    def reranker(self):
        """The reranker, or ``None``."""
        return self._reranker

    @property
    def output_fields(self):
        """The requested output field names, or ``None``."""
        return self._output_fields

    @property
    def include_vector(self):
        """The ``include_vector`` flag given at construction."""
        return self._include_vector

    @property
    def core_vectors(self):
        """The low-level vector queries currently attached to this context."""
        return self._core_vectors

    @core_vectors.setter
    def core_vectors(self, core_vectors: list[_VectorQuery]):
        self._core_vectors = core_vectors
117
+
118
+
119
class QueryExecutor(ABC):
    """Template for running a query against a collection.

    ``execute`` drives four fixed steps: validate the context, build the
    low-level vector queries, run them, then merge/rerank the per-field
    results. Subclasses provide ``_do_validate`` and ``_do_build``.

    The number of worker threads used when several queries are run is read
    from the ``ZVEC_QUERY_CONCURRENCY`` environment variable (default ``1``,
    values below 1 are clamped to 1).
    """

    def __init__(self, schema: CollectionSchema):
        """Store the schema and read the concurrency setting.

        Raises:
            ValueError: If ``ZVEC_QUERY_CONCURRENCY`` is set to something that
                is not an integer.
        """
        self._schema = schema
        raw_concurrency = os.getenv("ZVEC_QUERY_CONCURRENCY", "1")
        try:
            concurrency = int(raw_concurrency)
        except ValueError as e:
            # Same exception type as a bare int() failure, but the message
            # names the offending environment variable.
            raise ValueError(
                f"ZVEC_QUERY_CONCURRENCY must be an integer, got {raw_concurrency!r}"
            ) from e
        self._concurrency = max(1, concurrency)

    @abstractmethod
    def _do_validate(self, ctx: QueryContext) -> None:
        """Check ``ctx``; implementations raise when it is not acceptable."""

    @abstractmethod
    def _do_build(
        self, ctx: QueryContext, collection: _Collection
    ) -> list[_VectorQuery]:
        """Translate ``ctx`` into the low-level queries to run."""

    def _do_build_query_wo_vector(self, ctx: QueryContext) -> _VectorQuery:
        """Build a low-level query carrying only the non-vector settings.

        ``topk`` and ``include_vector`` are always copied; ``filter`` and
        ``output_fields`` only when they are truthy.
        """
        core_vector = _VectorQuery()
        core_vector.topk = ctx.topk
        core_vector.include_vector = ctx.include_vector
        if ctx.filter:
            core_vector.filter = ctx.filter
        if ctx.output_fields:
            core_vector.output_fields = ctx.output_fields
        return core_vector

    def _do_build_query_with_vector(
        self, ctx: QueryContext, query: VectorQuery, collection: _Collection
    ) -> _VectorQuery:
        """Build a low-level query for one ``VectorQuery``.

        The vector comes from ``query`` itself when ``query.has_vector()`` is
        true; otherwise the document ``query.id`` is fetched from
        ``collection`` and its stored vector is used. If that fetch yields no
        document, the query is returned without a vector set.

        Raises:
            ValueError: If the schema has no matching vector field.
        """
        core_vector = self._do_build_query_wo_vector(ctx)
        core_vector.field_name = query.field_name
        if query.param:
            core_vector.query_params = query.param

        # NOTE(review): ``query`` has already been dereferenced above, so the
        # ``else`` arm is only reachable if VectorQuery can be falsy - confirm.
        vector_schema = (
            self._schema.vector(query.field_name) if query else self._schema.vectors[0]
        )

        if vector_schema is None:
            raise ValueError("No vector field found")

        # NOTE(review): unconditional, unlike the truthiness-guarded assignment
        # in _do_build_query_wo_vector; kept as-is to preserve behavior.
        core_vector.output_fields = ctx.output_fields

        if query.has_vector():
            vec_data = query.vector
        else:
            fetched = collection.Fetch([query.id])
            # An empty fetch result is treated like a missing document rather
            # than leaking StopIteration to the caller.
            doc = next(iter(fetched.values()), None)
            if not doc:
                return core_vector
            vec_data = doc.get_any(vector_schema.name, vector_schema.data_type)

        # Types without a DTYPE_MAP entry are passed through unconverted.
        target_dtype = DTYPE_MAP.get(vector_schema.data_type.value)
        core_vector.set_vector(
            vector_schema._get_object(),
            convert_to_numpy(vec_data, target_dtype) if target_dtype else vec_data,
        )
        return core_vector

    def _do_execute(
        self, vectors: list[_VectorQuery], collection: _Collection
    ) -> dict[str, list[Doc]]:
        """Run every query and return the converted docs keyed by field name.

        Queries run one after another when there is a single query or the
        concurrency is 1, otherwise on a thread pool. In both cases the
        returned dict is filled in the order of ``vectors`` so the result
        does not depend on which query finishes first.

        Raises:
            ValueError: If ``vectors`` is empty.
        """
        if not vectors:
            raise ValueError("No query to execute")

        if len(vectors) == 1 or self._concurrency == 1:
            raw_results = [
                (query.field_name, collection.Query(query)) for query in vectors
            ]
        else:
            with ThreadPoolExecutor(max_workers=self._concurrency) as executor:
                submitted = [
                    (query.field_name, executor.submit(collection.Query, query))
                    for query in vectors
                ]
                # Collect in submission order; ``result()`` re-raises any
                # exception from the worker.
                raw_results = [
                    (field_name, future.result()) for field_name, future in submitted
                ]

        return {
            field_name: [convert_to_py_doc(doc, self._schema) for doc in docs]
            for field_name, docs in raw_results
        }

    def _do_merge_rerank_results(
        self, ctx: QueryContext, docs_map: dict[str, list[Doc]]
    ) -> list[Doc]:
        """Reduce the per-field results to one list of docs.

        A single result set is returned as-is when there is no reranker or
        the reranker is an ``RrfReRanker``/``WeightedReRanker``; in every
        other case the reranker's ``rerank`` is applied to ``docs_map``.

        Raises:
            ValueError: If ``docs_map`` is empty, or if it holds several
                result sets and ``ctx.reranker`` is ``None``.
        """
        if not docs_map:
            raise ValueError("Query results is none and dost not to rerank")

        reranker = ctx.reranker
        if len(docs_map) == 1 and (
            not reranker or isinstance(reranker, (RrfReRanker, WeightedReRanker))
        ):
            return next(iter(docs_map.values()))

        if reranker is None:
            # Fail with a clear message instead of AttributeError on None.
            raise ValueError(
                "Reranker is required to merge results of multiple queries"
            )
        return reranker.rerank(docs_map)

    @final
    def execute(self, ctx: QueryContext, collection: _Collection) -> list[Doc]:
        """Validate, build, run and merge a query; return the final docs.

        Raises:
            ValueError: If building produces no query to run (in addition to
                anything raised by the individual steps).
        """
        # 1. validate query
        self._do_validate(ctx)
        # 2. build query vector
        query_vectors = self._do_build(ctx, collection)
        if not query_vectors:
            raise ValueError("No query to execute")
        # 3. execute query
        docs = self._do_execute(query_vectors, collection)
        # 4. merge and rerank result
        return self._do_merge_rerank_results(ctx, docs)
239
+
240
+
241
class NoVectorQueryExecutor(QueryExecutor):
    """Executor that only accepts contexts without any vector/id query."""

    def __init__(self, schema: CollectionSchema):
        super().__init__(schema)

    def _do_validate(self, ctx: QueryContext) -> None:
        """Raise ``ValueError`` when ``ctx`` carries at least one query."""
        query_count = len(ctx.queries)
        if query_count == 0:
            return
        raise ValueError("Collection does not support query with vector or id")

    def _do_build(
        self, ctx: QueryContext, _collection: _Collection
    ) -> list[_VectorQuery]:
        """Return a single vector-less query built from ``ctx``."""
        plain_query = self._do_build_query_wo_vector(ctx)
        return [plain_query]
253
+
254
+
255
class SingleVectorQueryExecutor(NoVectorQueryExecutor):
    """Executor that accepts at most one vector query per context."""

    def __init__(self, schema: CollectionSchema) -> None:
        super().__init__(schema)

    def _do_validate(self, ctx: QueryContext) -> None:
        """Reject more than one query, then validate each query given."""
        pending = ctx.queries
        if len(pending) > 1:
            raise ValueError(
                "Collection has only one vector field, cannot query with multiple vectors"
            )
        for item in pending:
            item._validate()

    def _do_build(
        self, ctx: QueryContext, collection: _Collection
    ) -> list[_VectorQuery]:
        """Build one low-level query per context query.

        With no context queries at all, a single vector-less query is
        returned instead.
        """
        pending = ctx.queries
        if len(pending) == 0:
            return [self._do_build_query_wo_vector(ctx)]
        return [
            self._do_build_query_with_vector(ctx, item, collection)
            for item in pending
        ]
276
+
277
+
278
class MultiVectorQueryExecutor(SingleVectorQueryExecutor):
    """Executor that accepts several vector queries, one per field name."""

    def __init__(self, schema: CollectionSchema) -> None:
        super().__init__(schema)

    def _do_validate(self, ctx: QueryContext) -> None:
        """Validate the queries of ``ctx``.

        Raises:
            ValueError: If more than one query is given without a reranker,
                or if two queries share the same field name.
        """
        pending = ctx.queries
        if len(pending) > 1 and ctx.reranker is None:
            raise ValueError("Reranker is required for multi-vector query")

        used_names = set()
        for item in pending:
            item._validate()
            name = item.field_name
            if name in used_names:
                raise ValueError(f"Query field name '{name}' appears more than once")
            used_names.add(name)

    def _do_execute(
        self, vectors: list[_VectorQuery], collection: _Collection
    ) -> dict[str, list[Doc]]:
        """Delegate unchanged to the inherited implementation."""
        inherited_result = super()._do_execute(vectors, collection)
        return inherited_result
297
+
298
+
299
class QueryExecutorFactory:
    """Chooses the executor class from the number of vector fields."""

    @staticmethod
    def create(schema: CollectionSchema) -> QueryExecutor:
        """Return an executor for ``schema``.

        Zero vector fields gives a ``NoVectorQueryExecutor``, exactly one a
        ``SingleVectorQueryExecutor``, anything else a
        ``MultiVectorQueryExecutor``.
        """
        vector_count = len(schema.vectors)
        if vector_count == 0:
            executor_cls = NoVectorQueryExecutor
        elif vector_count == 1:
            executor_cls = SingleVectorQueryExecutor
        else:
            executor_cls = MultiVectorQueryExecutor
        return executor_cls(schema)
@@ -0,0 +1,55 @@
1
+ # Copyright 2025-present the zvec project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from __future__ import annotations
15
+
16
+ from .bm25_embedding_function import BM25EmbeddingFunction
17
+ from .embedding_function import DenseEmbeddingFunction, SparseEmbeddingFunction
18
+ from .http_embedding_function import HTTPDenseEmbedding
19
+ from .jina_embedding_function import JinaDenseEmbedding
20
+ from .jina_function import JinaFunctionBase
21
+ from .multi_vector_reranker import RrfReRanker, WeightedReRanker
22
+ from .openai_embedding_function import OpenAIDenseEmbedding
23
+ from .openai_function import OpenAIFunctionBase
24
+ from .qwen_embedding_function import QwenDenseEmbedding, QwenSparseEmbedding
25
+ from .qwen_function import QwenFunctionBase
26
+ from .qwen_rerank_function import QwenReRanker
27
+ from .rerank_function import RerankFunction as ReRanker
28
+ from .sentence_transformer_embedding_function import (
29
+ DefaultLocalDenseEmbedding,
30
+ DefaultLocalSparseEmbedding,
31
+ )
32
+ from .sentence_transformer_function import SentenceTransformerFunctionBase
33
+ from .sentence_transformer_rerank_function import DefaultLocalReRanker
34
+
35
+ __all__ = [
36
+ "BM25EmbeddingFunction",
37
+ "DefaultLocalDenseEmbedding",
38
+ "DefaultLocalReRanker",
39
+ "DefaultLocalSparseEmbedding",
40
+ "DenseEmbeddingFunction",
41
+ "HTTPDenseEmbedding",
42
+ "JinaDenseEmbedding",
43
+ "JinaFunctionBase",
44
+ "OpenAIDenseEmbedding",
45
+ "OpenAIFunctionBase",
46
+ "QwenDenseEmbedding",
47
+ "QwenFunctionBase",
48
+ "QwenReRanker",
49
+ "QwenSparseEmbedding",
50
+ "ReRanker",
51
+ "RrfReRanker",
52
+ "SentenceTransformerFunctionBase",
53
+ "SparseEmbeddingFunction",
54
+ "WeightedReRanker",
55
+ ]