zvec 0.3.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. _zvec.cp312-win_amd64.pyd +0 -0
  2. bin/zvec_c_api.dll +0 -0
  3. include/include/zvec/ailego/buffer/buffer_manager.h +263 -0
  4. include/include/zvec/ailego/buffer/buffer_pool.h +173 -0
  5. include/include/zvec/ailego/buffer/concurrentqueue.h +4410 -0
  6. include/include/zvec/ailego/container/blob.h +131 -0
  7. include/include/zvec/ailego/container/cube.h +414 -0
  8. include/include/zvec/ailego/container/heap.h +234 -0
  9. include/include/zvec/ailego/container/hypercube.h +268 -0
  10. include/include/zvec/ailego/container/params.h +776 -0
  11. include/include/zvec/ailego/container/vector.h +1055 -0
  12. include/include/zvec/ailego/encoding/json/mod_json.h +1382 -0
  13. include/include/zvec/ailego/encoding/json/mod_json_plus.h +3446 -0
  14. include/include/zvec/ailego/encoding/json.h +17 -0
  15. include/include/zvec/ailego/hash/crc32c.h +35 -0
  16. include/include/zvec/ailego/hash/jump_hash.h +34 -0
  17. include/include/zvec/ailego/internal/platform.h +516 -0
  18. include/include/zvec/ailego/io/file.h +310 -0
  19. include/include/zvec/ailego/io/mmap_file.h +256 -0
  20. include/include/zvec/ailego/logger/logger.h +184 -0
  21. include/include/zvec/ailego/math_batch/utils.h +22 -0
  22. include/include/zvec/ailego/parallel/thread_pool.h +402 -0
  23. include/include/zvec/ailego/parallel/thread_queue.h +291 -0
  24. include/include/zvec/ailego/pattern/closure.h +530 -0
  25. include/include/zvec/ailego/pattern/expected.hpp +2605 -0
  26. include/include/zvec/ailego/pattern/factory.h +172 -0
  27. include/include/zvec/ailego/pattern/singleton.h +51 -0
  28. include/include/zvec/ailego/string/string_concat_helper.h +120 -0
  29. include/include/zvec/ailego/string/string_view.h +65 -0
  30. include/include/zvec/ailego/utility/file_helper.h +98 -0
  31. include/include/zvec/ailego/utility/float_helper.h +237 -0
  32. include/include/zvec/ailego/utility/string_helper.h +321 -0
  33. include/include/zvec/ailego/utility/string_helper_impl.h +173 -0
  34. include/include/zvec/ailego/utility/time_helper.h +206 -0
  35. include/include/zvec/ailego/utility/type_helper.h +128 -0
  36. include/include/zvec/c_api.h +3251 -0
  37. include/include/zvec/core/framework/index_builder.h +61 -0
  38. include/include/zvec/core/framework/index_bundle.h +275 -0
  39. include/include/zvec/core/framework/index_cluster.h +291 -0
  40. include/include/zvec/core/framework/index_context.h +265 -0
  41. include/include/zvec/core/framework/index_converter.h +231 -0
  42. include/include/zvec/core/framework/index_document.h +317 -0
  43. include/include/zvec/core/framework/index_dumper.h +163 -0
  44. include/include/zvec/core/framework/index_error.h +170 -0
  45. include/include/zvec/core/framework/index_factory.h +287 -0
  46. include/include/zvec/core/framework/index_features.h +604 -0
  47. include/include/zvec/core/framework/index_filter.h +74 -0
  48. include/include/zvec/core/framework/index_flow.h +647 -0
  49. include/include/zvec/core/framework/index_format.h +185 -0
  50. include/include/zvec/core/framework/index_framework.h +32 -0
  51. include/include/zvec/core/framework/index_groupby.h +54 -0
  52. include/include/zvec/core/framework/index_helper.h +60 -0
  53. include/include/zvec/core/framework/index_holder.h +1777 -0
  54. include/include/zvec/core/framework/index_logger.h +169 -0
  55. include/include/zvec/core/framework/index_mapping.h +217 -0
  56. include/include/zvec/core/framework/index_memory.h +266 -0
  57. include/include/zvec/core/framework/index_meta.h +714 -0
  58. include/include/zvec/core/framework/index_metric.h +143 -0
  59. include/include/zvec/core/framework/index_module.h +67 -0
  60. include/include/zvec/core/framework/index_packer.h +223 -0
  61. include/include/zvec/core/framework/index_plugin.h +117 -0
  62. include/include/zvec/core/framework/index_provider.h +470 -0
  63. include/include/zvec/core/framework/index_reducer.h +229 -0
  64. include/include/zvec/core/framework/index_refiner.h +92 -0
  65. include/include/zvec/core/framework/index_reformer.h +134 -0
  66. include/include/zvec/core/framework/index_runner.h +743 -0
  67. include/include/zvec/core/framework/index_searcher.h +57 -0
  68. include/include/zvec/core/framework/index_segment_storage.h +236 -0
  69. include/include/zvec/core/framework/index_stats.h +69 -0
  70. include/include/zvec/core/framework/index_storage.h +270 -0
  71. include/include/zvec/core/framework/index_streamer.h +55 -0
  72. include/include/zvec/core/framework/index_threads.h +168 -0
  73. include/include/zvec/core/framework/index_trainer.h +112 -0
  74. include/include/zvec/core/framework/index_unpacker.h +317 -0
  75. include/include/zvec/core/framework/index_version.h +31 -0
  76. include/include/zvec/core/interface/constants.h +30 -0
  77. include/include/zvec/core/interface/index.h +320 -0
  78. include/include/zvec/core/interface/index_factory.h +54 -0
  79. include/include/zvec/core/interface/index_param.h +365 -0
  80. include/include/zvec/core/interface/index_param_builders.h +410 -0
  81. include/include/zvec/db/collection.h +108 -0
  82. include/include/zvec/db/config.h +177 -0
  83. include/include/zvec/db/doc.h +407 -0
  84. include/include/zvec/db/index_params.h +431 -0
  85. include/include/zvec/db/options.h +69 -0
  86. include/include/zvec/db/query_params.h +175 -0
  87. include/include/zvec/db/schema.h +401 -0
  88. include/include/zvec/db/stats.h +35 -0
  89. include/include/zvec/db/status.h +181 -0
  90. include/include/zvec/db/type.h +142 -0
  91. include/include/zvec/turbo/turbo.h +55 -0
  92. include/zvec/c_api.h +3251 -0
  93. lib/zvec_ailego.lib +0 -0
  94. lib/zvec_c_api.lib +0 -0
  95. lib/zvec_core.lib +0 -0
  96. lib/zvec_db.lib +0 -0
  97. lib/zvec_turbo.lib +0 -0
  98. zvec/__init__.py +168 -0
  99. zvec/__init__.pyi +175 -0
  100. zvec/common/__init__.py +18 -0
  101. zvec/common/constants.py +33 -0
  102. zvec/executor/__init__.py +26 -0
  103. zvec/executor/query_executor.py +307 -0
  104. zvec/extension/__init__.py +55 -0
  105. zvec/extension/bm25_embedding_function.py +375 -0
  106. zvec/extension/embedding_function.py +147 -0
  107. zvec/extension/http_embedding_function.py +162 -0
  108. zvec/extension/jina_embedding_function.py +240 -0
  109. zvec/extension/jina_function.py +182 -0
  110. zvec/extension/multi_vector_reranker.py +174 -0
  111. zvec/extension/openai_embedding_function.py +238 -0
  112. zvec/extension/openai_function.py +149 -0
  113. zvec/extension/qwen_embedding_function.py +537 -0
  114. zvec/extension/qwen_function.py +186 -0
  115. zvec/extension/qwen_rerank_function.py +162 -0
  116. zvec/extension/rerank_function.py +69 -0
  117. zvec/extension/sentence_transformer_embedding_function.py +839 -0
  118. zvec/extension/sentence_transformer_function.py +150 -0
  119. zvec/extension/sentence_transformer_rerank_function.py +384 -0
  120. zvec/model/__init__.py +22 -0
  121. zvec/model/collection.py +421 -0
  122. zvec/model/convert.py +54 -0
  123. zvec/model/doc.py +173 -0
  124. zvec/model/param/__init__.py +46 -0
  125. zvec/model/param/__init__.pyi +823 -0
  126. zvec/model/param/vector_query.py +80 -0
  127. zvec/model/schema/__init__.py +21 -0
  128. zvec/model/schema/__init__.pyi +109 -0
  129. zvec/model/schema/collection_schema.py +215 -0
  130. zvec/model/schema/field_schema.py +303 -0
  131. zvec/py.typed +0 -0
  132. zvec/tool/__init__.py +18 -0
  133. zvec/tool/util.py +63 -0
  134. zvec/typing/__init__.py +32 -0
  135. zvec/typing/__init__.pyi +404 -0
  136. zvec/typing/enum.py +62 -0
  137. zvec/zvec.py +226 -0
  138. zvec-0.3.0.dist-info/METADATA +184 -0
  139. zvec-0.3.0.dist-info/RECORD +141 -0
  140. zvec-0.3.0.dist-info/WHEEL +5 -0
  141. zvec-0.3.0.dist-info/licenses/LICENSE +201 -0
Binary file
bin/zvec_c_api.dll ADDED
Binary file
@@ -0,0 +1,263 @@
1
+ // Copyright 2025-present the zvec project
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+
16
+ #pragma once
17
+
18
+
19
+ #include <sys/stat.h>
20
+ #include <chrono>
21
+ #include <cstdint>
22
+ #include <filesystem>
23
+ #include <memory>
24
+ #include <vector>
25
+ #include <zvec/ailego/io/file.h>
26
+ #include <zvec/ailego/pattern/singleton.h>
27
+
28
+ namespace arrow {
29
+ class ChunkedArray;
30
+ class Array;
31
+ class DataType;
32
+ class Scalar;
33
+ template <typename T>
34
+ class Result;
35
+ class Status;
36
+ class Buffer;
37
+ } // namespace arrow
38
+
39
+ namespace zvec {
40
+
41
+
42
+ namespace ailego {
43
+
44
+
45
+ struct BufferID;
46
+ class BufferManager;
47
+ class BufferHandle;
48
+
49
+
50
+ struct BufferID {  // Identifies one cacheable region of a backing file: file identity (name, id, mtime) plus a position inside the file.
51
+ struct ParquetPos {  // Position inside a parquet file: a (column, row_group) pair.
52
+ int column;
53
+ int row_group;
54
+ };
55
+ struct VectorPos {  // Position of a vector region: offset and length (units are not shown here; presumably bytes - TODO confirm).
56
+ uint32_t offset;
57
+ uint32_t length;
58
+ };
59
+ union Position {  // Holds either a ParquetPos or a VectorPos; the factories below pair `forward` with PARQUET and `vector` with VECTOR.
60
+ explicit Position() = default;
61
+ ParquetPos forward;
62
+ VectorPos vector;
63
+ };
64
+ enum TYPE {
65
+ PARQUET = 1,
66
+ VECTOR = 2,
67
+ UNKNOWN = 0,  // Value of `type` in a default-constructed BufferID (see `TYPE type{UNKNOWN}`).
68
+ };
69
+
70
+
71
+ static std::uint64_t getLastModifiedNs(const std::filesystem::path &p) {  // Returns the last-write time of `p` as the raw tick count since the file clock's epoch.
72
+ auto ftime = std::filesystem::last_write_time(p);  // Throwing overload: raises std::filesystem::filesystem_error on failure.
73
+ return static_cast<std::uint64_t>(ftime.time_since_epoch().count());  // NOTE(review): the tick period of file_time_type is implementation-defined, so this is nanoseconds only where the file clock ticks in ns - confirm per platform.
74
+ }
75
+
76
+ // Cross-platform helper to get nanosecond modification time (disabled; note that both #ifdef branches below are identical)
77
+ // static long get_st_mtime_nsec(const struct stat &file_stat) {
78
+ // #ifdef __APPLE__
79
+ // return file_stat.st_mtim.tv_nsec;
80
+ // #else
81
+ // return file_stat.st_mtim.tv_nsec;
82
+ // #endif
83
+ // }
84
+
85
+ static BufferID ParquetID(const std::string &file_name, int column,  // Builds a PARQUET id for (file_name, column, row_group).
86
+ int row_group) {
87
+ BufferID buffer_id{};
88
+ buffer_id.type = TYPE::PARQUET;
89
+ buffer_id.file_name = file_name;
90
+ buffer_id.pos.forward.column = column;
91
+ buffer_id.pos.forward.row_group = row_group;
92
+ struct stat file_stat;
93
+ if (stat(file_name.c_str(), &file_stat) == 0) {  // If stat() fails, file_id and mtime keep their zero defaults.
94
+ // file_stat.st_ino contains the inode number
95
+ // file_stat.st_dev contains the device ID
96
+ // Together they uniquely identify a file; note that only st_ino is stored below, st_dev is not used
97
+ buffer_id.file_id = file_stat.st_ino;  // NOTE(review): Microsoft's CRT documents st_ino as having no meaning on FAT/NTFS - confirm file_id is useful on Windows.
98
+ std::filesystem::path p(file_name);
99
+ buffer_id.mtime = getLastModifiedNs(p);  // NOTE(review): implicit uint64_t -> long narrowing; long is 32-bit on Windows (LLP64), so the value does not fit there.
100
+ }
101
+ return buffer_id;
102
+ }
103
+
104
+ static BufferID VectorID(const std::string &file_name, uint32_t offset,  // Builds a VECTOR id for the region (offset, length) of file_name.
105
+ uint32_t length) {
106
+ BufferID buffer_id{};
107
+ buffer_id.type = TYPE::VECTOR;
108
+ buffer_id.file_name = file_name;
109
+ struct stat file_stat;
110
+ if (stat(file_name.c_str(), &file_stat) == 0) {  // If stat() fails, file_id and mtime keep their zero defaults.
111
+ buffer_id.file_id = file_stat.st_ino;  // Same st_ino caveat as in ParquetID.
112
+ std::filesystem::path p(file_name);
113
+ buffer_id.mtime = getLastModifiedNs(p);  // Same uint64_t -> long narrowing caveat as in ParquetID.
114
+ }
115
+ buffer_id.pos.vector.offset = offset;
116
+ buffer_id.pos.vector.length = length;
117
+ return buffer_id;
118
+ }
119
+
120
+ explicit BufferID() = default;
121
+
122
+ // Type of the file backing this buffer (selects which member of `pos` the factories filled in)
123
+ TYPE type{UNKNOWN};
124
+
125
+ // Name of the file backing this buffer, exactly as passed to ParquetID()/VectorID()
126
+ std::string file_name{};
127
+
128
+ // Unique file id: st_ino from stat() taken when the id was built, or 0 if stat() failed
129
+ uint64_t file_id{};
130
+
131
+ long mtime{};  // Last-write time from getLastModifiedNs() taken when the id was built, or 0 if stat() failed.
132
+
133
+ // To identify which part of the backing file should be loaded into the buffer
134
+ Position pos{};
135
+
136
+
137
+ // Get the parquet position (pos.forward); intended for ids whose type is PARQUET
138
+ const inline struct ParquetPos &parquet() const {
139
+ return pos.forward;
140
+ }
141
+
142
+
143
+ // Get the vector position (pos.vector); intended for ids whose type is VECTOR
144
+ const inline struct VectorPos &vector() const {
145
+ return pos.vector;
146
+ }
147
+
148
+
149
+ // Get debug string of the form "Buffer[<kind-specific fields>, mtime: <mtime>]"; the vector form prints offset but not length
150
+ const std::string to_string() const {
151
+ std::string msg{"Buffer["};
152
+ if (type == TYPE::PARQUET) {
153
+ msg += "parquet: " + file_name + "[" + std::to_string(file_id) + "]" +
154
+ ", column: " + std::to_string(parquet().column) +
155
+ ", row_group: " + std::to_string(parquet().row_group);
156
+ } else if (type == TYPE::VECTOR) {
157
+ msg += "vector: " + file_name + "[" + std::to_string(file_id) + "]" +
158
+ ", offset: " + std::to_string(vector().offset);
159
+ } else {
160
+ msg += "unknown";
161
+ }
162
+ msg += ", mtime: " + std::to_string(mtime);
163
+ msg += "]";
164
+ return msg;
165
+ }
166
+ };
167
+
168
+
169
+ // Thread-safe LRU buffer implementation, exposed through Singleton<BufferManager>. (Thread-safety and LRU policy are stated by the original author; neither can be verified from this header alone.)
170
+ class BufferManager : public Singleton<BufferManager> {
171
+ friend BufferHandle;  // BufferHandle names the private BufferContext/BufferPool types declared below.
172
+
173
+ public:
174
+ void init(uint64_t limit, uint32_t num_shards = 1);  // NOTE(review): presumably `limit` is the memory cap in bytes and `num_shards` the number of pools - confirm against the implementation.
175
+
176
+ BufferHandle acquire(BufferID &buffer_id);  // Returns a handle by value for the given id; the id is taken by non-const reference.
177
+
178
+ std::unique_ptr<BufferHandle> acquire_ptr(BufferID &buffer_id);  // Same inputs as acquire(), but the handle is returned through a unique_ptr.
179
+
180
+ uint64_t total_size_in_bytes() const;
181
+
182
+ void cleanup();
183
+
184
+ ~BufferManager();
185
+
186
+ private:
187
+ struct BufferContext;  // Defined out of line; opaque to users of this header.
188
+
189
+ class BufferPool;  // Defined out of line; opaque to users of this header.
190
+
191
+ // Custom deleter for Arrow buffer that automatically notifies us when the
192
+ // buffer is no longer referenced by Arrow
193
+ struct ArrowBufferDeleter {
194
+ explicit ArrowBufferDeleter(BufferContext *c);
195
+ BufferContext *context;  // Non-owning raw pointer to the context this deleter reports to.
196
+ // Only reduces the reference count but does not actually release the
197
+ // buffer, since the buffer memory is managed by the BufferManager.
198
+ void operator()(arrow::Buffer *);
199
+ };
200
+
201
+ std::vector<BufferPool *> pools_;  // Raw pointers; who allocates and frees the pools is not visible in this header.
202
+ };
203
+
204
+
205
+ class BufferHandle {  // Move-only handle through which a caller pins and unpins the data of one buffer.
206
+ public:
207
+ typedef std::unique_ptr<BufferHandle> Pointer;
208
+
209
+ explicit BufferHandle(BufferManager::BufferContext *context = nullptr);
210
+ BufferHandle(const BufferHandle &) = delete;
211
+ BufferHandle(BufferHandle &&) = default;  // NOTE(review): a defaulted move copies the raw pointers and leaves the source non-null; confirm ~BufferHandle() is safe to run on a moved-from handle.
212
+ BufferHandle &operator=(const BufferHandle &) = delete;
213
+ BufferHandle &operator=(BufferHandle &&) = default;  // Same caveat as the move constructor; the overwritten handle's pointers are replaced without any release step here.
214
+
215
+
216
+ ~BufferHandle();
217
+
218
+
219
+ // Pin parquet data in memory by allocating arrow buffers of appropriate size
220
+ // and reading data from the backing file.
221
+ // The lifecycle of the allocated memory is automatically managed through
222
+ // shared pointers. The buffers are guaranteed to be held until they are not
223
+ // referenced.
224
+ // Returns a pointer to the loaded ChunkedArray in Arrow format.
225
+ std::shared_ptr<arrow::ChunkedArray> pin_parquet_data();
226
+
227
+
228
+ // Pin vector data in memory by allocating a buffer of appropriate size and
229
+ // loading data from the backing file.
230
+ // The memory is guaranteed to be held until unpin_vector_data() is called. The caller
231
+ // must call unpin_vector_data() to release the memory when it is no longer needed.
232
+ // Returns a raw memory address.
233
+ void *pin_vector_data();
234
+
235
+
236
+ // Reduce the reference count for this vector buffer.
237
+ // Returns true if this was the last reference.
238
+ // When reference count is zero, the buffer is moved to the eviction list and
239
+ // becomes eligible for removal under memory pressure.
240
+ bool unpin_vector_data();
241
+
242
+
243
+ // Get the current reference count.
244
+ uint32_t references() const;
245
+
246
+
247
+ // Get the buffer size (returned as a 32-bit value).
248
+ uint32_t size() const;
249
+
250
+
251
+ private:
252
+ using BufferContext = BufferManager::BufferContext;
253
+ using BufferPool = BufferManager::BufferPool;
254
+
255
+ BufferContext *context_;  // Raw pointer; no default member initializer, so it is set only by the out-of-line constructor.
256
+ BufferPool *pool_;  // Raw pointer; no default member initializer, so it is set only by the out-of-line constructor.
257
+ };
258
+
259
+
260
+ } // namespace ailego
261
+
262
+
263
+ } // namespace zvec
@@ -0,0 +1,173 @@
1
+ #pragma once
2
+
3
+ #include <sys/stat.h>
4
+ #include <fcntl.h>
5
+ #include <atomic>
6
+ #include <cassert>
7
+ #include <cstdio>
8
+ #include <cstdlib>
9
+ #include <cstring>
10
+ #include <iostream>
11
+ #include <limits>
12
+ #include <map>
13
+ #include <memory>
14
+ #include <mutex>
15
+ #include <queue>
16
+ #include <stdexcept>
17
+ #include <string>
18
+ #include <unordered_map>
19
+ #include <zvec/ailego/internal/platform.h>
20
+ #include "concurrentqueue.h"
21
+
22
+ #if defined(_MSC_VER)
23
+ #include <io.h>
24
+ #endif
25
+
26
+ namespace zvec {
27
+ namespace ailego {
28
+
29
+ using block_id_t = size_t;  // Block identifier; LPMap::isDeadBlock uses it directly as an index into the entry array.
30
+ using version_t = size_t;  // Version stamp of a block; LPMap::isDeadBlock compares it against Entry::load_count.
31
+
32
+ class LPMap;
33
+
34
+ class LRUCache {  // Eviction bookkeeping built on concurrent queues of (block id, version) pairs.
35
+ public:
36
+ typedef std::pair<block_id_t, version_t> BlockType;  // first = block id, second = version recorded when the block was queued.
37
+ typedef moodycamel::ConcurrentQueue<BlockType> ConcurrentQueue;
38
+
39
+ int init(size_t block_size);
40
+
41
+ bool evict_single_block(BlockType &item);  // `item` is an output parameter (non-const reference).
42
+
43
+ bool add_single_block(const LPMap *lp_map, const BlockType &block,
44
+ int block_type);
45
+
46
+ void clear_dead_node(const LPMap *lp_map);
47
+
48
+ private:
49
+ constexpr static size_t CATCH_QUEUE_NUM = 3;  // NOTE(review): presumably "CACHE_QUEUE_NUM", i.e. the number of queues in queues_ - confirm.
50
+ size_t block_size_{0};
51
+ std::vector<ConcurrentQueue> queues_;
52
+ alignas(64) std::atomic<size_t> evict_queue_insertions_{0};  // Aligned to 64 bytes, presumably to keep the counter on its own cache line.
53
+ };
54
+
55
+ class LPMap {  // Fixed-size table of per-block entries (reference count, load version, buffer pointer) indexed by block id.
56
+ struct Entry {
57
+ alignas(64) std::atomic<int> ref_count;  // Each atomic is 64-byte aligned, so the two counters never share a 64-byte line.
58
+ alignas(64) std::atomic<version_t> load_count;  // Compared against a queued block's version in isDeadBlock().
59
+ char *buffer;  // Raw pointer to the block's data; not freed by ~LPMap().
60
+ };
61
+
62
+ public:
63
+ LPMap() : entry_num_(0), entries_(nullptr) {}  // Repeats the values already given by the default member initializers below.
64
+ ~LPMap() {
65
+ delete[] entries_;  // Frees only the Entry array; buffers referenced by Entry::buffer are left to the owner of this map.
66
+ }
67
+
68
+ void init(size_t entry_num);  // NOTE(review): presumably allocates entries_ with new[] to match the delete[] above - confirm.
69
+
70
+ char *acquire_block(block_id_t block_id, bool lru_mode);
71
+
72
+ void release_block(block_id_t block_id);
73
+
74
+ char *evict_block(block_id_t block_id);  // Returns a buffer pointer that callers treat as possibly null (see VecBufferPool's destructor).
75
+
76
+ char *set_block_acquired(block_id_t block_id, char *buffer);
77
+
78
+ void recycle(moodycamel::ConcurrentQueue<char *> &free_buffers);
79
+
80
+ size_t entry_num() const {  // Number of entries recorded in entry_num_.
81
+ return entry_num_;
82
+ }
83
+
84
+ inline bool isDeadBlock(LRUCache::BlockType block) const {  // True when the version stored in `block` no longer matches the entry's current load_count.
85
+ Entry &entry = entries_[block.first];  // No bounds or null check: block.first must be a valid index into an initialized table.
86
+ return block.second != entry.load_count.load();
87
+ }
88
+
89
+ private:
90
+ size_t entry_num_{0};
91
+ Entry *entries_{nullptr};
92
+ LRUCache cache_;
93
+ };
94
+
95
+ class VecBufferPoolHandle;
96
+
97
+ class VecBufferPool {  // Pool of block buffers backed by one file, tracked through an LPMap and a queue of free buffers.
98
+ public:
99
+ typedef std::shared_ptr<VecBufferPool> Pointer;
100
+
101
+ VecBufferPool(const std::string &filename);  // Non-explicit: a std::string converts implicitly to a VecBufferPool.
102
+ ~VecBufferPool() {
103
+ // Free all buffers in the free list
104
+ char *buf = nullptr;
105
+ while (free_buffers_.try_dequeue(buf)) {
106
+ ailego_free(buf);
107
+ }
108
+ // Free any buffers still held in the map: each entry is evicted and a non-null result is freed
109
+ for (size_t i = 0; i < lp_map_.entry_num(); ++i) {
110
+ char *b = lp_map_.evict_block(i);
111
+ if (b) ailego_free(b);
112
+ }
113
+ #if defined(_MSC_VER)
114
+ _close(fd_);  // Closed unconditionally; fd_ has no default initializer, so this relies on the constructor having set it.
115
+ #else
116
+ close(fd_);  // NOTE(review): <unistd.h> is not included directly in this header; presumably it arrives via platform.h - confirm.
117
+ #endif
118
+ }
119
+
120
+ int init(size_t pool_capacity, size_t block_size, size_t segment_count);
121
+
122
+ VecBufferPoolHandle get_handle();
123
+
124
+ char *acquire_buffer(block_id_t block_id, size_t offset, size_t size,  // Parameter order is (block_id, offset, size), unlike VecBufferPoolHandle::get_block.
125
+ int retry = 0);
126
+
127
+ int get_meta(size_t offset, size_t length, char *buffer);
128
+
129
+ size_t file_size() const {  // Returns the stored file_size_ value.
130
+ return file_size_;
131
+ }
132
+
133
+ bool no_lru_mode() {  // Returns the stored no_lru_mode_ flag; not declared const, so it cannot be called on a const pool.
134
+ return no_lru_mode_;
135
+ }
136
+
137
+ private:
138
+ int fd_;  // File descriptor closed in the destructor; no default member initializer.
139
+ size_t file_size_;  // No default member initializer.
140
+ size_t pool_capacity_;  // No default member initializer.
141
+ bool no_lru_mode_;  // No default member initializer.
142
+
143
+ public:
144
+ LPMap lp_map_;  // Public data member: code outside the class can reach the block map directly.
145
+
146
+ private:
147
+ std::vector<std::unique_ptr<std::mutex>> mutex_vec_;
148
+ moodycamel::ConcurrentQueue<char *> free_buffers_;  // Buffers drained and freed with ailego_free in the destructor.
149
+ };
150
+
151
+ class VecBufferPoolHandle {  // Lightweight accessor that holds a reference to a VecBufferPool; it owns nothing.
152
+ public:
153
+ VecBufferPoolHandle(VecBufferPool &pool) : pool_(pool) {}  // Non-explicit: a VecBufferPool& converts implicitly to a handle.
154
+ VecBufferPoolHandle(VecBufferPoolHandle &&other) : pool_(other.pool_) {}  // Binds to the same pool as `other`; `other` is left unchanged. Declaring this suppresses the implicit copy constructor.
155
+
156
+ ~VecBufferPoolHandle() = default;
157
+
158
+ typedef std::shared_ptr<VecBufferPoolHandle> Pointer;
159
+
160
+ char *get_block(size_t offset, size_t size, size_t block_id);  // Parameter order is (offset, size, block_id), unlike VecBufferPool::acquire_buffer.
161
+
162
+ int get_meta(size_t offset, size_t length, char *buffer);
163
+
164
+ void release_one(block_id_t block_id);
165
+
166
+ void acquire_one(block_id_t block_id);
167
+
168
+ private:
169
+ VecBufferPool &pool_;  // Reference member: the pool must outlive every handle, and the handle cannot be assigned.
170
+ };
171
+
172
+ } // namespace ailego
173
+ } // namespace zvec