logosdb 0.7.7 → 0.7.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <functional>
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ namespace logosdb
9
+ {
10
+ namespace internal
11
+ {
12
+
13
+ // Write-Ahead Log for atomic Put operations.
14
+ //
15
+ // The WAL records a Put intent before any store is modified. On crash,
16
+ // incomplete entries are replayed on the next open() to ensure consistency
17
+ // across vector storage, metadata, and HNSW index.
18
+ //
19
+ // File format (append-only binary):
20
+ // [magic "LOGW" (4 bytes)]
21
+ // [version uint32 (4 bytes)]
22
+ // [sequence of WALEntry records]
23
+ //
24
+ // Each WALEntry:
25
+ // [state uint8: 0=pending, 1=committed, 2=aborted]
26
+ // [dim uint32 (4 bytes)]
27
+ // [vector_bytes uint32 (4 bytes)]
28
+ // [vector data (dim*4 bytes)]
29
+ // [text_len uint32 (4 bytes)]
30
+ // [text (text_len bytes)]
31
+ // [ts_len uint32 (4 bytes)]
32
+ // [timestamp (ts_len bytes)]
33
+ // [expected_id uint64 (8 bytes)]
34
+
35
// State tag carried by every WAL record. The underlying type is a single
// byte and the numeric values are spelled out explicitly, so they must not be
// renumbered.
enum class WALState : std::uint8_t {
    PENDING   = 0,  // Put intent has been recorded but not yet marked committed.
    COMMITTED = 1,  // Entry has been marked committed.
    ABORTED   = 2,  // NOTE(review): producer of this state is not visible in this header - confirm semantics.
};
41
+
42
+ struct WALEntry
43
+ {
44
+ WALState state = WALState::PENDING;
45
+ uint32_t dim = 0;
46
+ std::vector<float> vector;
47
+ std::string text;
48
+ std::string timestamp;
49
+ uint64_t expected_id = 0; // Expected row id (for validation)
50
+ };
51
+
52
+ class WriteAheadLog
53
+ {
54
+ public:
55
+ WriteAheadLog() = default;
56
+ ~WriteAheadLog();
57
+
58
+ WriteAheadLog(const WriteAheadLog&) = delete;
59
+ WriteAheadLog& operator=(const WriteAheadLog&) = delete;
60
+
61
+ // Open or create WAL file at the given path.
62
+ bool open(const std::string& path, std::string& err);
63
+ void close();
64
+
65
+ // Append a new pending entry. Returns the entry offset in the file
66
+ // (needed to mark committed later), or -1 on error.
67
+ int64_t append_pending(const float* vec,
68
+ int dim,
69
+ const char* text,
70
+ const char* timestamp,
71
+ uint64_t expected_id,
72
+ std::string& err);
73
+
74
+ // Mark an entry as committed by its offset.
75
+ bool mark_committed(int64_t offset, std::string& err);
76
+
77
+ // Replay all pending entries, calling the provided function for each.
78
+ // Entries are marked committed after successful replay.
79
+ // Returns number of entries replayed, or -1 on error.
80
+ int replay_pending(std::function<bool(const WALEntry&, std::string&)> replay_fn,
81
+ std::string& err);
82
+
83
+ // Sync WAL to disk.
84
+ bool sync(std::string& err);
85
+
86
+ // Get count of pending entries (for debugging/metrics).
87
+ size_t pending_count() const { return pending_count_; }
88
+
89
+ private:
90
+ bool read_entry_at(int64_t offset, WALEntry& entry, std::string& err);
91
+ bool write_state_at(int64_t offset, WALState state, std::string& err);
92
+
93
+ std::string path_;
94
+ int fd_ = -1;
95
+ size_t pending_count_ = 0;
96
+ };
97
+
98
+ } // namespace internal
99
+ } // namespace logosdb
@@ -0,0 +1,163 @@
1
+ #pragma once
2
+ #include <unordered_map>
3
+ #include <fstream>
4
+ #include <mutex>
5
+ #include <algorithm>
6
+ #include <assert.h>
7
+
8
+ namespace hnswlib {
9
+ template<typename dist_t>
10
+ class BruteforceSearch : public AlgorithmInterface<dist_t> {
11
+ public:
12
+ char *data_;
13
+ size_t maxelements_;
14
+ size_t cur_element_count;
15
+ size_t size_per_element_;
16
+
17
+ size_t data_size_;
18
+ DISTFUNC <dist_t> fstdistfunc_;
19
+ void *dist_func_param_;
20
+ std::mutex index_lock;
21
+
22
+ std::unordered_map<labeltype, size_t > dict_external_to_internal;
23
+
24
+
25
+ BruteforceSearch(SpaceInterface <dist_t> *s)
26
+ : data_(nullptr),
27
+ maxelements_(0),
28
+ cur_element_count(0),
29
+ size_per_element_(0),
30
+ data_size_(0),
31
+ dist_func_param_(nullptr) {
32
+ }
33
+
34
+
35
+ BruteforceSearch(SpaceInterface<dist_t> *s, const std::string &location)
36
+ : data_(nullptr),
37
+ maxelements_(0),
38
+ cur_element_count(0),
39
+ size_per_element_(0),
40
+ data_size_(0),
41
+ dist_func_param_(nullptr) {
42
+ loadIndex(location, s);
43
+ }
44
+
45
+
46
+ BruteforceSearch(SpaceInterface <dist_t> *s, size_t maxElements) {
47
+ maxelements_ = maxElements;
48
+ data_size_ = s->get_data_size();
49
+ fstdistfunc_ = s->get_dist_func();
50
+ dist_func_param_ = s->get_dist_func_param();
51
+ size_per_element_ = data_size_ + sizeof(labeltype);
52
+ data_ = (char *) malloc(maxElements * size_per_element_);
53
+ if (data_ == nullptr)
54
+ throw std::runtime_error("Not enough memory: BruteforceSearch failed to allocate data");
55
+ cur_element_count = 0;
56
+ }
57
+
58
+
59
+ ~BruteforceSearch() {
60
+ free(data_);
61
+ }
62
+
63
+
64
+ void addPoint(const void *datapoint, labeltype label, bool replace_deleted = false) {
65
+ int idx;
66
+ {
67
+ std::unique_lock<std::mutex> lock(index_lock);
68
+
69
+ auto search = dict_external_to_internal.find(label);
70
+ if (search != dict_external_to_internal.end()) {
71
+ idx = search->second;
72
+ } else {
73
+ if (cur_element_count >= maxelements_) {
74
+ throw std::runtime_error("The number of elements exceeds the specified limit\n");
75
+ }
76
+ idx = cur_element_count;
77
+ dict_external_to_internal[label] = idx;
78
+ cur_element_count++;
79
+ }
80
+ }
81
+ memcpy(data_ + size_per_element_ * idx + data_size_, &label, sizeof(labeltype));
82
+ memcpy(data_ + size_per_element_ * idx, datapoint, data_size_);
83
+ }
84
+
85
+
86
+ void removePoint(labeltype cur_external) {
87
+ std::unique_lock<std::mutex> lock(index_lock);
88
+
89
+ auto found = dict_external_to_internal.find(cur_external);
90
+ if (found == dict_external_to_internal.end()) {
91
+ return;
92
+ }
93
+
94
+ dict_external_to_internal.erase(found);
95
+
96
+ size_t cur_c = found->second;
97
+ labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
98
+ dict_external_to_internal[label] = cur_c;
99
+ memcpy(data_ + size_per_element_ * cur_c,
100
+ data_ + size_per_element_ * (cur_element_count-1),
101
+ data_size_+sizeof(labeltype));
102
+ cur_element_count--;
103
+ }
104
+
105
+
106
+ std::priority_queue<std::pair<dist_t, labeltype >>
107
+ searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const {
108
+ assert(k <= cur_element_count);
109
+ std::priority_queue<std::pair<dist_t, labeltype >> topResults;
110
+ dist_t lastdist = std::numeric_limits<dist_t>::max();
111
+ for (int i = 0; i < cur_element_count; i++) {
112
+ dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_);
113
+ if (dist <= lastdist || topResults.size() < k) {
114
+ labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_));
115
+ if ((!isIdAllowed) || (*isIdAllowed)(label)) {
116
+ topResults.emplace(dist, label);
117
+ if (topResults.size() > k)
118
+ topResults.pop();
119
+ if (!topResults.empty())
120
+ lastdist = topResults.top().first;
121
+ }
122
+ }
123
+ }
124
+ return topResults;
125
+ }
126
+
127
+
128
+ void saveIndex(const std::string &location) {
129
+ std::ofstream output(location, std::ios::binary);
130
+ std::streampos position;
131
+
132
+ writeBinaryPOD(output, maxelements_);
133
+ writeBinaryPOD(output, size_per_element_);
134
+ writeBinaryPOD(output, cur_element_count);
135
+
136
+ output.write(data_, maxelements_ * size_per_element_);
137
+
138
+ output.close();
139
+ }
140
+
141
+
142
+ void loadIndex(const std::string &location, SpaceInterface<dist_t> *s) {
143
+ std::ifstream input(location, std::ios::binary);
144
+ std::streampos position;
145
+
146
+ readBinaryPOD(input, maxelements_);
147
+ readBinaryPOD(input, size_per_element_);
148
+ readBinaryPOD(input, cur_element_count);
149
+
150
+ data_size_ = s->get_data_size();
151
+ fstdistfunc_ = s->get_dist_func();
152
+ dist_func_param_ = s->get_dist_func_param();
153
+ size_per_element_ = data_size_ + sizeof(labeltype);
154
+ data_ = (char *) malloc(maxelements_ * size_per_element_);
155
+ if (data_ == nullptr)
156
+ throw std::runtime_error("Not enough memory: loadIndex failed to allocate data");
157
+
158
+ input.read(data_, maxelements_ * size_per_element_);
159
+
160
+ input.close();
161
+ }
162
+ };
163
+ } // namespace hnswlib