hnswlib 0.6.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +14 -7
- data/ext/hnswlib/hnswlibext.cpp +1 -3
- data/ext/hnswlib/hnswlibext.hpp +326 -93
- data/ext/hnswlib/src/bruteforce.h +142 -131
- data/ext/hnswlib/src/hnswalg.h +1028 -964
- data/ext/hnswlib/src/hnswlib.h +74 -66
- data/ext/hnswlib/src/space_ip.h +299 -299
- data/ext/hnswlib/src/space_l2.h +268 -273
- data/ext/hnswlib/src/visited_list_pool.h +54 -55
- data/lib/hnswlib/version.rb +2 -2
- data/lib/hnswlib.rb +21 -18
- data/sig/hnswlib.rbs +9 -7
- metadata +3 -3
data/ext/hnswlib/src/hnswalg.h
CHANGED
@@ -10,1199 +10,1263 @@
|
|
10
10
|
#include <list>
|
11
11
|
|
12
12
|
namespace hnswlib {
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
13
|
+
typedef unsigned int tableint;
|
14
|
+
typedef unsigned int linklistsizeint;
|
15
|
+
|
16
|
+
template<typename dist_t>
|
17
|
+
class HierarchicalNSW : public AlgorithmInterface<dist_t> {
|
18
|
+
public:
|
19
|
+
static const tableint MAX_LABEL_OPERATION_LOCKS = 65536;
|
20
|
+
static const unsigned char DELETE_MARK = 0x01;
|
21
|
+
|
22
|
+
size_t max_elements_{0};
|
23
|
+
mutable std::atomic<size_t> cur_element_count{0}; // current number of elements
|
24
|
+
size_t size_data_per_element_{0};
|
25
|
+
size_t size_links_per_element_{0};
|
26
|
+
mutable std::atomic<size_t> num_deleted_{0}; // number of deleted elements
|
27
|
+
size_t M_{0};
|
28
|
+
size_t maxM_{0};
|
29
|
+
size_t maxM0_{0};
|
30
|
+
size_t ef_construction_{0};
|
31
|
+
size_t ef_{ 0 };
|
32
|
+
|
33
|
+
double mult_{0.0}, revSize_{0.0};
|
34
|
+
int maxlevel_{0};
|
35
|
+
|
36
|
+
VisitedListPool *visited_list_pool_{nullptr};
|
37
|
+
|
38
|
+
// Locks operations with element by label value
|
39
|
+
mutable std::vector<std::mutex> label_op_locks_;
|
40
|
+
|
41
|
+
std::mutex global;
|
42
|
+
std::vector<std::mutex> link_list_locks_;
|
43
|
+
|
44
|
+
tableint enterpoint_node_{0};
|
45
|
+
|
46
|
+
size_t size_links_level0_{0};
|
47
|
+
size_t offsetData_{0}, offsetLevel0_{0}, label_offset_{ 0 };
|
48
|
+
|
49
|
+
char *data_level0_memory_{nullptr};
|
50
|
+
char **linkLists_{nullptr};
|
51
|
+
std::vector<int> element_levels_; // keeps level of each element
|
52
|
+
|
53
|
+
size_t data_size_{0};
|
54
|
+
|
55
|
+
DISTFUNC<dist_t> fstdistfunc_;
|
56
|
+
void *dist_func_param_{nullptr};
|
57
|
+
|
58
|
+
mutable std::mutex label_lookup_lock; // lock for label_lookup_
|
59
|
+
std::unordered_map<labeltype, tableint> label_lookup_;
|
60
|
+
|
61
|
+
std::default_random_engine level_generator_;
|
62
|
+
std::default_random_engine update_probability_generator_;
|
63
|
+
|
64
|
+
mutable std::atomic<long> metric_distance_computations{0};
|
65
|
+
mutable std::atomic<long> metric_hops{0};
|
66
|
+
|
67
|
+
bool allow_replace_deleted_ = false; // flag to replace deleted elements (marked as deleted) during insertions
|
68
|
+
|
69
|
+
std::mutex deleted_elements_lock; // lock for deleted_elements
|
70
|
+
std::unordered_set<tableint> deleted_elements; // contains internal ids of deleted elements
|
71
|
+
|
72
|
+
HierarchicalNSW() { }
|
73
|
+
|
74
|
+
HierarchicalNSW(SpaceInterface<dist_t> *s) {
|
75
|
+
}
|
76
|
+
|
77
|
+
|
78
|
+
HierarchicalNSW(
|
79
|
+
SpaceInterface<dist_t> *s,
|
80
|
+
const std::string &location,
|
81
|
+
bool nmslib = false,
|
82
|
+
size_t max_elements = 0,
|
83
|
+
bool allow_replace_deleted = false)
|
84
|
+
: allow_replace_deleted_(allow_replace_deleted) {
|
85
|
+
loadIndex(location, s, max_elements);
|
86
|
+
}
|
87
|
+
|
88
|
+
|
89
|
+
HierarchicalNSW(
|
90
|
+
SpaceInterface<dist_t> *s,
|
91
|
+
size_t max_elements,
|
92
|
+
size_t M = 16,
|
93
|
+
size_t ef_construction = 200,
|
94
|
+
size_t random_seed = 100,
|
95
|
+
bool allow_replace_deleted = false)
|
96
|
+
: link_list_locks_(max_elements),
|
97
|
+
label_op_locks_(MAX_LABEL_OPERATION_LOCKS),
|
98
|
+
element_levels_(max_elements),
|
99
|
+
allow_replace_deleted_(allow_replace_deleted) {
|
100
|
+
max_elements_ = max_elements;
|
101
|
+
num_deleted_ = 0;
|
102
|
+
data_size_ = s->get_data_size();
|
103
|
+
fstdistfunc_ = s->get_dist_func();
|
104
|
+
dist_func_param_ = s->get_dist_func_param();
|
105
|
+
M_ = M;
|
106
|
+
maxM_ = M_;
|
107
|
+
maxM0_ = M_ * 2;
|
108
|
+
ef_construction_ = std::max(ef_construction, M_);
|
109
|
+
ef_ = 10;
|
110
|
+
|
111
|
+
level_generator_.seed(random_seed);
|
112
|
+
update_probability_generator_.seed(random_seed + 1);
|
113
|
+
|
114
|
+
size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
|
115
|
+
size_data_per_element_ = size_links_level0_ + data_size_ + sizeof(labeltype);
|
116
|
+
offsetData_ = size_links_level0_;
|
117
|
+
label_offset_ = size_links_level0_ + data_size_;
|
118
|
+
offsetLevel0_ = 0;
|
119
|
+
|
120
|
+
data_level0_memory_ = (char *) malloc(max_elements_ * size_data_per_element_);
|
121
|
+
if (data_level0_memory_ == nullptr)
|
122
|
+
throw std::runtime_error("Not enough memory");
|
123
|
+
|
124
|
+
cur_element_count = 0;
|
125
|
+
|
126
|
+
visited_list_pool_ = new VisitedListPool(1, max_elements);
|
23
127
|
|
24
|
-
|
25
|
-
|
26
|
-
|
128
|
+
// initializations for special treatment of the first node
|
129
|
+
enterpoint_node_ = -1;
|
130
|
+
maxlevel_ = -1;
|
131
|
+
|
132
|
+
linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
|
133
|
+
if (linkLists_ == nullptr)
|
134
|
+
throw std::runtime_error("Not enough memory: HierarchicalNSW failed to allocate linklists");
|
135
|
+
size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
|
136
|
+
mult_ = 1 / log(1.0 * M_);
|
137
|
+
revSize_ = 1.0 / mult_;
|
138
|
+
}
|
27
139
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
fstdistfunc_ = s->get_dist_func();
|
35
|
-
dist_func_param_ = s->get_dist_func_param();
|
36
|
-
M_ = M;
|
37
|
-
maxM_ = M_;
|
38
|
-
maxM0_ = M_ * 2;
|
39
|
-
ef_construction_ = std::max(ef_construction,M_);
|
40
|
-
ef_ = 10;
|
41
|
-
|
42
|
-
level_generator_.seed(random_seed);
|
43
|
-
update_probability_generator_.seed(random_seed + 1);
|
44
|
-
|
45
|
-
size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
|
46
|
-
size_data_per_element_ = size_links_level0_ + data_size_ + sizeof(labeltype);
|
47
|
-
offsetData_ = size_links_level0_;
|
48
|
-
label_offset_ = size_links_level0_ + data_size_;
|
49
|
-
offsetLevel0_ = 0;
|
50
|
-
|
51
|
-
data_level0_memory_ = (char *) malloc(max_elements_ * size_data_per_element_);
|
52
|
-
if (data_level0_memory_ == nullptr)
|
53
|
-
throw std::runtime_error("Not enough memory");
|
54
|
-
|
55
|
-
cur_element_count = 0;
|
56
|
-
|
57
|
-
visited_list_pool_ = new VisitedListPool(1, max_elements);
|
58
|
-
|
59
|
-
//initializations for special treatment of the first node
|
60
|
-
enterpoint_node_ = -1;
|
61
|
-
maxlevel_ = -1;
|
62
|
-
|
63
|
-
linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
|
64
|
-
if (linkLists_ == nullptr)
|
65
|
-
throw std::runtime_error("Not enough memory: HierarchicalNSW failed to allocate linklists");
|
66
|
-
size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
|
67
|
-
mult_ = 1 / log(1.0 * M_);
|
68
|
-
revSize_ = 1.0 / mult_;
|
140
|
+
|
141
|
+
~HierarchicalNSW() {
|
142
|
+
free(data_level0_memory_);
|
143
|
+
for (tableint i = 0; i < cur_element_count; i++) {
|
144
|
+
if (element_levels_[i] > 0)
|
145
|
+
free(linkLists_[i]);
|
69
146
|
}
|
147
|
+
free(linkLists_);
|
148
|
+
delete visited_list_pool_;
|
149
|
+
}
|
70
150
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
};
|
77
|
-
|
78
|
-
~HierarchicalNSW() {
|
79
|
-
if (data_level0_memory_) free(data_level0_memory_);
|
80
|
-
if (linkLists_) {
|
81
|
-
for (tableint i = 0; i < cur_element_count; i++) {
|
82
|
-
if (element_levels_[i] > 0)
|
83
|
-
if (linkLists_[i]) free(linkLists_[i]);
|
84
|
-
}
|
85
|
-
free(linkLists_);
|
86
|
-
}
|
87
|
-
if (visited_list_pool_) delete visited_list_pool_;
|
151
|
+
|
152
|
+
struct CompareByFirst {
|
153
|
+
constexpr bool operator()(std::pair<dist_t, tableint> const& a,
|
154
|
+
std::pair<dist_t, tableint> const& b) const noexcept {
|
155
|
+
return a.first < b.first;
|
88
156
|
}
|
157
|
+
};
|
89
158
|
|
90
|
-
size_t max_elements_;
|
91
|
-
size_t cur_element_count;
|
92
|
-
size_t size_data_per_element_;
|
93
|
-
size_t size_links_per_element_;
|
94
|
-
size_t num_deleted_;
|
95
159
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
size_t ef_construction_;
|
160
|
+
void setEf(size_t ef) {
|
161
|
+
ef_ = ef;
|
162
|
+
}
|
100
163
|
|
101
|
-
double mult_, revSize_;
|
102
|
-
int maxlevel_;
|
103
164
|
|
165
|
+
inline std::mutex& getLabelOpMutex(labeltype label) const {
|
166
|
+
// calculate hash
|
167
|
+
size_t lock_id = label & (MAX_LABEL_OPERATION_LOCKS - 1);
|
168
|
+
return label_op_locks_[lock_id];
|
169
|
+
}
|
104
170
|
|
105
|
-
VisitedListPool *visited_list_pool_;
|
106
|
-
std::mutex cur_element_count_guard_;
|
107
171
|
|
108
|
-
|
172
|
+
inline labeltype getExternalLabel(tableint internal_id) const {
|
173
|
+
labeltype return_label;
|
174
|
+
memcpy(&return_label, (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), sizeof(labeltype));
|
175
|
+
return return_label;
|
176
|
+
}
|
109
177
|
|
110
|
-
// Locks to prevent race condition during update/insert of an element at same time.
|
111
|
-
// Note: Locks for additions can also be used to prevent this race condition if the querying of KNN is not exposed along with update/inserts i.e multithread insert/update/query in parallel.
|
112
|
-
std::vector<std::mutex> link_list_update_locks_;
|
113
|
-
tableint enterpoint_node_;
|
114
178
|
|
115
|
-
|
116
|
-
|
179
|
+
inline void setExternalLabel(tableint internal_id, labeltype label) const {
|
180
|
+
memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype));
|
181
|
+
}
|
117
182
|
|
118
|
-
char *data_level0_memory_;
|
119
|
-
char **linkLists_;
|
120
|
-
std::vector<int> element_levels_;
|
121
183
|
|
122
|
-
|
184
|
+
inline labeltype *getExternalLabeLp(tableint internal_id) const {
|
185
|
+
return (labeltype *) (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_);
|
186
|
+
}
|
123
187
|
|
124
|
-
size_t label_offset_;
|
125
|
-
DISTFUNC<dist_t> fstdistfunc_;
|
126
|
-
void *dist_func_param_;
|
127
|
-
std::unordered_map<labeltype, tableint> label_lookup_;
|
128
188
|
|
129
|
-
|
130
|
-
|
189
|
+
inline char *getDataByInternalId(tableint internal_id) const {
|
190
|
+
return (data_level0_memory_ + internal_id * size_data_per_element_ + offsetData_);
|
191
|
+
}
|
131
192
|
|
132
|
-
inline labeltype getExternalLabel(tableint internal_id) const {
|
133
|
-
labeltype return_label;
|
134
|
-
memcpy(&return_label,(data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), sizeof(labeltype));
|
135
|
-
return return_label;
|
136
|
-
}
|
137
193
|
|
138
|
-
|
139
|
-
|
140
|
-
|
194
|
+
int getRandomLevel(double reverse_size) {
|
195
|
+
std::uniform_real_distribution<double> distribution(0.0, 1.0);
|
196
|
+
double r = -log(distribution(level_generator_)) * reverse_size;
|
197
|
+
return (int) r;
|
198
|
+
}
|
141
199
|
|
142
|
-
|
143
|
-
|
144
|
-
|
200
|
+
size_t getMaxElements() {
|
201
|
+
return max_elements_;
|
202
|
+
}
|
145
203
|
|
146
|
-
|
147
|
-
|
148
|
-
|
204
|
+
size_t getCurrentElementCount() {
|
205
|
+
return cur_element_count;
|
206
|
+
}
|
149
207
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
return (int) r;
|
154
|
-
}
|
208
|
+
size_t getDeletedCount() {
|
209
|
+
return num_deleted_;
|
210
|
+
}
|
155
211
|
|
212
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
|
213
|
+
searchBaseLayer(tableint ep_id, const void *data_point, int layer) {
|
214
|
+
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
|
215
|
+
vl_type *visited_array = vl->mass;
|
216
|
+
vl_type visited_array_tag = vl->curV;
|
156
217
|
|
157
|
-
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
|
158
|
-
|
159
|
-
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
|
160
|
-
vl_type *visited_array = vl->mass;
|
161
|
-
vl_type visited_array_tag = vl->curV;
|
218
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
|
219
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidateSet;
|
162
220
|
|
163
|
-
|
164
|
-
|
221
|
+
dist_t lowerBound;
|
222
|
+
if (!isMarkedDeleted(ep_id)) {
|
223
|
+
dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_);
|
224
|
+
top_candidates.emplace(dist, ep_id);
|
225
|
+
lowerBound = dist;
|
226
|
+
candidateSet.emplace(-dist, ep_id);
|
227
|
+
} else {
|
228
|
+
lowerBound = std::numeric_limits<dist_t>::max();
|
229
|
+
candidateSet.emplace(-lowerBound, ep_id);
|
230
|
+
}
|
231
|
+
visited_array[ep_id] = visited_array_tag;
|
165
232
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
lowerBound = dist;
|
171
|
-
candidateSet.emplace(-dist, ep_id);
|
172
|
-
} else {
|
173
|
-
lowerBound = std::numeric_limits<dist_t>::max();
|
174
|
-
candidateSet.emplace(-lowerBound, ep_id);
|
233
|
+
while (!candidateSet.empty()) {
|
234
|
+
std::pair<dist_t, tableint> curr_el_pair = candidateSet.top();
|
235
|
+
if ((-curr_el_pair.first) > lowerBound && top_candidates.size() == ef_construction_) {
|
236
|
+
break;
|
175
237
|
}
|
176
|
-
|
177
|
-
|
178
|
-
while (!candidateSet.empty()) {
|
179
|
-
std::pair<dist_t, tableint> curr_el_pair = candidateSet.top();
|
180
|
-
if ((-curr_el_pair.first) > lowerBound && top_candidates.size() == ef_construction_) {
|
181
|
-
break;
|
182
|
-
}
|
183
|
-
candidateSet.pop();
|
238
|
+
candidateSet.pop();
|
184
239
|
|
185
|
-
|
240
|
+
tableint curNodeNum = curr_el_pair.second;
|
186
241
|
|
187
|
-
|
242
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[curNodeNum]);
|
188
243
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
244
|
+
int *data; // = (int *)(linkList0_ + curNodeNum * size_links_per_element0_);
|
245
|
+
if (layer == 0) {
|
246
|
+
data = (int*)get_linklist0(curNodeNum);
|
247
|
+
} else {
|
248
|
+
data = (int*)get_linklist(curNodeNum, layer);
|
194
249
|
// data = (int *) (linkLists_[curNodeNum] + (layer - 1) * size_links_per_element_);
|
195
|
-
|
196
|
-
|
197
|
-
|
250
|
+
}
|
251
|
+
size_t size = getListCount((linklistsizeint*)data);
|
252
|
+
tableint *datal = (tableint *) (data + 1);
|
198
253
|
#ifdef USE_SSE
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
254
|
+
_mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0);
|
255
|
+
_mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0);
|
256
|
+
_mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0);
|
257
|
+
_mm_prefetch(getDataByInternalId(*(datal + 1)), _MM_HINT_T0);
|
203
258
|
#endif
|
204
259
|
|
205
|
-
|
206
|
-
|
260
|
+
for (size_t j = 0; j < size; j++) {
|
261
|
+
tableint candidate_id = *(datal + j);
|
207
262
|
// if (candidate_id == 0) continue;
|
208
263
|
#ifdef USE_SSE
|
209
|
-
|
210
|
-
|
264
|
+
_mm_prefetch((char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0);
|
265
|
+
_mm_prefetch(getDataByInternalId(*(datal + j + 1)), _MM_HINT_T0);
|
211
266
|
#endif
|
212
|
-
|
213
|
-
|
214
|
-
|
267
|
+
if (visited_array[candidate_id] == visited_array_tag) continue;
|
268
|
+
visited_array[candidate_id] = visited_array_tag;
|
269
|
+
char *currObj1 = (getDataByInternalId(candidate_id));
|
215
270
|
|
216
|
-
|
217
|
-
|
218
|
-
|
271
|
+
dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_);
|
272
|
+
if (top_candidates.size() < ef_construction_ || lowerBound > dist1) {
|
273
|
+
candidateSet.emplace(-dist1, candidate_id);
|
219
274
|
#ifdef USE_SSE
|
220
|
-
|
275
|
+
_mm_prefetch(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0);
|
221
276
|
#endif
|
222
277
|
|
223
|
-
|
224
|
-
|
278
|
+
if (!isMarkedDeleted(candidate_id))
|
279
|
+
top_candidates.emplace(dist1, candidate_id);
|
225
280
|
|
226
|
-
|
227
|
-
|
281
|
+
if (top_candidates.size() > ef_construction_)
|
282
|
+
top_candidates.pop();
|
228
283
|
|
229
|
-
|
230
|
-
|
231
|
-
}
|
284
|
+
if (!top_candidates.empty())
|
285
|
+
lowerBound = top_candidates.top().first;
|
232
286
|
}
|
233
287
|
}
|
234
|
-
|
235
|
-
|
236
|
-
|
288
|
+
}
|
289
|
+
visited_list_pool_->releaseVisitedList(vl);
|
290
|
+
|
291
|
+
return top_candidates;
|
292
|
+
}
|
293
|
+
|
294
|
+
|
295
|
+
template <bool has_deletions, bool collect_metrics = false>
|
296
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
|
297
|
+
searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, BaseFilterFunctor* isIdAllowed = nullptr) const {
|
298
|
+
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
|
299
|
+
vl_type *visited_array = vl->mass;
|
300
|
+
vl_type visited_array_tag = vl->curV;
|
301
|
+
|
302
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
|
303
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidate_set;
|
304
|
+
|
305
|
+
dist_t lowerBound;
|
306
|
+
if ((!has_deletions || !isMarkedDeleted(ep_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(ep_id)))) {
|
307
|
+
dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_);
|
308
|
+
lowerBound = dist;
|
309
|
+
top_candidates.emplace(dist, ep_id);
|
310
|
+
candidate_set.emplace(-dist, ep_id);
|
311
|
+
} else {
|
312
|
+
lowerBound = std::numeric_limits<dist_t>::max();
|
313
|
+
candidate_set.emplace(-lowerBound, ep_id);
|
237
314
|
}
|
238
315
|
|
239
|
-
|
240
|
-
mutable std::atomic<long> metric_hops;
|
241
|
-
|
242
|
-
template <bool has_deletions, bool collect_metrics=false>
|
243
|
-
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
|
244
|
-
searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef) const {
|
245
|
-
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
|
246
|
-
vl_type *visited_array = vl->mass;
|
247
|
-
vl_type visited_array_tag = vl->curV;
|
248
|
-
|
249
|
-
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
|
250
|
-
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidate_set;
|
251
|
-
|
252
|
-
dist_t lowerBound;
|
253
|
-
if (!has_deletions || !isMarkedDeleted(ep_id)) {
|
254
|
-
dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_);
|
255
|
-
lowerBound = dist;
|
256
|
-
top_candidates.emplace(dist, ep_id);
|
257
|
-
candidate_set.emplace(-dist, ep_id);
|
258
|
-
} else {
|
259
|
-
lowerBound = std::numeric_limits<dist_t>::max();
|
260
|
-
candidate_set.emplace(-lowerBound, ep_id);
|
261
|
-
}
|
262
|
-
|
263
|
-
visited_array[ep_id] = visited_array_tag;
|
264
|
-
|
265
|
-
while (!candidate_set.empty()) {
|
316
|
+
visited_array[ep_id] = visited_array_tag;
|
266
317
|
|
267
|
-
|
318
|
+
while (!candidate_set.empty()) {
|
319
|
+
std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
|
268
320
|
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
321
|
+
if ((-current_node_pair.first) > lowerBound &&
|
322
|
+
(top_candidates.size() == ef || (!isIdAllowed && !has_deletions))) {
|
323
|
+
break;
|
324
|
+
}
|
325
|
+
candidate_set.pop();
|
273
326
|
|
274
|
-
|
275
|
-
|
276
|
-
|
327
|
+
tableint current_node_id = current_node_pair.second;
|
328
|
+
int *data = (int *) get_linklist0(current_node_id);
|
329
|
+
size_t size = getListCount((linklistsizeint*)data);
|
277
330
|
// bool cur_node_deleted = isMarkedDeleted(current_node_id);
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
331
|
+
if (collect_metrics) {
|
332
|
+
metric_hops++;
|
333
|
+
metric_distance_computations+=size;
|
334
|
+
}
|
282
335
|
|
283
336
|
#ifdef USE_SSE
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
337
|
+
_mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0);
|
338
|
+
_mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0);
|
339
|
+
_mm_prefetch(data_level0_memory_ + (*(data + 1)) * size_data_per_element_ + offsetData_, _MM_HINT_T0);
|
340
|
+
_mm_prefetch((char *) (data + 2), _MM_HINT_T0);
|
288
341
|
#endif
|
289
342
|
|
290
|
-
|
291
|
-
|
343
|
+
for (size_t j = 1; j <= size; j++) {
|
344
|
+
int candidate_id = *(data + j);
|
292
345
|
// if (candidate_id == 0) continue;
|
293
346
|
#ifdef USE_SSE
|
294
|
-
|
295
|
-
|
296
|
-
|
347
|
+
_mm_prefetch((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0);
|
348
|
+
_mm_prefetch(data_level0_memory_ + (*(data + j + 1)) * size_data_per_element_ + offsetData_,
|
349
|
+
_MM_HINT_T0); ////////////
|
297
350
|
#endif
|
298
|
-
|
299
|
-
|
300
|
-
visited_array[candidate_id] = visited_array_tag;
|
351
|
+
if (!(visited_array[candidate_id] == visited_array_tag)) {
|
352
|
+
visited_array[candidate_id] = visited_array_tag;
|
301
353
|
|
302
|
-
|
303
|
-
|
354
|
+
char *currObj1 = (getDataByInternalId(candidate_id));
|
355
|
+
dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_);
|
304
356
|
|
305
|
-
|
306
|
-
|
357
|
+
if (top_candidates.size() < ef || lowerBound > dist) {
|
358
|
+
candidate_set.emplace(-dist, candidate_id);
|
307
359
|
#ifdef USE_SSE
|
308
|
-
|
309
|
-
|
310
|
-
|
360
|
+
_mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ +
|
361
|
+
offsetLevel0_, ///////////
|
362
|
+
_MM_HINT_T0); ////////////////////////
|
311
363
|
#endif
|
312
364
|
|
313
|
-
|
314
|
-
|
365
|
+
if ((!has_deletions || !isMarkedDeleted(candidate_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id))))
|
366
|
+
top_candidates.emplace(dist, candidate_id);
|
315
367
|
|
316
|
-
|
317
|
-
|
368
|
+
if (top_candidates.size() > ef)
|
369
|
+
top_candidates.pop();
|
318
370
|
|
319
|
-
|
320
|
-
|
321
|
-
}
|
371
|
+
if (!top_candidates.empty())
|
372
|
+
lowerBound = top_candidates.top().first;
|
322
373
|
}
|
323
374
|
}
|
324
375
|
}
|
325
|
-
|
326
|
-
visited_list_pool_->releaseVisitedList(vl);
|
327
|
-
return top_candidates;
|
328
376
|
}
|
329
377
|
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
if (top_candidates.size() < M) {
|
334
|
-
return;
|
335
|
-
}
|
378
|
+
visited_list_pool_->releaseVisitedList(vl);
|
379
|
+
return top_candidates;
|
380
|
+
}
|
336
381
|
|
337
|
-
std::priority_queue<std::pair<dist_t, tableint>> queue_closest;
|
338
|
-
std::vector<std::pair<dist_t, tableint>> return_list;
|
339
|
-
while (top_candidates.size() > 0) {
|
340
|
-
queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second);
|
341
|
-
top_candidates.pop();
|
342
|
-
}
|
343
382
|
|
344
|
-
|
345
|
-
|
383
|
+
void getNeighborsByHeuristic2(
|
384
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
|
385
|
+
const size_t M) {
|
386
|
+
if (top_candidates.size() < M) {
|
387
|
+
return;
|
388
|
+
}
|
389
|
+
|
390
|
+
std::priority_queue<std::pair<dist_t, tableint>> queue_closest;
|
391
|
+
std::vector<std::pair<dist_t, tableint>> return_list;
|
392
|
+
while (top_candidates.size() > 0) {
|
393
|
+
queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second);
|
394
|
+
top_candidates.pop();
|
395
|
+
}
|
396
|
+
|
397
|
+
while (queue_closest.size()) {
|
398
|
+
if (return_list.size() >= M)
|
399
|
+
break;
|
400
|
+
std::pair<dist_t, tableint> curent_pair = queue_closest.top();
|
401
|
+
dist_t dist_to_query = -curent_pair.first;
|
402
|
+
queue_closest.pop();
|
403
|
+
bool good = true;
|
404
|
+
|
405
|
+
for (std::pair<dist_t, tableint> second_pair : return_list) {
|
406
|
+
dist_t curdist =
|
407
|
+
fstdistfunc_(getDataByInternalId(second_pair.second),
|
408
|
+
getDataByInternalId(curent_pair.second),
|
409
|
+
dist_func_param_);
|
410
|
+
if (curdist < dist_to_query) {
|
411
|
+
good = false;
|
346
412
|
break;
|
347
|
-
std::pair<dist_t, tableint> curent_pair = queue_closest.top();
|
348
|
-
dist_t dist_to_query = -curent_pair.first;
|
349
|
-
queue_closest.pop();
|
350
|
-
bool good = true;
|
351
|
-
|
352
|
-
for (std::pair<dist_t, tableint> second_pair : return_list) {
|
353
|
-
dist_t curdist =
|
354
|
-
fstdistfunc_(getDataByInternalId(second_pair.second),
|
355
|
-
getDataByInternalId(curent_pair.second),
|
356
|
-
dist_func_param_);;
|
357
|
-
if (curdist < dist_to_query) {
|
358
|
-
good = false;
|
359
|
-
break;
|
360
|
-
}
|
361
|
-
}
|
362
|
-
if (good) {
|
363
|
-
return_list.push_back(curent_pair);
|
364
413
|
}
|
365
414
|
}
|
366
|
-
|
367
|
-
|
368
|
-
top_candidates.emplace(-curent_pair.first, curent_pair.second);
|
415
|
+
if (good) {
|
416
|
+
return_list.push_back(curent_pair);
|
369
417
|
}
|
370
418
|
}
|
371
419
|
|
420
|
+
for (std::pair<dist_t, tableint> curent_pair : return_list) {
|
421
|
+
top_candidates.emplace(-curent_pair.first, curent_pair.second);
|
422
|
+
}
|
423
|
+
}
|
372
424
|
|
373
|
-
linklistsizeint *get_linklist0(tableint internal_id) const {
|
374
|
-
return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_);
|
375
|
-
};
|
376
425
|
|
377
|
-
|
378
|
-
|
379
|
-
|
426
|
+
linklistsizeint *get_linklist0(tableint internal_id) const {
|
427
|
+
return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_);
|
428
|
+
}
|
380
429
|
|
381
|
-
linklistsizeint *get_linklist(tableint internal_id, int level) const {
|
382
|
-
return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_);
|
383
|
-
};
|
384
430
|
|
385
|
-
|
386
|
-
|
387
|
-
|
431
|
+
linklistsizeint *get_linklist0(tableint internal_id, char *data_level0_memory_) const {
|
432
|
+
return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_);
|
433
|
+
}
|
388
434
|
|
389
|
-
tableint mutuallyConnectNewElement(const void *data_point, tableint cur_c,
|
390
|
-
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
|
391
|
-
int level, bool isUpdate) {
|
392
|
-
size_t Mcurmax = level ? maxM_ : maxM0_;
|
393
|
-
getNeighborsByHeuristic2(top_candidates, M_);
|
394
|
-
if (top_candidates.size() > M_)
|
395
|
-
throw std::runtime_error("Should be not be more than M_ candidates returned by the heuristic");
|
396
435
|
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
selectedNeighbors.push_back(top_candidates.top().second);
|
401
|
-
top_candidates.pop();
|
402
|
-
}
|
436
|
+
linklistsizeint *get_linklist(tableint internal_id, int level) const {
|
437
|
+
return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_);
|
438
|
+
}
|
403
439
|
|
404
|
-
tableint next_closest_entry_point = selectedNeighbors.back();
|
405
440
|
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
ll_cur = get_linklist0(cur_c);
|
410
|
-
else
|
411
|
-
ll_cur = get_linklist(cur_c, level);
|
441
|
+
linklistsizeint *get_linklist_at_level(tableint internal_id, int level) const {
|
442
|
+
return level == 0 ? get_linklist0(internal_id) : get_linklist(internal_id, level);
|
443
|
+
}
|
412
444
|
|
413
|
-
if (*ll_cur && !isUpdate) {
|
414
|
-
throw std::runtime_error("The newly inserted element should have blank link list");
|
415
|
-
}
|
416
|
-
setListCount(ll_cur,selectedNeighbors.size());
|
417
|
-
tableint *data = (tableint *) (ll_cur + 1);
|
418
|
-
for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
|
419
|
-
if (data[idx] && !isUpdate)
|
420
|
-
throw std::runtime_error("Possible memory corruption");
|
421
|
-
if (level > element_levels_[selectedNeighbors[idx]])
|
422
|
-
throw std::runtime_error("Trying to make a link on a non-existent level");
|
423
445
|
|
424
|
-
|
446
|
+
tableint mutuallyConnectNewElement(
|
447
|
+
const void *data_point,
|
448
|
+
tableint cur_c,
|
449
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
|
450
|
+
int level,
|
451
|
+
bool isUpdate) {
|
452
|
+
size_t Mcurmax = level ? maxM_ : maxM0_;
|
453
|
+
getNeighborsByHeuristic2(top_candidates, M_);
|
454
|
+
if (top_candidates.size() > M_)
|
455
|
+
throw std::runtime_error("Should be not be more than M_ candidates returned by the heuristic");
|
425
456
|
|
426
|
-
|
457
|
+
std::vector<tableint> selectedNeighbors;
|
458
|
+
selectedNeighbors.reserve(M_);
|
459
|
+
while (top_candidates.size() > 0) {
|
460
|
+
selectedNeighbors.push_back(top_candidates.top().second);
|
461
|
+
top_candidates.pop();
|
462
|
+
}
|
463
|
+
|
464
|
+
tableint next_closest_entry_point = selectedNeighbors.back();
|
465
|
+
|
466
|
+
{
|
467
|
+
// lock only during the update
|
468
|
+
// because during the addition the lock for cur_c is already acquired
|
469
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[cur_c], std::defer_lock);
|
470
|
+
if (isUpdate) {
|
471
|
+
lock.lock();
|
427
472
|
}
|
473
|
+
linklistsizeint *ll_cur;
|
474
|
+
if (level == 0)
|
475
|
+
ll_cur = get_linklist0(cur_c);
|
476
|
+
else
|
477
|
+
ll_cur = get_linklist(cur_c, level);
|
428
478
|
|
479
|
+
if (*ll_cur && !isUpdate) {
|
480
|
+
throw std::runtime_error("The newly inserted element should have blank link list");
|
481
|
+
}
|
482
|
+
setListCount(ll_cur, selectedNeighbors.size());
|
483
|
+
tableint *data = (tableint *) (ll_cur + 1);
|
429
484
|
for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
|
485
|
+
if (data[idx] && !isUpdate)
|
486
|
+
throw std::runtime_error("Possible memory corruption");
|
487
|
+
if (level > element_levels_[selectedNeighbors[idx]])
|
488
|
+
throw std::runtime_error("Trying to make a link on a non-existent level");
|
430
489
|
|
431
|
-
|
490
|
+
data[idx] = selectedNeighbors[idx];
|
491
|
+
}
|
492
|
+
}
|
432
493
|
|
433
|
-
|
434
|
-
|
435
|
-
ll_other = get_linklist0(selectedNeighbors[idx]);
|
436
|
-
else
|
437
|
-
ll_other = get_linklist(selectedNeighbors[idx], level);
|
494
|
+
for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
|
495
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[selectedNeighbors[idx]]);
|
438
496
|
|
439
|
-
|
497
|
+
linklistsizeint *ll_other;
|
498
|
+
if (level == 0)
|
499
|
+
ll_other = get_linklist0(selectedNeighbors[idx]);
|
500
|
+
else
|
501
|
+
ll_other = get_linklist(selectedNeighbors[idx], level);
|
440
502
|
|
441
|
-
|
442
|
-
throw std::runtime_error("Bad value of sz_link_list_other");
|
443
|
-
if (selectedNeighbors[idx] == cur_c)
|
444
|
-
throw std::runtime_error("Trying to connect an element to itself");
|
445
|
-
if (level > element_levels_[selectedNeighbors[idx]])
|
446
|
-
throw std::runtime_error("Trying to make a link on a non-existent level");
|
503
|
+
size_t sz_link_list_other = getListCount(ll_other);
|
447
504
|
|
448
|
-
|
505
|
+
if (sz_link_list_other > Mcurmax)
|
506
|
+
throw std::runtime_error("Bad value of sz_link_list_other");
|
507
|
+
if (selectedNeighbors[idx] == cur_c)
|
508
|
+
throw std::runtime_error("Trying to connect an element to itself");
|
509
|
+
if (level > element_levels_[selectedNeighbors[idx]])
|
510
|
+
throw std::runtime_error("Trying to make a link on a non-existent level");
|
449
511
|
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
512
|
+
tableint *data = (tableint *) (ll_other + 1);
|
513
|
+
|
514
|
+
bool is_cur_c_present = false;
|
515
|
+
if (isUpdate) {
|
516
|
+
for (size_t j = 0; j < sz_link_list_other; j++) {
|
517
|
+
if (data[j] == cur_c) {
|
518
|
+
is_cur_c_present = true;
|
519
|
+
break;
|
457
520
|
}
|
458
521
|
}
|
522
|
+
}
|
459
523
|
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
for (size_t j = 0; j < sz_link_list_other; j++) {
|
474
|
-
candidates.emplace(
|
475
|
-
fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(selectedNeighbors[idx]),
|
476
|
-
dist_func_param_), data[j]);
|
477
|
-
}
|
524
|
+
// If cur_c is already present in the neighboring connections of `selectedNeighbors[idx]` then no need to modify any connections or run the heuristics.
|
525
|
+
if (!is_cur_c_present) {
|
526
|
+
if (sz_link_list_other < Mcurmax) {
|
527
|
+
data[sz_link_list_other] = cur_c;
|
528
|
+
setListCount(ll_other, sz_link_list_other + 1);
|
529
|
+
} else {
|
530
|
+
// finding the "weakest" element to replace it with the new one
|
531
|
+
dist_t d_max = fstdistfunc_(getDataByInternalId(cur_c), getDataByInternalId(selectedNeighbors[idx]),
|
532
|
+
dist_func_param_);
|
533
|
+
// Heuristic:
|
534
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidates;
|
535
|
+
candidates.emplace(d_max, cur_c);
|
478
536
|
|
479
|
-
|
537
|
+
for (size_t j = 0; j < sz_link_list_other; j++) {
|
538
|
+
candidates.emplace(
|
539
|
+
fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(selectedNeighbors[idx]),
|
540
|
+
dist_func_param_), data[j]);
|
541
|
+
}
|
480
542
|
|
481
|
-
|
482
|
-
while (candidates.size() > 0) {
|
483
|
-
data[indx] = candidates.top().second;
|
484
|
-
candidates.pop();
|
485
|
-
indx++;
|
486
|
-
}
|
543
|
+
getNeighborsByHeuristic2(candidates, Mcurmax);
|
487
544
|
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
545
|
+
int indx = 0;
|
546
|
+
while (candidates.size() > 0) {
|
547
|
+
data[indx] = candidates.top().second;
|
548
|
+
candidates.pop();
|
549
|
+
indx++;
|
550
|
+
}
|
551
|
+
|
552
|
+
setListCount(ll_other, indx);
|
553
|
+
// Nearest K:
|
554
|
+
/*int indx = -1;
|
555
|
+
for (int j = 0; j < sz_link_list_other; j++) {
|
556
|
+
dist_t d = fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(rez[idx]), dist_func_param_);
|
557
|
+
if (d > d_max) {
|
558
|
+
indx = j;
|
559
|
+
d_max = d;
|
497
560
|
}
|
498
|
-
if (indx >= 0) {
|
499
|
-
data[indx] = cur_c;
|
500
|
-
} */
|
501
561
|
}
|
562
|
+
if (indx >= 0) {
|
563
|
+
data[indx] = cur_c;
|
564
|
+
} */
|
502
565
|
}
|
503
566
|
}
|
504
|
-
|
505
|
-
return next_closest_entry_point;
|
506
567
|
}
|
507
568
|
|
508
|
-
|
509
|
-
|
569
|
+
return next_closest_entry_point;
|
570
|
+
}
|
510
571
|
|
511
|
-
void setEf(size_t ef) {
|
512
|
-
ef_ = ef;
|
513
|
-
}
|
514
572
|
|
573
|
+
void resizeIndex(size_t new_max_elements) {
|
574
|
+
if (new_max_elements < cur_element_count)
|
575
|
+
throw std::runtime_error("Cannot resize, max element is less than the current number of elements");
|
515
576
|
|
516
|
-
|
517
|
-
|
518
|
-
if (cur_element_count == 0) return top_candidates;
|
519
|
-
tableint currObj = enterpoint_node_;
|
520
|
-
dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
|
577
|
+
delete visited_list_pool_;
|
578
|
+
visited_list_pool_ = new VisitedListPool(1, new_max_elements);
|
521
579
|
|
522
|
-
|
523
|
-
bool changed = true;
|
524
|
-
while (changed) {
|
525
|
-
changed = false;
|
526
|
-
int *data;
|
527
|
-
data = (int *) get_linklist(currObj,level);
|
528
|
-
int size = getListCount(data);
|
529
|
-
tableint *datal = (tableint *) (data + 1);
|
530
|
-
for (int i = 0; i < size; i++) {
|
531
|
-
tableint cand = datal[i];
|
532
|
-
if (cand < 0 || cand > max_elements_)
|
533
|
-
throw std::runtime_error("cand error");
|
534
|
-
dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
|
580
|
+
element_levels_.resize(new_max_elements);
|
535
581
|
|
536
|
-
|
537
|
-
curdist = d;
|
538
|
-
currObj = cand;
|
539
|
-
changed = true;
|
540
|
-
}
|
541
|
-
}
|
542
|
-
}
|
543
|
-
}
|
582
|
+
std::vector<std::mutex>(new_max_elements).swap(link_list_locks_);
|
544
583
|
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
else{
|
551
|
-
std::priority_queue<std::pair<dist_t, tableint >> top_candidates1=searchBaseLayerST<false>(currObj, query_data,
|
552
|
-
ef_);
|
553
|
-
top_candidates.swap(top_candidates1);
|
554
|
-
}
|
555
|
-
|
556
|
-
while (top_candidates.size() > k) {
|
557
|
-
top_candidates.pop();
|
558
|
-
}
|
559
|
-
return top_candidates;
|
560
|
-
};
|
584
|
+
// Reallocate base layer
|
585
|
+
char * data_level0_memory_new = (char *) realloc(data_level0_memory_, new_max_elements * size_data_per_element_);
|
586
|
+
if (data_level0_memory_new == nullptr)
|
587
|
+
throw std::runtime_error("Not enough memory: resizeIndex failed to allocate base layer");
|
588
|
+
data_level0_memory_ = data_level0_memory_new;
|
561
589
|
|
562
|
-
|
563
|
-
|
564
|
-
|
590
|
+
// Reallocate all other layers
|
591
|
+
char ** linkLists_new = (char **) realloc(linkLists_, sizeof(void *) * new_max_elements);
|
592
|
+
if (linkLists_new == nullptr)
|
593
|
+
throw std::runtime_error("Not enough memory: resizeIndex failed to allocate other layers");
|
594
|
+
linkLists_ = linkLists_new;
|
565
595
|
|
596
|
+
max_elements_ = new_max_elements;
|
597
|
+
}
|
566
598
|
|
567
|
-
delete visited_list_pool_;
|
568
|
-
visited_list_pool_ = new VisitedListPool(1, new_max_elements);
|
569
599
|
|
600
|
+
void saveIndex(const std::string &location) {
|
601
|
+
std::ofstream output(location, std::ios::binary);
|
602
|
+
std::streampos position;
|
570
603
|
|
571
|
-
|
604
|
+
writeBinaryPOD(output, offsetLevel0_);
|
605
|
+
writeBinaryPOD(output, max_elements_);
|
606
|
+
writeBinaryPOD(output, cur_element_count);
|
607
|
+
writeBinaryPOD(output, size_data_per_element_);
|
608
|
+
writeBinaryPOD(output, label_offset_);
|
609
|
+
writeBinaryPOD(output, offsetData_);
|
610
|
+
writeBinaryPOD(output, maxlevel_);
|
611
|
+
writeBinaryPOD(output, enterpoint_node_);
|
612
|
+
writeBinaryPOD(output, maxM_);
|
572
613
|
|
573
|
-
|
614
|
+
writeBinaryPOD(output, maxM0_);
|
615
|
+
writeBinaryPOD(output, M_);
|
616
|
+
writeBinaryPOD(output, mult_);
|
617
|
+
writeBinaryPOD(output, ef_construction_);
|
574
618
|
|
575
|
-
|
576
|
-
char * data_level0_memory_new = (char *) realloc(data_level0_memory_, new_max_elements * size_data_per_element_);
|
577
|
-
if (data_level0_memory_new == nullptr)
|
578
|
-
throw std::runtime_error("Not enough memory: resizeIndex failed to allocate base layer");
|
579
|
-
data_level0_memory_ = data_level0_memory_new;
|
619
|
+
output.write(data_level0_memory_, cur_element_count * size_data_per_element_);
|
580
620
|
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
max_elements_ = new_max_elements;
|
621
|
+
for (size_t i = 0; i < cur_element_count; i++) {
|
622
|
+
unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0;
|
623
|
+
writeBinaryPOD(output, linkListSize);
|
624
|
+
if (linkListSize)
|
625
|
+
output.write(linkLists_[i], linkListSize);
|
588
626
|
}
|
627
|
+
output.close();
|
628
|
+
}
|
629
|
+
|
630
|
+
|
631
|
+
void loadIndex(const std::string &location, SpaceInterface<dist_t> *s, size_t max_elements_i = 0) {
|
632
|
+
std::ifstream input(location, std::ios::binary);
|
633
|
+
|
634
|
+
if (!input.is_open())
|
635
|
+
throw std::runtime_error("Cannot open file");
|
636
|
+
|
637
|
+
// get file size:
|
638
|
+
input.seekg(0, input.end);
|
639
|
+
std::streampos total_filesize = input.tellg();
|
640
|
+
input.seekg(0, input.beg);
|
641
|
+
|
642
|
+
readBinaryPOD(input, offsetLevel0_);
|
643
|
+
readBinaryPOD(input, max_elements_);
|
644
|
+
readBinaryPOD(input, cur_element_count);
|
645
|
+
|
646
|
+
size_t max_elements = max_elements_i;
|
647
|
+
if (max_elements < cur_element_count)
|
648
|
+
max_elements = max_elements_;
|
649
|
+
max_elements_ = max_elements;
|
650
|
+
readBinaryPOD(input, size_data_per_element_);
|
651
|
+
readBinaryPOD(input, label_offset_);
|
652
|
+
readBinaryPOD(input, offsetData_);
|
653
|
+
readBinaryPOD(input, maxlevel_);
|
654
|
+
readBinaryPOD(input, enterpoint_node_);
|
655
|
+
|
656
|
+
readBinaryPOD(input, maxM_);
|
657
|
+
readBinaryPOD(input, maxM0_);
|
658
|
+
readBinaryPOD(input, M_);
|
659
|
+
readBinaryPOD(input, mult_);
|
660
|
+
readBinaryPOD(input, ef_construction_);
|
661
|
+
|
662
|
+
data_size_ = s->get_data_size();
|
663
|
+
fstdistfunc_ = s->get_dist_func();
|
664
|
+
dist_func_param_ = s->get_dist_func_param();
|
665
|
+
|
666
|
+
auto pos = input.tellg();
|
667
|
+
|
668
|
+
/// Optional - check if index is ok:
|
669
|
+
input.seekg(cur_element_count * size_data_per_element_, input.cur);
|
670
|
+
for (size_t i = 0; i < cur_element_count; i++) {
|
671
|
+
if (input.tellg() < 0 || input.tellg() >= total_filesize) {
|
672
|
+
throw std::runtime_error("Index seems to be corrupted or unsupported");
|
673
|
+
}
|
589
674
|
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
writeBinaryPOD(output, offsetLevel0_);
|
595
|
-
writeBinaryPOD(output, max_elements_);
|
596
|
-
writeBinaryPOD(output, cur_element_count);
|
597
|
-
writeBinaryPOD(output, size_data_per_element_);
|
598
|
-
writeBinaryPOD(output, label_offset_);
|
599
|
-
writeBinaryPOD(output, offsetData_);
|
600
|
-
writeBinaryPOD(output, maxlevel_);
|
601
|
-
writeBinaryPOD(output, enterpoint_node_);
|
602
|
-
writeBinaryPOD(output, maxM_);
|
603
|
-
|
604
|
-
writeBinaryPOD(output, maxM0_);
|
605
|
-
writeBinaryPOD(output, M_);
|
606
|
-
writeBinaryPOD(output, mult_);
|
607
|
-
writeBinaryPOD(output, ef_construction_);
|
608
|
-
|
609
|
-
output.write(data_level0_memory_, cur_element_count * size_data_per_element_);
|
610
|
-
|
611
|
-
for (size_t i = 0; i < cur_element_count; i++) {
|
612
|
-
unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0;
|
613
|
-
writeBinaryPOD(output, linkListSize);
|
614
|
-
if (linkListSize)
|
615
|
-
output.write(linkLists_[i], linkListSize);
|
675
|
+
unsigned int linkListSize;
|
676
|
+
readBinaryPOD(input, linkListSize);
|
677
|
+
if (linkListSize != 0) {
|
678
|
+
input.seekg(linkListSize, input.cur);
|
616
679
|
}
|
617
|
-
output.close();
|
618
680
|
}
|
619
681
|
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
682
|
+
// throw exception if it either corrupted or old index
|
683
|
+
if (input.tellg() != total_filesize)
|
684
|
+
throw std::runtime_error("Index seems to be corrupted or unsupported");
|
685
|
+
|
686
|
+
input.clear();
|
687
|
+
/// Optional check end
|
688
|
+
|
689
|
+
input.seekg(pos, input.beg);
|
690
|
+
|
691
|
+
data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_);
|
692
|
+
if (data_level0_memory_ == nullptr)
|
693
|
+
throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0");
|
694
|
+
input.read(data_level0_memory_, cur_element_count * size_data_per_element_);
|
695
|
+
|
696
|
+
size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
|
697
|
+
|
698
|
+
size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
|
699
|
+
std::vector<std::mutex>(max_elements).swap(link_list_locks_);
|
700
|
+
std::vector<std::mutex>(MAX_LABEL_OPERATION_LOCKS).swap(label_op_locks_);
|
701
|
+
|
702
|
+
visited_list_pool_ = new VisitedListPool(1, max_elements);
|
703
|
+
|
704
|
+
linkLists_ = (char **) malloc(sizeof(void *) * max_elements);
|
705
|
+
if (linkLists_ == nullptr)
|
706
|
+
throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists");
|
707
|
+
element_levels_ = std::vector<int>(max_elements);
|
708
|
+
revSize_ = 1.0 / mult_;
|
709
|
+
ef_ = 10;
|
710
|
+
for (size_t i = 0; i < cur_element_count; i++) {
|
711
|
+
label_lookup_[getExternalLabel(i)] = i;
|
712
|
+
unsigned int linkListSize;
|
713
|
+
readBinaryPOD(input, linkListSize);
|
714
|
+
if (linkListSize == 0) {
|
715
|
+
element_levels_[i] = 0;
|
716
|
+
linkLists_[i] = nullptr;
|
717
|
+
} else {
|
718
|
+
element_levels_[i] = linkListSize / size_links_per_element_;
|
719
|
+
linkLists_[i] = (char *) malloc(linkListSize);
|
720
|
+
if (linkLists_[i] == nullptr)
|
721
|
+
throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist");
|
722
|
+
input.read(linkLists_[i], linkListSize);
|
723
|
+
}
|
724
|
+
}
|
634
725
|
|
635
|
-
|
636
|
-
if(
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
readBinaryPOD(input, offsetData_);
|
642
|
-
readBinaryPOD(input, maxlevel_);
|
643
|
-
readBinaryPOD(input, enterpoint_node_);
|
726
|
+
for (size_t i = 0; i < cur_element_count; i++) {
|
727
|
+
if (isMarkedDeleted(i)) {
|
728
|
+
num_deleted_ += 1;
|
729
|
+
if (allow_replace_deleted_) deleted_elements.insert(i);
|
730
|
+
}
|
731
|
+
}
|
644
732
|
|
645
|
-
|
646
|
-
readBinaryPOD(input, maxM0_);
|
647
|
-
readBinaryPOD(input, M_);
|
648
|
-
readBinaryPOD(input, mult_);
|
649
|
-
readBinaryPOD(input, ef_construction_);
|
733
|
+
input.close();
|
650
734
|
|
735
|
+
return;
|
736
|
+
}
|
651
737
|
|
652
|
-
data_size_ = s->get_data_size();
|
653
|
-
fstdistfunc_ = s->get_dist_func();
|
654
|
-
dist_func_param_ = s->get_dist_func_param();
|
655
738
|
|
656
|
-
|
739
|
+
template<typename data_t>
|
740
|
+
std::vector<data_t> getDataByLabel(labeltype label) const {
|
741
|
+
// lock all operations with element by label
|
742
|
+
std::unique_lock <std::mutex> lock_label(getLabelOpMutex(label));
|
657
743
|
|
744
|
+
std::unique_lock <std::mutex> lock_table(label_lookup_lock);
|
745
|
+
auto search = label_lookup_.find(label);
|
746
|
+
if (search == label_lookup_.end() || isMarkedDeleted(search->second)) {
|
747
|
+
throw std::runtime_error("Label not found");
|
748
|
+
}
|
749
|
+
tableint internalId = search->second;
|
750
|
+
lock_table.unlock();
|
751
|
+
|
752
|
+
char* data_ptrv = getDataByInternalId(internalId);
|
753
|
+
size_t dim = *((size_t *) dist_func_param_);
|
754
|
+
std::vector<data_t> data;
|
755
|
+
data_t* data_ptr = (data_t*) data_ptrv;
|
756
|
+
for (int i = 0; i < dim; i++) {
|
757
|
+
data.push_back(*data_ptr);
|
758
|
+
data_ptr += 1;
|
759
|
+
}
|
760
|
+
return data;
|
761
|
+
}
|
658
762
|
|
659
|
-
/// Optional - check if index is ok:
|
660
763
|
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
764
|
+
/*
|
765
|
+
* Marks an element with the given label deleted, does NOT really change the current graph.
|
766
|
+
*/
|
767
|
+
void markDelete(labeltype label) {
|
768
|
+
// lock all operations with element by label
|
769
|
+
std::unique_lock <std::mutex> lock_label(getLabelOpMutex(label));
|
666
770
|
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
771
|
+
std::unique_lock <std::mutex> lock_table(label_lookup_lock);
|
772
|
+
auto search = label_lookup_.find(label);
|
773
|
+
if (search == label_lookup_.end()) {
|
774
|
+
throw std::runtime_error("Label not found");
|
775
|
+
}
|
776
|
+
tableint internalId = search->second;
|
777
|
+
lock_table.unlock();
|
778
|
+
|
779
|
+
markDeletedInternal(internalId);
|
780
|
+
}
|
781
|
+
|
782
|
+
|
783
|
+
/*
|
784
|
+
* Uses the last 16 bits of the memory for the linked list size to store the mark,
|
785
|
+
* whereas maxM0_ has to be limited to the lower 16 bits, however, still large enough in almost all cases.
|
786
|
+
*/
|
787
|
+
void markDeletedInternal(tableint internalId) {
|
788
|
+
assert(internalId < cur_element_count);
|
789
|
+
if (!isMarkedDeleted(internalId)) {
|
790
|
+
unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2;
|
791
|
+
*ll_cur |= DELETE_MARK;
|
792
|
+
num_deleted_ += 1;
|
793
|
+
if (allow_replace_deleted_) {
|
794
|
+
std::unique_lock <std::mutex> lock_deleted_elements(deleted_elements_lock);
|
795
|
+
deleted_elements.insert(internalId);
|
672
796
|
}
|
797
|
+
} else {
|
798
|
+
throw std::runtime_error("The requested to delete element is already deleted");
|
799
|
+
}
|
800
|
+
}
|
801
|
+
|
802
|
+
|
803
|
+
/*
|
804
|
+
* Removes the deleted mark of the node, does NOT really change the current graph.
|
805
|
+
*
|
806
|
+
* Note: the method is not safe to use when replacement of deleted elements is enabled,
|
807
|
+
* because elements marked as deleted can be completely removed by addPoint
|
808
|
+
*/
|
809
|
+
void unmarkDelete(labeltype label) {
|
810
|
+
// lock all operations with element by label
|
811
|
+
std::unique_lock <std::mutex> lock_label(getLabelOpMutex(label));
|
812
|
+
|
813
|
+
std::unique_lock <std::mutex> lock_table(label_lookup_lock);
|
814
|
+
auto search = label_lookup_.find(label);
|
815
|
+
if (search == label_lookup_.end()) {
|
816
|
+
throw std::runtime_error("Label not found");
|
817
|
+
}
|
818
|
+
tableint internalId = search->second;
|
819
|
+
lock_table.unlock();
|
820
|
+
|
821
|
+
unmarkDeletedInternal(internalId);
|
822
|
+
}
|
823
|
+
|
824
|
+
|
825
|
+
|
826
|
+
/*
|
827
|
+
* Remove the deleted mark of the node.
|
828
|
+
*/
|
829
|
+
void unmarkDeletedInternal(tableint internalId) {
|
830
|
+
assert(internalId < cur_element_count);
|
831
|
+
if (isMarkedDeleted(internalId)) {
|
832
|
+
unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId)) + 2;
|
833
|
+
*ll_cur &= ~DELETE_MARK;
|
834
|
+
num_deleted_ -= 1;
|
835
|
+
if (allow_replace_deleted_) {
|
836
|
+
std::unique_lock <std::mutex> lock_deleted_elements(deleted_elements_lock);
|
837
|
+
deleted_elements.erase(internalId);
|
838
|
+
}
|
839
|
+
} else {
|
840
|
+
throw std::runtime_error("The requested to undelete element is not deleted");
|
841
|
+
}
|
842
|
+
}
|
673
843
|
|
674
|
-
// throw exception if it either corrupted or old index
|
675
|
-
if(input.tellg()!=total_filesize)
|
676
|
-
throw std::runtime_error("Index seems to be corrupted or unsupported");
|
677
|
-
|
678
|
-
input.clear();
|
679
|
-
|
680
|
-
/// Optional check end
|
681
|
-
|
682
|
-
input.seekg(pos,input.beg);
|
683
|
-
|
684
|
-
data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_);
|
685
|
-
if (data_level0_memory_ == nullptr)
|
686
|
-
throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0");
|
687
|
-
input.read(data_level0_memory_, cur_element_count * size_data_per_element_);
|
688
844
|
|
689
|
-
|
845
|
+
/*
|
846
|
+
* Checks the first 16 bits of the memory to see if the element is marked deleted.
|
847
|
+
*/
|
848
|
+
bool isMarkedDeleted(tableint internalId) const {
|
849
|
+
unsigned char *ll_cur = ((unsigned char*)get_linklist0(internalId)) + 2;
|
850
|
+
return *ll_cur & DELETE_MARK;
|
851
|
+
}
|
690
852
|
|
691
|
-
size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
|
692
|
-
std::vector<std::mutex>(max_elements).swap(link_list_locks_);
|
693
|
-
std::vector<std::mutex>(max_update_element_locks).swap(link_list_update_locks_);
|
694
853
|
|
695
|
-
|
854
|
+
unsigned short int getListCount(linklistsizeint * ptr) const {
|
855
|
+
return *((unsigned short int *)ptr);
|
856
|
+
}
|
696
857
|
|
697
|
-
linkLists_ = (char **) malloc(sizeof(void *) * max_elements);
|
698
|
-
if (linkLists_ == nullptr)
|
699
|
-
throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists");
|
700
|
-
element_levels_ = std::vector<int>(max_elements);
|
701
|
-
revSize_ = 1.0 / mult_;
|
702
|
-
ef_ = 10;
|
703
|
-
for (size_t i = 0; i < cur_element_count; i++) {
|
704
|
-
label_lookup_[getExternalLabel(i)]=i;
|
705
|
-
unsigned int linkListSize;
|
706
|
-
readBinaryPOD(input, linkListSize);
|
707
|
-
if (linkListSize == 0) {
|
708
|
-
element_levels_[i] = 0;
|
709
858
|
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
linkLists_[i] = (char *) malloc(linkListSize);
|
714
|
-
if (linkLists_[i] == nullptr)
|
715
|
-
throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist");
|
716
|
-
input.read(linkLists_[i], linkListSize);
|
717
|
-
}
|
718
|
-
}
|
719
|
-
|
720
|
-
for (size_t i = 0; i < cur_element_count; i++) {
|
721
|
-
if(isMarkedDeleted(i))
|
722
|
-
num_deleted_ += 1;
|
723
|
-
}
|
859
|
+
void setListCount(linklistsizeint * ptr, unsigned short int size) const {
|
860
|
+
*((unsigned short int*)(ptr))=*((unsigned short int *)&size);
|
861
|
+
}
|
724
862
|
|
725
|
-
input.close();
|
726
863
|
|
727
|
-
|
864
|
+
/*
|
865
|
+
* Adds point. Updates the point if it is already in the index.
|
866
|
+
* If replacement of deleted elements is enabled: replaces previously deleted point if any, updating it with new point
|
867
|
+
*/
|
868
|
+
void addPoint(const void *data_point, labeltype label, bool replace_deleted = false) {
|
869
|
+
if ((allow_replace_deleted_ == false) && (replace_deleted == true)) {
|
870
|
+
throw std::runtime_error("Replacement of deleted elements is disabled in constructor");
|
728
871
|
}
|
729
872
|
|
730
|
-
|
731
|
-
std::
|
732
|
-
{
|
733
|
-
|
734
|
-
|
735
|
-
if (search == label_lookup_.end() || isMarkedDeleted(search->second)) {
|
736
|
-
throw std::runtime_error("Label not found");
|
737
|
-
}
|
738
|
-
label_c = search->second;
|
739
|
-
|
740
|
-
char* data_ptrv = getDataByInternalId(label_c);
|
741
|
-
size_t dim = *((size_t *) dist_func_param_);
|
742
|
-
std::vector<data_t> data;
|
743
|
-
data_t* data_ptr = (data_t*) data_ptrv;
|
744
|
-
for (int i = 0; i < dim; i++) {
|
745
|
-
data.push_back(*data_ptr);
|
746
|
-
data_ptr += 1;
|
747
|
-
}
|
748
|
-
return data;
|
873
|
+
// lock all operations with element by label
|
874
|
+
std::unique_lock <std::mutex> lock_label(getLabelOpMutex(label));
|
875
|
+
if (!replace_deleted) {
|
876
|
+
addPoint(data_point, label, -1);
|
877
|
+
return;
|
749
878
|
}
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
void markDelete(labeltype label)
|
758
|
-
{
|
759
|
-
auto search = label_lookup_.find(label);
|
760
|
-
if (search == label_lookup_.end()) {
|
761
|
-
throw std::runtime_error("Label not found");
|
762
|
-
}
|
763
|
-
tableint internalId = search->second;
|
764
|
-
markDeletedInternal(internalId);
|
879
|
+
// check if there is vacant place
|
880
|
+
tableint internal_id_replaced;
|
881
|
+
std::unique_lock <std::mutex> lock_deleted_elements(deleted_elements_lock);
|
882
|
+
bool is_vacant_place = !deleted_elements.empty();
|
883
|
+
if (is_vacant_place) {
|
884
|
+
internal_id_replaced = *deleted_elements.begin();
|
885
|
+
deleted_elements.erase(internal_id_replaced);
|
765
886
|
}
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
887
|
+
lock_deleted_elements.unlock();
|
888
|
+
|
889
|
+
// if there is no vacant place then add or update point
|
890
|
+
// else add point to vacant place
|
891
|
+
if (!is_vacant_place) {
|
892
|
+
addPoint(data_point, label, -1);
|
893
|
+
} else {
|
894
|
+
// we assume that there are no concurrent operations on deleted element
|
895
|
+
labeltype label_replaced = getExternalLabel(internal_id_replaced);
|
896
|
+
setExternalLabel(internal_id_replaced, label);
|
897
|
+
|
898
|
+
std::unique_lock <std::mutex> lock_table(label_lookup_lock);
|
899
|
+
label_lookup_.erase(label_replaced);
|
900
|
+
label_lookup_[label] = internal_id_replaced;
|
901
|
+
lock_table.unlock();
|
902
|
+
|
903
|
+
unmarkDeletedInternal(internal_id_replaced);
|
904
|
+
updatePoint(data_point, internal_id_replaced, 1.0);
|
784
905
|
}
|
906
|
+
}
|
785
907
|
|
786
|
-
/**
|
787
|
-
* Remove the deleted mark of the node, does NOT really change the current graph.
|
788
|
-
* @param label
|
789
|
-
*/
|
790
|
-
void unmarkDelete(labeltype label)
|
791
|
-
{
|
792
|
-
auto search = label_lookup_.find(label);
|
793
|
-
if (search == label_lookup_.end()) {
|
794
|
-
throw std::runtime_error("Label not found");
|
795
|
-
}
|
796
|
-
tableint internalId = search->second;
|
797
|
-
unmarkDeletedInternal(internalId);
|
798
|
-
}
|
799
908
|
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
*/
|
804
|
-
void unmarkDeletedInternal(tableint internalId) {
|
805
|
-
assert(internalId < cur_element_count);
|
806
|
-
if (isMarkedDeleted(internalId))
|
807
|
-
{
|
808
|
-
unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2;
|
809
|
-
*ll_cur &= ~DELETE_MARK;
|
810
|
-
num_deleted_ -= 1;
|
811
|
-
}
|
812
|
-
else
|
813
|
-
{
|
814
|
-
throw std::runtime_error("The requested to undelete element is not deleted");
|
815
|
-
}
|
816
|
-
}
|
909
|
+
void updatePoint(const void *dataPoint, tableint internalId, float updateNeighborProbability) {
|
910
|
+
// update the feature vector associated with existing point with new vector
|
911
|
+
memcpy(getDataByInternalId(internalId), dataPoint, data_size_);
|
817
912
|
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
bool isMarkedDeleted(tableint internalId) const {
|
824
|
-
unsigned char *ll_cur = ((unsigned char*)get_linklist0(internalId))+2;
|
825
|
-
return *ll_cur & DELETE_MARK;
|
826
|
-
}
|
913
|
+
int maxLevelCopy = maxlevel_;
|
914
|
+
tableint entryPointCopy = enterpoint_node_;
|
915
|
+
// If point to be updated is entry point and graph just contains single element then just return.
|
916
|
+
if (entryPointCopy == internalId && cur_element_count == 1)
|
917
|
+
return;
|
827
918
|
|
828
|
-
|
829
|
-
|
830
|
-
|
919
|
+
int elemLevel = element_levels_[internalId];
|
920
|
+
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
921
|
+
for (int layer = 0; layer <= elemLevel; layer++) {
|
922
|
+
std::unordered_set<tableint> sCand;
|
923
|
+
std::unordered_set<tableint> sNeigh;
|
924
|
+
std::vector<tableint> listOneHop = getConnectionsWithLock(internalId, layer);
|
925
|
+
if (listOneHop.size() == 0)
|
926
|
+
continue;
|
831
927
|
|
832
|
-
|
833
|
-
*((unsigned short int*)(ptr))=*((unsigned short int *)&size);
|
834
|
-
}
|
928
|
+
sCand.insert(internalId);
|
835
929
|
|
836
|
-
|
837
|
-
|
838
|
-
}
|
930
|
+
for (auto&& elOneHop : listOneHop) {
|
931
|
+
sCand.insert(elOneHop);
|
839
932
|
|
840
|
-
|
841
|
-
// update the feature vector associated with existing point with new vector
|
842
|
-
memcpy(getDataByInternalId(internalId), dataPoint, data_size_);
|
843
|
-
|
844
|
-
int maxLevelCopy = maxlevel_;
|
845
|
-
tableint entryPointCopy = enterpoint_node_;
|
846
|
-
// If point to be updated is entry point and graph just contains single element then just return.
|
847
|
-
if (entryPointCopy == internalId && cur_element_count == 1)
|
848
|
-
return;
|
849
|
-
|
850
|
-
int elemLevel = element_levels_[internalId];
|
851
|
-
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
852
|
-
for (int layer = 0; layer <= elemLevel; layer++) {
|
853
|
-
std::unordered_set<tableint> sCand;
|
854
|
-
std::unordered_set<tableint> sNeigh;
|
855
|
-
std::vector<tableint> listOneHop = getConnectionsWithLock(internalId, layer);
|
856
|
-
if (listOneHop.size() == 0)
|
933
|
+
if (distribution(update_probability_generator_) > updateNeighborProbability)
|
857
934
|
continue;
|
858
935
|
|
859
|
-
|
936
|
+
sNeigh.insert(elOneHop);
|
860
937
|
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
if (distribution(update_probability_generator_) > updateNeighborProbability)
|
865
|
-
continue;
|
866
|
-
|
867
|
-
sNeigh.insert(elOneHop);
|
868
|
-
|
869
|
-
std::vector<tableint> listTwoHop = getConnectionsWithLock(elOneHop, layer);
|
870
|
-
for (auto&& elTwoHop : listTwoHop) {
|
871
|
-
sCand.insert(elTwoHop);
|
872
|
-
}
|
938
|
+
std::vector<tableint> listTwoHop = getConnectionsWithLock(elOneHop, layer);
|
939
|
+
for (auto&& elTwoHop : listTwoHop) {
|
940
|
+
sCand.insert(elTwoHop);
|
873
941
|
}
|
942
|
+
}
|
874
943
|
|
875
|
-
|
876
|
-
|
877
|
-
|
944
|
+
for (auto&& neigh : sNeigh) {
|
945
|
+
// if (neigh == internalId)
|
946
|
+
// continue;
|
878
947
|
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
948
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidates;
|
949
|
+
size_t size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1
|
950
|
+
size_t elementsToKeep = std::min(ef_construction_, size);
|
951
|
+
for (auto&& cand : sCand) {
|
952
|
+
if (cand == neigh)
|
953
|
+
continue;
|
954
|
+
|
955
|
+
dist_t distance = fstdistfunc_(getDataByInternalId(neigh), getDataByInternalId(cand), dist_func_param_);
|
956
|
+
if (candidates.size() < elementsToKeep) {
|
957
|
+
candidates.emplace(distance, cand);
|
958
|
+
} else {
|
959
|
+
if (distance < candidates.top().first) {
|
960
|
+
candidates.pop();
|
888
961
|
candidates.emplace(distance, cand);
|
889
|
-
} else {
|
890
|
-
if (distance < candidates.top().first) {
|
891
|
-
candidates.pop();
|
892
|
-
candidates.emplace(distance, cand);
|
893
|
-
}
|
894
962
|
}
|
895
963
|
}
|
964
|
+
}
|
896
965
|
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
}
|
966
|
+
// Retrieve neighbours using heuristic and set connections.
|
967
|
+
getNeighborsByHeuristic2(candidates, layer == 0 ? maxM0_ : maxM_);
|
968
|
+
|
969
|
+
{
|
970
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[neigh]);
|
971
|
+
linklistsizeint *ll_cur;
|
972
|
+
ll_cur = get_linklist_at_level(neigh, layer);
|
973
|
+
size_t candSize = candidates.size();
|
974
|
+
setListCount(ll_cur, candSize);
|
975
|
+
tableint *data = (tableint *) (ll_cur + 1);
|
976
|
+
for (size_t idx = 0; idx < candSize; idx++) {
|
977
|
+
data[idx] = candidates.top().second;
|
978
|
+
candidates.pop();
|
911
979
|
}
|
912
980
|
}
|
913
981
|
}
|
982
|
+
}
|
914
983
|
|
915
|
-
|
916
|
-
|
984
|
+
repairConnectionsForUpdate(dataPoint, entryPointCopy, internalId, elemLevel, maxLevelCopy);
|
985
|
+
}
|
917
986
|
|
918
|
-
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
987
|
+
|
988
|
+
void repairConnectionsForUpdate(
|
989
|
+
const void *dataPoint,
|
990
|
+
tableint entryPointInternalId,
|
991
|
+
tableint dataPointInternalId,
|
992
|
+
int dataPointLevel,
|
993
|
+
int maxLevel) {
|
994
|
+
tableint currObj = entryPointInternalId;
|
995
|
+
if (dataPointLevel < maxLevel) {
|
996
|
+
dist_t curdist = fstdistfunc_(dataPoint, getDataByInternalId(currObj), dist_func_param_);
|
997
|
+
for (int level = maxLevel; level > dataPointLevel; level--) {
|
998
|
+
bool changed = true;
|
999
|
+
while (changed) {
|
1000
|
+
changed = false;
|
1001
|
+
unsigned int *data;
|
1002
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[currObj]);
|
1003
|
+
data = get_linklist_at_level(currObj, level);
|
1004
|
+
int size = getListCount(data);
|
1005
|
+
tableint *datal = (tableint *) (data + 1);
|
931
1006
|
#ifdef USE_SSE
|
932
|
-
|
1007
|
+
_mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0);
|
933
1008
|
#endif
|
934
|
-
|
1009
|
+
for (int i = 0; i < size; i++) {
|
935
1010
|
#ifdef USE_SSE
|
936
|
-
|
1011
|
+
_mm_prefetch(getDataByInternalId(*(datal + i + 1)), _MM_HINT_T0);
|
937
1012
|
#endif
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
}
|
1013
|
+
tableint cand = datal[i];
|
1014
|
+
dist_t d = fstdistfunc_(dataPoint, getDataByInternalId(cand), dist_func_param_);
|
1015
|
+
if (d < curdist) {
|
1016
|
+
curdist = d;
|
1017
|
+
currObj = cand;
|
1018
|
+
changed = true;
|
945
1019
|
}
|
946
1020
|
}
|
947
1021
|
}
|
948
1022
|
}
|
1023
|
+
}
|
949
1024
|
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
for (int level = dataPointLevel; level >= 0; level--) {
|
954
|
-
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> topCandidates = searchBaseLayer(
|
955
|
-
currObj, dataPoint, level);
|
1025
|
+
if (dataPointLevel > maxLevel)
|
1026
|
+
throw std::runtime_error("Level of item to be updated cannot be bigger than max level");
|
956
1027
|
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
filteredTopCandidates.push(topCandidates.top());
|
1028
|
+
for (int level = dataPointLevel; level >= 0; level--) {
|
1029
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> topCandidates = searchBaseLayer(
|
1030
|
+
currObj, dataPoint, level);
|
961
1031
|
|
962
|
-
|
963
|
-
|
1032
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> filteredTopCandidates;
|
1033
|
+
while (topCandidates.size() > 0) {
|
1034
|
+
if (topCandidates.top().second != dataPointInternalId)
|
1035
|
+
filteredTopCandidates.push(topCandidates.top());
|
964
1036
|
|
965
|
-
|
966
|
-
|
967
|
-
if (filteredTopCandidates.size() > 0) {
|
968
|
-
bool epDeleted = isMarkedDeleted(entryPointInternalId);
|
969
|
-
if (epDeleted) {
|
970
|
-
filteredTopCandidates.emplace(fstdistfunc_(dataPoint, getDataByInternalId(entryPointInternalId), dist_func_param_), entryPointInternalId);
|
971
|
-
if (filteredTopCandidates.size() > ef_construction_)
|
972
|
-
filteredTopCandidates.pop();
|
973
|
-
}
|
1037
|
+
topCandidates.pop();
|
1038
|
+
}
|
974
1039
|
|
975
|
-
|
1040
|
+
// Since element_levels_ is being used to get `dataPointLevel`, there could be cases where `topCandidates` could just contains entry point itself.
|
1041
|
+
// To prevent self loops, the `topCandidates` is filtered and thus can be empty.
|
1042
|
+
if (filteredTopCandidates.size() > 0) {
|
1043
|
+
bool epDeleted = isMarkedDeleted(entryPointInternalId);
|
1044
|
+
if (epDeleted) {
|
1045
|
+
filteredTopCandidates.emplace(fstdistfunc_(dataPoint, getDataByInternalId(entryPointInternalId), dist_func_param_), entryPointInternalId);
|
1046
|
+
if (filteredTopCandidates.size() > ef_construction_)
|
1047
|
+
filteredTopCandidates.pop();
|
976
1048
|
}
|
1049
|
+
|
1050
|
+
currObj = mutuallyConnectNewElement(dataPoint, dataPointInternalId, filteredTopCandidates, level, true);
|
977
1051
|
}
|
978
1052
|
}
|
1053
|
+
}
|
1054
|
+
|
979
1055
|
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
tableint addPoint(const void *data_point, labeltype label, int level) {
|
991
|
-
|
992
|
-
tableint cur_c = 0;
|
993
|
-
{
|
994
|
-
// Checking if the element with the same label already exists
|
995
|
-
// if so, updating it *instead* of creating a new element.
|
996
|
-
std::unique_lock <std::mutex> templock_curr(cur_element_count_guard_);
|
997
|
-
auto search = label_lookup_.find(label);
|
998
|
-
if (search != label_lookup_.end()) {
|
999
|
-
tableint existingInternalId = search->second;
|
1000
|
-
templock_curr.unlock();
|
1001
|
-
|
1002
|
-
std::unique_lock <std::mutex> lock_el_update(link_list_update_locks_[(existingInternalId & (max_update_element_locks - 1))]);
|
1056
|
+
std::vector<tableint> getConnectionsWithLock(tableint internalId, int level) {
|
1057
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[internalId]);
|
1058
|
+
unsigned int *data = get_linklist_at_level(internalId, level);
|
1059
|
+
int size = getListCount(data);
|
1060
|
+
std::vector<tableint> result(size);
|
1061
|
+
tableint *ll = (tableint *) (data + 1);
|
1062
|
+
memcpy(result.data(), ll, size * sizeof(tableint));
|
1063
|
+
return result;
|
1064
|
+
}
|
1003
1065
|
|
1066
|
+
|
1067
|
+
tableint addPoint(const void *data_point, labeltype label, int level) {
|
1068
|
+
tableint cur_c = 0;
|
1069
|
+
{
|
1070
|
+
// Checking if the element with the same label already exists
|
1071
|
+
// if so, updating it *instead* of creating a new element.
|
1072
|
+
std::unique_lock <std::mutex> lock_table(label_lookup_lock);
|
1073
|
+
auto search = label_lookup_.find(label);
|
1074
|
+
if (search != label_lookup_.end()) {
|
1075
|
+
tableint existingInternalId = search->second;
|
1076
|
+
if (allow_replace_deleted_) {
|
1004
1077
|
if (isMarkedDeleted(existingInternalId)) {
|
1005
|
-
|
1078
|
+
throw std::runtime_error("Can't use addPoint to update deleted elements if replacement of deleted elements is enabled.");
|
1006
1079
|
}
|
1007
|
-
updatePoint(data_point, existingInternalId, 1.0);
|
1008
|
-
|
1009
|
-
return existingInternalId;
|
1010
1080
|
}
|
1081
|
+
lock_table.unlock();
|
1011
1082
|
|
1012
|
-
if (
|
1013
|
-
|
1014
|
-
}
|
1083
|
+
if (isMarkedDeleted(existingInternalId)) {
|
1084
|
+
unmarkDeletedInternal(existingInternalId);
|
1085
|
+
}
|
1086
|
+
updatePoint(data_point, existingInternalId, 1.0);
|
1015
1087
|
|
1016
|
-
|
1017
|
-
cur_element_count++;
|
1018
|
-
label_lookup_[label] = cur_c;
|
1088
|
+
return existingInternalId;
|
1019
1089
|
}
|
1020
1090
|
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
int curlevel = getRandomLevel(mult_);
|
1025
|
-
if (level > 0)
|
1026
|
-
curlevel = level;
|
1091
|
+
if (cur_element_count >= max_elements_) {
|
1092
|
+
throw std::runtime_error("The number of elements exceeds the specified limit");
|
1093
|
+
}
|
1027
1094
|
|
1028
|
-
|
1095
|
+
cur_c = cur_element_count;
|
1096
|
+
cur_element_count++;
|
1097
|
+
label_lookup_[label] = cur_c;
|
1098
|
+
}
|
1029
1099
|
|
1100
|
+
std::unique_lock <std::mutex> lock_el(link_list_locks_[cur_c]);
|
1101
|
+
int curlevel = getRandomLevel(mult_);
|
1102
|
+
if (level > 0)
|
1103
|
+
curlevel = level;
|
1030
1104
|
|
1031
|
-
|
1032
|
-
int maxlevelcopy = maxlevel_;
|
1033
|
-
if (curlevel <= maxlevelcopy)
|
1034
|
-
templock.unlock();
|
1035
|
-
tableint currObj = enterpoint_node_;
|
1036
|
-
tableint enterpoint_copy = enterpoint_node_;
|
1105
|
+
element_levels_[cur_c] = curlevel;
|
1037
1106
|
|
1107
|
+
std::unique_lock <std::mutex> templock(global);
|
1108
|
+
int maxlevelcopy = maxlevel_;
|
1109
|
+
if (curlevel <= maxlevelcopy)
|
1110
|
+
templock.unlock();
|
1111
|
+
tableint currObj = enterpoint_node_;
|
1112
|
+
tableint enterpoint_copy = enterpoint_node_;
|
1038
1113
|
|
1039
|
-
|
1114
|
+
memset(data_level0_memory_ + cur_c * size_data_per_element_ + offsetLevel0_, 0, size_data_per_element_);
|
1040
1115
|
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1116
|
+
// Initialisation of the data and label
|
1117
|
+
memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype));
|
1118
|
+
memcpy(getDataByInternalId(cur_c), data_point, data_size_);
|
1044
1119
|
|
1120
|
+
if (curlevel) {
|
1121
|
+
linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1);
|
1122
|
+
if (linkLists_[cur_c] == nullptr)
|
1123
|
+
throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist");
|
1124
|
+
memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1);
|
1125
|
+
}
|
1045
1126
|
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1127
|
+
if ((signed)currObj != -1) {
|
1128
|
+
if (curlevel < maxlevelcopy) {
|
1129
|
+
dist_t curdist = fstdistfunc_(data_point, getDataByInternalId(currObj), dist_func_param_);
|
1130
|
+
for (int level = maxlevelcopy; level > curlevel; level--) {
|
1131
|
+
bool changed = true;
|
1132
|
+
while (changed) {
|
1133
|
+
changed = false;
|
1134
|
+
unsigned int *data;
|
1135
|
+
std::unique_lock <std::mutex> lock(link_list_locks_[currObj]);
|
1136
|
+
data = get_linklist(currObj, level);
|
1137
|
+
int size = getListCount(data);
|
1052
1138
|
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1062
|
-
|
1063
|
-
changed = false;
|
1064
|
-
unsigned int *data;
|
1065
|
-
std::unique_lock <std::mutex> lock(link_list_locks_[currObj]);
|
1066
|
-
data = get_linklist(currObj,level);
|
1067
|
-
int size = getListCount(data);
|
1068
|
-
|
1069
|
-
tableint *datal = (tableint *) (data + 1);
|
1070
|
-
for (int i = 0; i < size; i++) {
|
1071
|
-
tableint cand = datal[i];
|
1072
|
-
if (cand < 0 || cand > max_elements_)
|
1073
|
-
throw std::runtime_error("cand error");
|
1074
|
-
dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
|
1075
|
-
if (d < curdist) {
|
1076
|
-
curdist = d;
|
1077
|
-
currObj = cand;
|
1078
|
-
changed = true;
|
1079
|
-
}
|
1139
|
+
tableint *datal = (tableint *) (data + 1);
|
1140
|
+
for (int i = 0; i < size; i++) {
|
1141
|
+
tableint cand = datal[i];
|
1142
|
+
if (cand < 0 || cand > max_elements_)
|
1143
|
+
throw std::runtime_error("cand error");
|
1144
|
+
dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
|
1145
|
+
if (d < curdist) {
|
1146
|
+
curdist = d;
|
1147
|
+
currObj = cand;
|
1148
|
+
changed = true;
|
1080
1149
|
}
|
1081
1150
|
}
|
1082
1151
|
}
|
1083
1152
|
}
|
1153
|
+
}
|
1084
1154
|
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
}
|
1097
|
-
currObj = mutuallyConnectNewElement(data_point, cur_c, top_candidates, level, false);
|
1155
|
+
bool epDeleted = isMarkedDeleted(enterpoint_copy);
|
1156
|
+
for (int level = std::min(curlevel, maxlevelcopy); level >= 0; level--) {
|
1157
|
+
if (level > maxlevelcopy || level < 0) // possible?
|
1158
|
+
throw std::runtime_error("Level error");
|
1159
|
+
|
1160
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates = searchBaseLayer(
|
1161
|
+
currObj, data_point, level);
|
1162
|
+
if (epDeleted) {
|
1163
|
+
top_candidates.emplace(fstdistfunc_(data_point, getDataByInternalId(enterpoint_copy), dist_func_param_), enterpoint_copy);
|
1164
|
+
if (top_candidates.size() > ef_construction_)
|
1165
|
+
top_candidates.pop();
|
1098
1166
|
}
|
1099
|
-
|
1100
|
-
|
1101
|
-
} else {
|
1102
|
-
// Do nothing for the first element
|
1103
|
-
enterpoint_node_ = 0;
|
1104
|
-
maxlevel_ = curlevel;
|
1105
|
-
|
1167
|
+
currObj = mutuallyConnectNewElement(data_point, cur_c, top_candidates, level, false);
|
1106
1168
|
}
|
1169
|
+
} else {
|
1170
|
+
// Do nothing for the first element
|
1171
|
+
enterpoint_node_ = 0;
|
1172
|
+
maxlevel_ = curlevel;
|
1173
|
+
}
|
1107
1174
|
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1175
|
+
// Releasing lock for the maximum level
|
1176
|
+
if (curlevel > maxlevelcopy) {
|
1177
|
+
enterpoint_node_ = cur_c;
|
1178
|
+
maxlevel_ = curlevel;
|
1179
|
+
}
|
1180
|
+
return cur_c;
|
1181
|
+
}
|
1115
1182
|
|
1116
|
-
std::priority_queue<std::pair<dist_t, labeltype >>
|
1117
|
-
searchKnn(const void *query_data, size_t k) const {
|
1118
|
-
std::priority_queue<std::pair<dist_t, labeltype >> result;
|
1119
|
-
if (cur_element_count == 0) return result;
|
1120
1183
|
|
1121
|
-
|
1122
|
-
|
1184
|
+
std::priority_queue<std::pair<dist_t, labeltype >>
|
1185
|
+
searchKnn(const void *query_data, size_t k, BaseFilterFunctor* isIdAllowed = nullptr) const {
|
1186
|
+
std::priority_queue<std::pair<dist_t, labeltype >> result;
|
1187
|
+
if (cur_element_count == 0) return result;
|
1123
1188
|
|
1124
|
-
|
1125
|
-
|
1126
|
-
while (changed) {
|
1127
|
-
changed = false;
|
1128
|
-
unsigned int *data;
|
1189
|
+
tableint currObj = enterpoint_node_;
|
1190
|
+
dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
|
1129
1191
|
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1192
|
+
for (int level = maxlevel_; level > 0; level--) {
|
1193
|
+
bool changed = true;
|
1194
|
+
while (changed) {
|
1195
|
+
changed = false;
|
1196
|
+
unsigned int *data;
|
1134
1197
|
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
throw std::runtime_error("cand error");
|
1140
|
-
dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
|
1198
|
+
data = (unsigned int *) get_linklist(currObj, level);
|
1199
|
+
int size = getListCount(data);
|
1200
|
+
metric_hops++;
|
1201
|
+
metric_distance_computations+=size;
|
1141
1202
|
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1203
|
+
tableint *datal = (tableint *) (data + 1);
|
1204
|
+
for (int i = 0; i < size; i++) {
|
1205
|
+
tableint cand = datal[i];
|
1206
|
+
if (cand < 0 || cand > max_elements_)
|
1207
|
+
throw std::runtime_error("cand error");
|
1208
|
+
dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
|
1209
|
+
|
1210
|
+
if (d < curdist) {
|
1211
|
+
curdist = d;
|
1212
|
+
currObj = cand;
|
1213
|
+
changed = true;
|
1147
1214
|
}
|
1148
1215
|
}
|
1149
1216
|
}
|
1217
|
+
}
|
1150
1218
|
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
}
|
1219
|
+
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
|
1220
|
+
if (num_deleted_) {
|
1221
|
+
top_candidates = searchBaseLayerST<true, true>(
|
1222
|
+
currObj, query_data, std::max(ef_, k), isIdAllowed);
|
1223
|
+
} else {
|
1224
|
+
top_candidates = searchBaseLayerST<false, true>(
|
1225
|
+
currObj, query_data, std::max(ef_, k), isIdAllowed);
|
1226
|
+
}
|
1160
1227
|
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
void checkIntegrity(){
|
1173
|
-
int connections_checked=0;
|
1174
|
-
std::vector <int > inbound_connections_num(cur_element_count,0);
|
1175
|
-
for(int i = 0;i < cur_element_count; i++){
|
1176
|
-
for(int l = 0;l <= element_levels_[i]; l++){
|
1177
|
-
linklistsizeint *ll_cur = get_linklist_at_level(i,l);
|
1178
|
-
int size = getListCount(ll_cur);
|
1179
|
-
tableint *data = (tableint *) (ll_cur + 1);
|
1180
|
-
std::unordered_set<tableint> s;
|
1181
|
-
for (int j=0; j<size; j++){
|
1182
|
-
assert(data[j] > 0);
|
1183
|
-
assert(data[j] < cur_element_count);
|
1184
|
-
assert (data[j] != i);
|
1185
|
-
inbound_connections_num[data[j]]++;
|
1186
|
-
s.insert(data[j]);
|
1187
|
-
connections_checked++;
|
1228
|
+
while (top_candidates.size() > k) {
|
1229
|
+
top_candidates.pop();
|
1230
|
+
}
|
1231
|
+
while (top_candidates.size() > 0) {
|
1232
|
+
std::pair<dist_t, tableint> rez = top_candidates.top();
|
1233
|
+
result.push(std::pair<dist_t, labeltype>(rez.first, getExternalLabel(rez.second)));
|
1234
|
+
top_candidates.pop();
|
1235
|
+
}
|
1236
|
+
return result;
|
1237
|
+
}
|
1188
1238
|
|
1189
|
-
|
1190
|
-
|
1239
|
+
|
1240
|
+
void checkIntegrity() {
|
1241
|
+
int connections_checked = 0;
|
1242
|
+
std::vector <int > inbound_connections_num(cur_element_count, 0);
|
1243
|
+
for (int i = 0; i < cur_element_count; i++) {
|
1244
|
+
for (int l = 0; l <= element_levels_[i]; l++) {
|
1245
|
+
linklistsizeint *ll_cur = get_linklist_at_level(i, l);
|
1246
|
+
int size = getListCount(ll_cur);
|
1247
|
+
tableint *data = (tableint *) (ll_cur + 1);
|
1248
|
+
std::unordered_set<tableint> s;
|
1249
|
+
for (int j = 0; j < size; j++) {
|
1250
|
+
assert(data[j] > 0);
|
1251
|
+
assert(data[j] < cur_element_count);
|
1252
|
+
assert(data[j] != i);
|
1253
|
+
inbound_connections_num[data[j]]++;
|
1254
|
+
s.insert(data[j]);
|
1255
|
+
connections_checked++;
|
1191
1256
|
}
|
1257
|
+
assert(s.size() == size);
|
1192
1258
|
}
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n";
|
1259
|
+
}
|
1260
|
+
if (cur_element_count > 1) {
|
1261
|
+
int min1 = inbound_connections_num[0], max1 = inbound_connections_num[0];
|
1262
|
+
for (int i=0; i < cur_element_count; i++) {
|
1263
|
+
assert(inbound_connections_num[i] > 0);
|
1264
|
+
min1 = std::min(inbound_connections_num[i], min1);
|
1265
|
+
max1 = std::max(inbound_connections_num[i], max1);
|
1201
1266
|
}
|
1202
|
-
std::cout << "
|
1203
|
-
|
1267
|
+
std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n";
|
1204
1268
|
}
|
1205
|
-
|
1206
|
-
}
|
1207
|
-
|
1208
|
-
}
|
1269
|
+
std::cout << "integrity ok, checked " << connections_checked << " connections\n";
|
1270
|
+
}
|
1271
|
+
};
|
1272
|
+
} // namespace hnswlib
|