faiss 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Clustering.cpp +32 -0
- data/vendor/faiss/faiss/Clustering.h +14 -0
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/Index2Layer.cpp +19 -92
- data/vendor/faiss/faiss/Index2Layer.h +2 -16
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
- data/vendor/faiss/faiss/{IndexResidual.h → IndexAdditiveQuantizer.h} +101 -58
- data/vendor/faiss/faiss/IndexFlat.cpp +22 -52
- data/vendor/faiss/faiss/IndexFlat.h +9 -15
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
- data/vendor/faiss/faiss/IndexIVF.cpp +79 -7
- data/vendor/faiss/faiss/IndexIVF.h +25 -7
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +9 -12
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +5 -4
- data/vendor/faiss/faiss/IndexIVFPQ.h +1 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +60 -39
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +21 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +4 -30
- data/vendor/faiss/faiss/IndexLSH.h +2 -15
- data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +0 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +2 -51
- data/vendor/faiss/faiss/IndexPQ.h +2 -17
- data/vendor/faiss/faiss/IndexRefine.cpp +28 -0
- data/vendor/faiss/faiss/IndexRefine.h +10 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -28
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -16
- data/vendor/faiss/faiss/VectorTransform.cpp +2 -1
- data/vendor/faiss/faiss/VectorTransform.h +3 -0
- data/vendor/faiss/faiss/clone_index.cpp +3 -2
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -2
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +257 -24
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +69 -9
- data/vendor/faiss/faiss/impl/HNSW.cpp +10 -5
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +393 -210
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +100 -28
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -3
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +357 -47
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +65 -7
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +12 -19
- data/vendor/faiss/faiss/impl/index_read.cpp +102 -19
- data/vendor/faiss/faiss/impl/index_write.cpp +66 -16
- data/vendor/faiss/faiss/impl/io.cpp +1 -1
- data/vendor/faiss/faiss/impl/io_macros.h +20 -0
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
- data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
- data/vendor/faiss/faiss/index_factory.cpp +585 -414
- data/vendor/faiss/faiss/index_factory.h +3 -0
- data/vendor/faiss/faiss/utils/distances.cpp +4 -2
- data/vendor/faiss/faiss/utils/distances.h +36 -3
- data/vendor/faiss/faiss/utils/distances_simd.cpp +50 -0
- data/vendor/faiss/faiss/utils/utils.h +1 -1
- metadata +12 -5
- data/vendor/faiss/faiss/IndexResidual.cpp +0 -291
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15428eb8dd7d27f8a94e3a7797dd765827e5454def33fa785055adb7ef0d20c5
|
4
|
+
data.tar.gz: 3e8eafebc49163c928bcab8d0ebd0f7b69e6659e49f36d7aaeb44e8651853ac9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 598f6e626d5c970e408cff68ec479bf1aa2d6ee18adeeeb2489d1c4fbf627dacbc6e398ce149f0483720080a439df4d2887e5b6c9dc9f465e8ffa1bbeede84a8
|
7
|
+
data.tar.gz: fefebfbbbbceb58ac6c6b02636630943d30db7af4ee92521586b4fb40e73bf1aa2b401e28e4c27872bf9344f60e3c9fb288ec6420abc1ffccd05ffd3ec7379fd
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
MIT License
|
2
2
|
|
3
3
|
Copyright (c) Facebook, Inc. and its affiliates.
|
4
|
-
Copyright (c) 2020-
|
4
|
+
Copyright (c) 2020-2022 Andrew Kane
|
5
5
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/lib/faiss/version.rb
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
|
21
21
|
#include <faiss/IndexFlat.h>
|
22
22
|
#include <faiss/impl/FaissAssert.h>
|
23
|
+
#include <faiss/impl/kmeans1d.h>
|
23
24
|
#include <faiss/utils/distances.h>
|
24
25
|
#include <faiss/utils/random.h>
|
25
26
|
#include <faiss/utils/utils.h>
|
@@ -553,6 +554,37 @@ void Clustering::train_encoded(
|
|
553
554
|
}
|
554
555
|
}
|
555
556
|
|
557
|
+
Clustering1D::Clustering1D(int k) : Clustering(1, k) {}
|
558
|
+
|
559
|
+
Clustering1D::Clustering1D(int k, const ClusteringParameters& cp)
|
560
|
+
: Clustering(1, k, cp) {}
|
561
|
+
|
562
|
+
void Clustering1D::train_exact(idx_t n, const float* x) {
|
563
|
+
const float* xt = x;
|
564
|
+
|
565
|
+
std::unique_ptr<uint8_t[]> del;
|
566
|
+
if (n > k * max_points_per_centroid) {
|
567
|
+
uint8_t* x_new;
|
568
|
+
float* weights_new;
|
569
|
+
n = subsample_training_set(
|
570
|
+
*this,
|
571
|
+
n,
|
572
|
+
(uint8_t*)x,
|
573
|
+
sizeof(float) * d,
|
574
|
+
nullptr,
|
575
|
+
&x_new,
|
576
|
+
&weights_new);
|
577
|
+
del.reset(x_new);
|
578
|
+
xt = (float*)x_new;
|
579
|
+
}
|
580
|
+
|
581
|
+
centroids.resize(k);
|
582
|
+
double uf = kmeans1d(xt, n, k, centroids.data());
|
583
|
+
|
584
|
+
ClusteringIterationStats stats = {0.0, 0.0, 0.0, uf, 0};
|
585
|
+
iteration_stats.push_back(stats);
|
586
|
+
}
|
587
|
+
|
556
588
|
float kmeans_clustering(
|
557
589
|
size_t d,
|
558
590
|
size_t n,
|
@@ -111,6 +111,20 @@ struct Clustering : ClusteringParameters {
|
|
111
111
|
virtual ~Clustering() {}
|
112
112
|
};
|
113
113
|
|
114
|
+
/** Exact 1D clustering algorithm
|
115
|
+
*
|
116
|
+
* Since it does not use an index, it does not overload the train() function
|
117
|
+
*/
|
118
|
+
struct Clustering1D : Clustering {
|
119
|
+
explicit Clustering1D(int k);
|
120
|
+
|
121
|
+
Clustering1D(int k, const ClusteringParameters& cp);
|
122
|
+
|
123
|
+
void train_exact(idx_t n, const float* x);
|
124
|
+
|
125
|
+
virtual ~Clustering1D() {}
|
126
|
+
};
|
127
|
+
|
114
128
|
struct ProgressiveDimClusteringParameters : ClusteringParameters {
|
115
129
|
int progressive_dim_steps; ///< number of incremental steps
|
116
130
|
bool apply_pca; ///< apply PCA on input
|
data/vendor/faiss/faiss/Index.h
CHANGED
@@ -30,16 +30,6 @@
|
|
30
30
|
#include <faiss/utils/distances.h>
|
31
31
|
#include <faiss/utils/utils.h>
|
32
32
|
|
33
|
-
/*
|
34
|
-
#include <faiss/utils/Heap.h>
|
35
|
-
|
36
|
-
#include <faiss/Clustering.h>
|
37
|
-
|
38
|
-
#include <faiss/utils/hamming.h>
|
39
|
-
|
40
|
-
|
41
|
-
*/
|
42
|
-
|
43
33
|
namespace faiss {
|
44
34
|
|
45
35
|
/*************************************
|
@@ -52,7 +42,7 @@ Index2Layer::Index2Layer(
|
|
52
42
|
int M,
|
53
43
|
int nbit,
|
54
44
|
MetricType metric)
|
55
|
-
: Index(quantizer->d, metric),
|
45
|
+
: IndexFlatCodes(0, quantizer->d, metric),
|
56
46
|
q1(quantizer, nlist),
|
57
47
|
pq(quantizer->d, M, nbit) {
|
58
48
|
is_trained = false;
|
@@ -116,55 +106,6 @@ void Index2Layer::train(idx_t n, const float* x) {
|
|
116
106
|
is_trained = true;
|
117
107
|
}
|
118
108
|
|
119
|
-
void Index2Layer::add(idx_t n, const float* x) {
|
120
|
-
idx_t bs = 32768;
|
121
|
-
if (n > bs) {
|
122
|
-
for (idx_t i0 = 0; i0 < n; i0 += bs) {
|
123
|
-
idx_t i1 = std::min(i0 + bs, n);
|
124
|
-
if (verbose) {
|
125
|
-
printf("Index2Layer::add: adding %" PRId64 ":%" PRId64
|
126
|
-
" / %" PRId64 "\n",
|
127
|
-
i0,
|
128
|
-
i1,
|
129
|
-
n);
|
130
|
-
}
|
131
|
-
add(i1 - i0, x + i0 * d);
|
132
|
-
}
|
133
|
-
return;
|
134
|
-
}
|
135
|
-
|
136
|
-
std::vector<idx_t> codes1(n);
|
137
|
-
q1.quantizer->assign(n, x, codes1.data());
|
138
|
-
std::vector<float> residuals(n * d);
|
139
|
-
for (idx_t i = 0; i < n; i++) {
|
140
|
-
q1.quantizer->compute_residual(
|
141
|
-
x + i * d, residuals.data() + i * d, codes1[i]);
|
142
|
-
}
|
143
|
-
std::vector<uint8_t> codes2(n * code_size_2);
|
144
|
-
|
145
|
-
pq.compute_codes(residuals.data(), codes2.data(), n);
|
146
|
-
|
147
|
-
codes.resize((ntotal + n) * code_size);
|
148
|
-
uint8_t* wp = &codes[ntotal * code_size];
|
149
|
-
|
150
|
-
{
|
151
|
-
int i = 0x11223344;
|
152
|
-
const char* ip = (char*)&i;
|
153
|
-
FAISS_THROW_IF_NOT_MSG(
|
154
|
-
ip[0] == 0x44, "works only on a little-endian CPU");
|
155
|
-
}
|
156
|
-
|
157
|
-
// copy to output table
|
158
|
-
for (idx_t i = 0; i < n; i++) {
|
159
|
-
memcpy(wp, &codes1[i], code_size_1);
|
160
|
-
wp += code_size_1;
|
161
|
-
memcpy(wp, &codes2[i * code_size_2], code_size_2);
|
162
|
-
wp += code_size_2;
|
163
|
-
}
|
164
|
-
|
165
|
-
ntotal += n;
|
166
|
-
}
|
167
|
-
|
168
109
|
void Index2Layer::search(
|
169
110
|
idx_t /*n*/,
|
170
111
|
const float* /*x*/,
|
@@ -174,25 +115,6 @@ void Index2Layer::search(
|
|
174
115
|
FAISS_THROW_MSG("not implemented");
|
175
116
|
}
|
176
117
|
|
177
|
-
void Index2Layer::reconstruct_n(idx_t i0, idx_t ni, float* recons) const {
|
178
|
-
std::vector<float> recons1(d);
|
179
|
-
FAISS_THROW_IF_NOT(i0 >= 0 && i0 + ni <= ntotal);
|
180
|
-
const uint8_t* rp = &codes[i0 * code_size];
|
181
|
-
|
182
|
-
for (idx_t i = 0; i < ni; i++) {
|
183
|
-
idx_t key = 0;
|
184
|
-
memcpy(&key, rp, code_size_1);
|
185
|
-
q1.quantizer->reconstruct(key, recons1.data());
|
186
|
-
rp += code_size_1;
|
187
|
-
pq.decode(rp, recons);
|
188
|
-
for (idx_t j = 0; j < d; j++) {
|
189
|
-
recons[j] += recons1[j];
|
190
|
-
}
|
191
|
-
rp += code_size_2;
|
192
|
-
recons += d;
|
193
|
-
}
|
194
|
-
}
|
195
|
-
|
196
118
|
void Index2Layer::transfer_to_IVFPQ(IndexIVFPQ& other) const {
|
197
119
|
FAISS_THROW_IF_NOT(other.nlist == q1.nlist);
|
198
120
|
FAISS_THROW_IF_NOT(other.code_size == code_size_2);
|
@@ -211,15 +133,6 @@ void Index2Layer::transfer_to_IVFPQ(IndexIVFPQ& other) const {
|
|
211
133
|
other.ntotal = ntotal;
|
212
134
|
}
|
213
135
|
|
214
|
-
void Index2Layer::reconstruct(idx_t key, float* recons) const {
|
215
|
-
reconstruct_n(key, 1, recons);
|
216
|
-
}
|
217
|
-
|
218
|
-
void Index2Layer::reset() {
|
219
|
-
ntotal = 0;
|
220
|
-
codes.clear();
|
221
|
-
}
|
222
|
-
|
223
136
|
namespace {
|
224
137
|
|
225
138
|
struct Distance2Level : DistanceComputer {
|
@@ -259,7 +172,7 @@ struct DistanceXPQ4 : Distance2Level {
|
|
259
172
|
|
260
173
|
FAISS_ASSERT(quantizer);
|
261
174
|
M = storage.pq.M;
|
262
|
-
pq_l1_tab = quantizer->xb.data();
|
175
|
+
pq_l1_tab = quantizer->get_xb();
|
263
176
|
}
|
264
177
|
|
265
178
|
float operator()(idx_t i) override {
|
@@ -368,12 +281,26 @@ DistanceComputer* Index2Layer::get_distance_computer() const {
|
|
368
281
|
}
|
369
282
|
|
370
283
|
/* The standalone codec interface */
|
371
|
-
size_t Index2Layer::sa_code_size() const {
|
372
|
-
return code_size;
|
373
|
-
}
|
374
284
|
|
375
285
|
void Index2Layer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
|
376
286
|
FAISS_THROW_IF_NOT(is_trained);
|
287
|
+
|
288
|
+
idx_t bs = 32768;
|
289
|
+
if (n > bs) {
|
290
|
+
for (idx_t i0 = 0; i0 < n; i0 += bs) {
|
291
|
+
idx_t i1 = std::min(i0 + bs, n);
|
292
|
+
if (verbose) {
|
293
|
+
printf("Index2Layer::add: adding %" PRId64 ":%" PRId64
|
294
|
+
" / %" PRId64 "\n",
|
295
|
+
i0,
|
296
|
+
i1,
|
297
|
+
n);
|
298
|
+
}
|
299
|
+
sa_encode(i1 - i0, x + i0 * d, bytes + i0 * code_size);
|
300
|
+
}
|
301
|
+
return;
|
302
|
+
}
|
303
|
+
|
377
304
|
std::unique_ptr<int64_t[]> list_nos(new int64_t[n]);
|
378
305
|
q1.quantizer->assign(n, x, list_nos.get());
|
379
306
|
std::vector<float> residuals(n * d);
|
@@ -11,6 +11,7 @@
|
|
11
11
|
|
12
12
|
#include <vector>
|
13
13
|
|
14
|
+
#include <faiss/IndexFlatCodes.h>
|
14
15
|
#include <faiss/IndexIVF.h>
|
15
16
|
#include <faiss/IndexPQ.h>
|
16
17
|
|
@@ -24,25 +25,19 @@ struct IndexIVFPQ;
|
|
24
25
|
* The class is mainly intended to store encoded vectors that can be
|
25
26
|
* accessed randomly, the search function is not implemented.
|
26
27
|
*/
|
27
|
-
struct Index2Layer : Index {
|
28
|
+
struct Index2Layer : IndexFlatCodes {
|
28
29
|
/// first level quantizer
|
29
30
|
Level1Quantizer q1;
|
30
31
|
|
31
32
|
/// second level quantizer is always a PQ
|
32
33
|
ProductQuantizer pq;
|
33
34
|
|
34
|
-
/// Codes. Size ntotal * code_size.
|
35
|
-
std::vector<uint8_t> codes;
|
36
|
-
|
37
35
|
/// size of the code for the first level (ceil(log8(q1.nlist)))
|
38
36
|
size_t code_size_1;
|
39
37
|
|
40
38
|
/// size of the code for the second level
|
41
39
|
size_t code_size_2;
|
42
40
|
|
43
|
-
/// code_size_1 + code_size_2
|
44
|
-
size_t code_size;
|
45
|
-
|
46
41
|
Index2Layer(
|
47
42
|
Index* quantizer,
|
48
43
|
size_t nlist,
|
@@ -55,8 +50,6 @@ struct Index2Layer : Index {
|
|
55
50
|
|
56
51
|
void train(idx_t n, const float* x) override;
|
57
52
|
|
58
|
-
void add(idx_t n, const float* x) override;
|
59
|
-
|
60
53
|
/// not implemented
|
61
54
|
void search(
|
62
55
|
idx_t n,
|
@@ -65,19 +58,12 @@ struct Index2Layer : Index {
|
|
65
58
|
float* distances,
|
66
59
|
idx_t* labels) const override;
|
67
60
|
|
68
|
-
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
|
69
|
-
|
70
|
-
void reconstruct(idx_t key, float* recons) const override;
|
71
|
-
|
72
|
-
void reset() override;
|
73
|
-
|
74
61
|
DistanceComputer* get_distance_computer() const override;
|
75
62
|
|
76
63
|
/// transfer the flat codes to an IVFPQ index
|
77
64
|
void transfer_to_IVFPQ(IndexIVFPQ& other) const;
|
78
65
|
|
79
66
|
/* The standalone codec interface */
|
80
|
-
size_t sa_code_size() const override;
|
81
67
|
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
|
82
68
|
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
|
83
69
|
};
|