faiss 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/Clustering.cpp +32 -0
- data/vendor/faiss/faiss/Clustering.h +14 -0
- data/vendor/faiss/faiss/Index.h +1 -1
- data/vendor/faiss/faiss/Index2Layer.cpp +19 -92
- data/vendor/faiss/faiss/Index2Layer.h +2 -16
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
- data/vendor/faiss/faiss/{IndexResidual.h → IndexAdditiveQuantizer.h} +101 -58
- data/vendor/faiss/faiss/IndexFlat.cpp +22 -52
- data/vendor/faiss/faiss/IndexFlat.h +9 -15
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
- data/vendor/faiss/faiss/IndexIVF.cpp +79 -7
- data/vendor/faiss/faiss/IndexIVF.h +25 -7
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +9 -12
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +5 -4
- data/vendor/faiss/faiss/IndexIVFPQ.h +1 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +60 -39
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +21 -6
- data/vendor/faiss/faiss/IndexLSH.cpp +4 -30
- data/vendor/faiss/faiss/IndexLSH.h +2 -15
- data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +0 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +2 -51
- data/vendor/faiss/faiss/IndexPQ.h +2 -17
- data/vendor/faiss/faiss/IndexRefine.cpp +28 -0
- data/vendor/faiss/faiss/IndexRefine.h +10 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -28
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -16
- data/vendor/faiss/faiss/VectorTransform.cpp +2 -1
- data/vendor/faiss/faiss/VectorTransform.h +3 -0
- data/vendor/faiss/faiss/clone_index.cpp +3 -2
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -2
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +257 -24
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +69 -9
- data/vendor/faiss/faiss/impl/HNSW.cpp +10 -5
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +393 -210
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +100 -28
- data/vendor/faiss/faiss/impl/NSG.cpp +0 -3
- data/vendor/faiss/faiss/impl/NSG.h +1 -1
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +357 -47
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +65 -7
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +12 -19
- data/vendor/faiss/faiss/impl/index_read.cpp +102 -19
- data/vendor/faiss/faiss/impl/index_write.cpp +66 -16
- data/vendor/faiss/faiss/impl/io.cpp +1 -1
- data/vendor/faiss/faiss/impl/io_macros.h +20 -0
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
- data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
- data/vendor/faiss/faiss/index_factory.cpp +585 -414
- data/vendor/faiss/faiss/index_factory.h +3 -0
- data/vendor/faiss/faiss/utils/distances.cpp +4 -2
- data/vendor/faiss/faiss/utils/distances.h +36 -3
- data/vendor/faiss/faiss/utils/distances_simd.cpp +50 -0
- data/vendor/faiss/faiss/utils/utils.h +1 -1
- metadata +12 -5
- data/vendor/faiss/faiss/IndexResidual.cpp +0 -291
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15428eb8dd7d27f8a94e3a7797dd765827e5454def33fa785055adb7ef0d20c5
|
4
|
+
data.tar.gz: 3e8eafebc49163c928bcab8d0ebd0f7b69e6659e49f36d7aaeb44e8651853ac9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 598f6e626d5c970e408cff68ec479bf1aa2d6ee18adeeeb2489d1c4fbf627dacbc6e398ce149f0483720080a439df4d2887e5b6c9dc9f465e8ffa1bbeede84a8
|
7
|
+
data.tar.gz: fefebfbbbbceb58ac6c6b02636630943d30db7af4ee92521586b4fb40e73bf1aa2b401e28e4c27872bf9344f60e3c9fb288ec6420abc1ffccd05ffd3ec7379fd
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
MIT License
|
2
2
|
|
3
3
|
Copyright (c) Facebook, Inc. and its affiliates.
|
4
|
-
Copyright (c) 2020-
|
4
|
+
Copyright (c) 2020-2022 Andrew Kane
|
5
5
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/lib/faiss/version.rb
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
|
21
21
|
#include <faiss/IndexFlat.h>
|
22
22
|
#include <faiss/impl/FaissAssert.h>
|
23
|
+
#include <faiss/impl/kmeans1d.h>
|
23
24
|
#include <faiss/utils/distances.h>
|
24
25
|
#include <faiss/utils/random.h>
|
25
26
|
#include <faiss/utils/utils.h>
|
@@ -553,6 +554,37 @@ void Clustering::train_encoded(
|
|
553
554
|
}
|
554
555
|
}
|
555
556
|
|
557
|
+
Clustering1D::Clustering1D(int k) : Clustering(1, k) {}
|
558
|
+
|
559
|
+
Clustering1D::Clustering1D(int k, const ClusteringParameters& cp)
|
560
|
+
: Clustering(1, k, cp) {}
|
561
|
+
|
562
|
+
void Clustering1D::train_exact(idx_t n, const float* x) {
|
563
|
+
const float* xt = x;
|
564
|
+
|
565
|
+
std::unique_ptr<uint8_t[]> del;
|
566
|
+
if (n > k * max_points_per_centroid) {
|
567
|
+
uint8_t* x_new;
|
568
|
+
float* weights_new;
|
569
|
+
n = subsample_training_set(
|
570
|
+
*this,
|
571
|
+
n,
|
572
|
+
(uint8_t*)x,
|
573
|
+
sizeof(float) * d,
|
574
|
+
nullptr,
|
575
|
+
&x_new,
|
576
|
+
&weights_new);
|
577
|
+
del.reset(x_new);
|
578
|
+
xt = (float*)x_new;
|
579
|
+
}
|
580
|
+
|
581
|
+
centroids.resize(k);
|
582
|
+
double uf = kmeans1d(xt, n, k, centroids.data());
|
583
|
+
|
584
|
+
ClusteringIterationStats stats = {0.0, 0.0, 0.0, uf, 0};
|
585
|
+
iteration_stats.push_back(stats);
|
586
|
+
}
|
587
|
+
|
556
588
|
float kmeans_clustering(
|
557
589
|
size_t d,
|
558
590
|
size_t n,
|
@@ -111,6 +111,20 @@ struct Clustering : ClusteringParameters {
|
|
111
111
|
virtual ~Clustering() {}
|
112
112
|
};
|
113
113
|
|
114
|
+
/** Exact 1D clustering algorithm
|
115
|
+
*
|
116
|
+
* Since it does not use an index, it does not overload the train() function
|
117
|
+
*/
|
118
|
+
struct Clustering1D : Clustering {
|
119
|
+
explicit Clustering1D(int k);
|
120
|
+
|
121
|
+
Clustering1D(int k, const ClusteringParameters& cp);
|
122
|
+
|
123
|
+
void train_exact(idx_t n, const float* x);
|
124
|
+
|
125
|
+
virtual ~Clustering1D() {}
|
126
|
+
};
|
127
|
+
|
114
128
|
struct ProgressiveDimClusteringParameters : ClusteringParameters {
|
115
129
|
int progressive_dim_steps; ///< number of incremental steps
|
116
130
|
bool apply_pca; ///< apply PCA on input
|
data/vendor/faiss/faiss/Index.h
CHANGED
@@ -30,16 +30,6 @@
|
|
30
30
|
#include <faiss/utils/distances.h>
|
31
31
|
#include <faiss/utils/utils.h>
|
32
32
|
|
33
|
-
/*
|
34
|
-
#include <faiss/utils/Heap.h>
|
35
|
-
|
36
|
-
#include <faiss/Clustering.h>
|
37
|
-
|
38
|
-
#include <faiss/utils/hamming.h>
|
39
|
-
|
40
|
-
|
41
|
-
*/
|
42
|
-
|
43
33
|
namespace faiss {
|
44
34
|
|
45
35
|
/*************************************
|
@@ -52,7 +42,7 @@ Index2Layer::Index2Layer(
|
|
52
42
|
int M,
|
53
43
|
int nbit,
|
54
44
|
MetricType metric)
|
55
|
-
:
|
45
|
+
: IndexFlatCodes(0, quantizer->d, metric),
|
56
46
|
q1(quantizer, nlist),
|
57
47
|
pq(quantizer->d, M, nbit) {
|
58
48
|
is_trained = false;
|
@@ -116,55 +106,6 @@ void Index2Layer::train(idx_t n, const float* x) {
|
|
116
106
|
is_trained = true;
|
117
107
|
}
|
118
108
|
|
119
|
-
void Index2Layer::add(idx_t n, const float* x) {
|
120
|
-
idx_t bs = 32768;
|
121
|
-
if (n > bs) {
|
122
|
-
for (idx_t i0 = 0; i0 < n; i0 += bs) {
|
123
|
-
idx_t i1 = std::min(i0 + bs, n);
|
124
|
-
if (verbose) {
|
125
|
-
printf("Index2Layer::add: adding %" PRId64 ":%" PRId64
|
126
|
-
" / %" PRId64 "\n",
|
127
|
-
i0,
|
128
|
-
i1,
|
129
|
-
n);
|
130
|
-
}
|
131
|
-
add(i1 - i0, x + i0 * d);
|
132
|
-
}
|
133
|
-
return;
|
134
|
-
}
|
135
|
-
|
136
|
-
std::vector<idx_t> codes1(n);
|
137
|
-
q1.quantizer->assign(n, x, codes1.data());
|
138
|
-
std::vector<float> residuals(n * d);
|
139
|
-
for (idx_t i = 0; i < n; i++) {
|
140
|
-
q1.quantizer->compute_residual(
|
141
|
-
x + i * d, residuals.data() + i * d, codes1[i]);
|
142
|
-
}
|
143
|
-
std::vector<uint8_t> codes2(n * code_size_2);
|
144
|
-
|
145
|
-
pq.compute_codes(residuals.data(), codes2.data(), n);
|
146
|
-
|
147
|
-
codes.resize((ntotal + n) * code_size);
|
148
|
-
uint8_t* wp = &codes[ntotal * code_size];
|
149
|
-
|
150
|
-
{
|
151
|
-
int i = 0x11223344;
|
152
|
-
const char* ip = (char*)&i;
|
153
|
-
FAISS_THROW_IF_NOT_MSG(
|
154
|
-
ip[0] == 0x44, "works only on a little-endian CPU");
|
155
|
-
}
|
156
|
-
|
157
|
-
// copy to output table
|
158
|
-
for (idx_t i = 0; i < n; i++) {
|
159
|
-
memcpy(wp, &codes1[i], code_size_1);
|
160
|
-
wp += code_size_1;
|
161
|
-
memcpy(wp, &codes2[i * code_size_2], code_size_2);
|
162
|
-
wp += code_size_2;
|
163
|
-
}
|
164
|
-
|
165
|
-
ntotal += n;
|
166
|
-
}
|
167
|
-
|
168
109
|
void Index2Layer::search(
|
169
110
|
idx_t /*n*/,
|
170
111
|
const float* /*x*/,
|
@@ -174,25 +115,6 @@ void Index2Layer::search(
|
|
174
115
|
FAISS_THROW_MSG("not implemented");
|
175
116
|
}
|
176
117
|
|
177
|
-
void Index2Layer::reconstruct_n(idx_t i0, idx_t ni, float* recons) const {
|
178
|
-
std::vector<float> recons1(d);
|
179
|
-
FAISS_THROW_IF_NOT(i0 >= 0 && i0 + ni <= ntotal);
|
180
|
-
const uint8_t* rp = &codes[i0 * code_size];
|
181
|
-
|
182
|
-
for (idx_t i = 0; i < ni; i++) {
|
183
|
-
idx_t key = 0;
|
184
|
-
memcpy(&key, rp, code_size_1);
|
185
|
-
q1.quantizer->reconstruct(key, recons1.data());
|
186
|
-
rp += code_size_1;
|
187
|
-
pq.decode(rp, recons);
|
188
|
-
for (idx_t j = 0; j < d; j++) {
|
189
|
-
recons[j] += recons1[j];
|
190
|
-
}
|
191
|
-
rp += code_size_2;
|
192
|
-
recons += d;
|
193
|
-
}
|
194
|
-
}
|
195
|
-
|
196
118
|
void Index2Layer::transfer_to_IVFPQ(IndexIVFPQ& other) const {
|
197
119
|
FAISS_THROW_IF_NOT(other.nlist == q1.nlist);
|
198
120
|
FAISS_THROW_IF_NOT(other.code_size == code_size_2);
|
@@ -211,15 +133,6 @@ void Index2Layer::transfer_to_IVFPQ(IndexIVFPQ& other) const {
|
|
211
133
|
other.ntotal = ntotal;
|
212
134
|
}
|
213
135
|
|
214
|
-
void Index2Layer::reconstruct(idx_t key, float* recons) const {
|
215
|
-
reconstruct_n(key, 1, recons);
|
216
|
-
}
|
217
|
-
|
218
|
-
void Index2Layer::reset() {
|
219
|
-
ntotal = 0;
|
220
|
-
codes.clear();
|
221
|
-
}
|
222
|
-
|
223
136
|
namespace {
|
224
137
|
|
225
138
|
struct Distance2Level : DistanceComputer {
|
@@ -259,7 +172,7 @@ struct DistanceXPQ4 : Distance2Level {
|
|
259
172
|
|
260
173
|
FAISS_ASSERT(quantizer);
|
261
174
|
M = storage.pq.M;
|
262
|
-
pq_l1_tab = quantizer->
|
175
|
+
pq_l1_tab = quantizer->get_xb();
|
263
176
|
}
|
264
177
|
|
265
178
|
float operator()(idx_t i) override {
|
@@ -368,12 +281,26 @@ DistanceComputer* Index2Layer::get_distance_computer() const {
|
|
368
281
|
}
|
369
282
|
|
370
283
|
/* The standalone codec interface */
|
371
|
-
size_t Index2Layer::sa_code_size() const {
|
372
|
-
return code_size;
|
373
|
-
}
|
374
284
|
|
375
285
|
void Index2Layer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
|
376
286
|
FAISS_THROW_IF_NOT(is_trained);
|
287
|
+
|
288
|
+
idx_t bs = 32768;
|
289
|
+
if (n > bs) {
|
290
|
+
for (idx_t i0 = 0; i0 < n; i0 += bs) {
|
291
|
+
idx_t i1 = std::min(i0 + bs, n);
|
292
|
+
if (verbose) {
|
293
|
+
printf("Index2Layer::add: adding %" PRId64 ":%" PRId64
|
294
|
+
" / %" PRId64 "\n",
|
295
|
+
i0,
|
296
|
+
i1,
|
297
|
+
n);
|
298
|
+
}
|
299
|
+
sa_encode(i1 - i0, x + i0 * d, bytes + i0 * code_size);
|
300
|
+
}
|
301
|
+
return;
|
302
|
+
}
|
303
|
+
|
377
304
|
std::unique_ptr<int64_t[]> list_nos(new int64_t[n]);
|
378
305
|
q1.quantizer->assign(n, x, list_nos.get());
|
379
306
|
std::vector<float> residuals(n * d);
|
@@ -11,6 +11,7 @@
|
|
11
11
|
|
12
12
|
#include <vector>
|
13
13
|
|
14
|
+
#include <faiss/IndexFlatCodes.h>
|
14
15
|
#include <faiss/IndexIVF.h>
|
15
16
|
#include <faiss/IndexPQ.h>
|
16
17
|
|
@@ -24,25 +25,19 @@ struct IndexIVFPQ;
|
|
24
25
|
* The class is mainly intended to store encoded vectors that can be
|
25
26
|
* accessed randomly, the search function is not implemented.
|
26
27
|
*/
|
27
|
-
struct Index2Layer :
|
28
|
+
struct Index2Layer : IndexFlatCodes {
|
28
29
|
/// first level quantizer
|
29
30
|
Level1Quantizer q1;
|
30
31
|
|
31
32
|
/// second level quantizer is always a PQ
|
32
33
|
ProductQuantizer pq;
|
33
34
|
|
34
|
-
/// Codes. Size ntotal * code_size.
|
35
|
-
std::vector<uint8_t> codes;
|
36
|
-
|
37
35
|
/// size of the code for the first level (ceil(log8(q1.nlist)))
|
38
36
|
size_t code_size_1;
|
39
37
|
|
40
38
|
/// size of the code for the second level
|
41
39
|
size_t code_size_2;
|
42
40
|
|
43
|
-
/// code_size_1 + code_size_2
|
44
|
-
size_t code_size;
|
45
|
-
|
46
41
|
Index2Layer(
|
47
42
|
Index* quantizer,
|
48
43
|
size_t nlist,
|
@@ -55,8 +50,6 @@ struct Index2Layer : Index {
|
|
55
50
|
|
56
51
|
void train(idx_t n, const float* x) override;
|
57
52
|
|
58
|
-
void add(idx_t n, const float* x) override;
|
59
|
-
|
60
53
|
/// not implemented
|
61
54
|
void search(
|
62
55
|
idx_t n,
|
@@ -65,19 +58,12 @@ struct Index2Layer : Index {
|
|
65
58
|
float* distances,
|
66
59
|
idx_t* labels) const override;
|
67
60
|
|
68
|
-
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
|
69
|
-
|
70
|
-
void reconstruct(idx_t key, float* recons) const override;
|
71
|
-
|
72
|
-
void reset() override;
|
73
|
-
|
74
61
|
DistanceComputer* get_distance_computer() const override;
|
75
62
|
|
76
63
|
/// transfer the flat codes to an IVFPQ index
|
77
64
|
void transfer_to_IVFPQ(IndexIVFPQ& other) const;
|
78
65
|
|
79
66
|
/* The standalone codec interface */
|
80
|
-
size_t sa_code_size() const override;
|
81
67
|
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
|
82
68
|
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
|
83
69
|
};
|