faiss 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +20 -2
|
@@ -15,79 +15,67 @@ namespace faiss {
|
|
|
15
15
|
* Random data generation functions
|
|
16
16
|
**************************************************/
|
|
17
17
|
|
|
18
|
-
RandomGenerator::RandomGenerator
|
|
19
|
-
: mt((unsigned int)seed) {}
|
|
18
|
+
RandomGenerator::RandomGenerator(int64_t seed) : mt((unsigned int)seed) {}
|
|
20
19
|
|
|
21
|
-
int RandomGenerator::rand_int
|
|
22
|
-
{
|
|
20
|
+
int RandomGenerator::rand_int() {
|
|
23
21
|
return mt() & 0x7fffffff;
|
|
24
22
|
}
|
|
25
23
|
|
|
26
|
-
int64_t RandomGenerator::rand_int64
|
|
27
|
-
{
|
|
24
|
+
int64_t RandomGenerator::rand_int64() {
|
|
28
25
|
return int64_t(rand_int()) | int64_t(rand_int()) << 31;
|
|
29
26
|
}
|
|
30
27
|
|
|
31
|
-
int RandomGenerator::rand_int
|
|
32
|
-
{
|
|
28
|
+
int RandomGenerator::rand_int(int max) {
|
|
33
29
|
return mt() % max;
|
|
34
30
|
}
|
|
35
31
|
|
|
36
|
-
float RandomGenerator::rand_float
|
|
37
|
-
{
|
|
32
|
+
float RandomGenerator::rand_float() {
|
|
38
33
|
return mt() / float(mt.max());
|
|
39
34
|
}
|
|
40
35
|
|
|
41
|
-
double RandomGenerator::rand_double
|
|
42
|
-
{
|
|
36
|
+
double RandomGenerator::rand_double() {
|
|
43
37
|
return mt() / double(mt.max());
|
|
44
38
|
}
|
|
45
39
|
|
|
46
|
-
|
|
47
40
|
/***********************************************************************
|
|
48
41
|
* Random functions in this C file only exist because Torch
|
|
49
42
|
* counterparts are slow and not multi-threaded. Typical use is for
|
|
50
43
|
* more than 1-100 billion values. */
|
|
51
44
|
|
|
52
|
-
|
|
53
45
|
/* Generate a set of random floating point values such that x[i] in [0,1]
|
|
54
46
|
multi-threading. For this reason, we rely on re-entreant functions. */
|
|
55
|
-
void float_rand
|
|
56
|
-
{
|
|
47
|
+
void float_rand(float* x, size_t n, int64_t seed) {
|
|
57
48
|
// only try to parallelize on large enough arrays
|
|
58
49
|
const size_t nblock = n < 1024 ? 1 : 1024;
|
|
59
50
|
|
|
60
|
-
RandomGenerator rng0
|
|
61
|
-
int a0 = rng0.rand_int
|
|
51
|
+
RandomGenerator rng0(seed);
|
|
52
|
+
int a0 = rng0.rand_int(), b0 = rng0.rand_int();
|
|
62
53
|
|
|
63
54
|
#pragma omp parallel for
|
|
64
55
|
for (int64_t j = 0; j < nblock; j++) {
|
|
65
|
-
|
|
66
|
-
RandomGenerator rng (a0 + j * b0);
|
|
56
|
+
RandomGenerator rng(a0 + j * b0);
|
|
67
57
|
|
|
68
58
|
const size_t istart = j * n / nblock;
|
|
69
59
|
const size_t iend = (j + 1) * n / nblock;
|
|
70
60
|
|
|
71
61
|
for (size_t i = istart; i < iend; i++)
|
|
72
|
-
x[i] = rng.rand_float
|
|
62
|
+
x[i] = rng.rand_float();
|
|
73
63
|
}
|
|
74
64
|
}
|
|
75
65
|
|
|
76
|
-
|
|
77
|
-
void float_randn (float * x, size_t n, int64_t seed)
|
|
78
|
-
{
|
|
66
|
+
void float_randn(float* x, size_t n, int64_t seed) {
|
|
79
67
|
// only try to parallelize on large enough arrays
|
|
80
68
|
const size_t nblock = n < 1024 ? 1 : 1024;
|
|
81
69
|
|
|
82
|
-
RandomGenerator rng0
|
|
83
|
-
int a0 = rng0.rand_int
|
|
70
|
+
RandomGenerator rng0(seed);
|
|
71
|
+
int a0 = rng0.rand_int(), b0 = rng0.rand_int();
|
|
84
72
|
|
|
85
73
|
#pragma omp parallel for
|
|
86
74
|
for (int64_t j = 0; j < nblock; j++) {
|
|
87
|
-
RandomGenerator rng
|
|
75
|
+
RandomGenerator rng(a0 + j * b0);
|
|
88
76
|
|
|
89
77
|
double a = 0, b = 0, s = 0;
|
|
90
|
-
int state = 0;
|
|
78
|
+
int state = 0; /* generate two number per "do-while" loop */
|
|
91
79
|
|
|
92
80
|
const size_t istart = j * n / nblock;
|
|
93
81
|
const size_t iend = (j + 1) * n / nblock;
|
|
@@ -96,96 +84,84 @@ void float_randn (float * x, size_t n, int64_t seed)
|
|
|
96
84
|
/* Marsaglia's method (see Knuth) */
|
|
97
85
|
if (state == 0) {
|
|
98
86
|
do {
|
|
99
|
-
a = 2.0 * rng.rand_double
|
|
100
|
-
b = 2.0 * rng.rand_double
|
|
87
|
+
a = 2.0 * rng.rand_double() - 1;
|
|
88
|
+
b = 2.0 * rng.rand_double() - 1;
|
|
101
89
|
s = a * a + b * b;
|
|
102
90
|
} while (s >= 1.0);
|
|
103
91
|
x[i] = a * sqrt(-2.0 * log(s) / s);
|
|
104
|
-
}
|
|
105
|
-
else
|
|
92
|
+
} else
|
|
106
93
|
x[i] = b * sqrt(-2.0 * log(s) / s);
|
|
107
94
|
state = 1 - state;
|
|
108
95
|
}
|
|
109
96
|
}
|
|
110
97
|
}
|
|
111
98
|
|
|
112
|
-
|
|
113
99
|
/* Integer versions */
|
|
114
|
-
void int64_rand
|
|
115
|
-
{
|
|
100
|
+
void int64_rand(int64_t* x, size_t n, int64_t seed) {
|
|
116
101
|
// only try to parallelize on large enough arrays
|
|
117
102
|
const size_t nblock = n < 1024 ? 1 : 1024;
|
|
118
103
|
|
|
119
|
-
RandomGenerator rng0
|
|
120
|
-
int a0 = rng0.rand_int
|
|
104
|
+
RandomGenerator rng0(seed);
|
|
105
|
+
int a0 = rng0.rand_int(), b0 = rng0.rand_int();
|
|
121
106
|
|
|
122
107
|
#pragma omp parallel for
|
|
123
108
|
for (int64_t j = 0; j < nblock; j++) {
|
|
124
|
-
|
|
125
|
-
RandomGenerator rng (a0 + j * b0);
|
|
109
|
+
RandomGenerator rng(a0 + j * b0);
|
|
126
110
|
|
|
127
111
|
const size_t istart = j * n / nblock;
|
|
128
112
|
const size_t iend = (j + 1) * n / nblock;
|
|
129
113
|
for (size_t i = istart; i < iend; i++)
|
|
130
|
-
x[i] = rng.rand_int64
|
|
114
|
+
x[i] = rng.rand_int64();
|
|
131
115
|
}
|
|
132
116
|
}
|
|
133
117
|
|
|
134
|
-
void int64_rand_max
|
|
135
|
-
{
|
|
118
|
+
void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed) {
|
|
136
119
|
// only try to parallelize on large enough arrays
|
|
137
120
|
const size_t nblock = n < 1024 ? 1 : 1024;
|
|
138
121
|
|
|
139
|
-
RandomGenerator rng0
|
|
140
|
-
int a0 = rng0.rand_int
|
|
122
|
+
RandomGenerator rng0(seed);
|
|
123
|
+
int a0 = rng0.rand_int(), b0 = rng0.rand_int();
|
|
141
124
|
|
|
142
125
|
#pragma omp parallel for
|
|
143
126
|
for (int64_t j = 0; j < nblock; j++) {
|
|
144
|
-
|
|
145
|
-
RandomGenerator rng (a0 + j * b0);
|
|
127
|
+
RandomGenerator rng(a0 + j * b0);
|
|
146
128
|
|
|
147
129
|
const size_t istart = j * n / nblock;
|
|
148
130
|
const size_t iend = (j + 1) * n / nblock;
|
|
149
131
|
for (size_t i = istart; i < iend; i++)
|
|
150
|
-
x[i] = rng.rand_int64
|
|
132
|
+
x[i] = rng.rand_int64() % max;
|
|
151
133
|
}
|
|
152
134
|
}
|
|
153
135
|
|
|
136
|
+
void rand_perm(int* perm, size_t n, int64_t seed) {
|
|
137
|
+
for (size_t i = 0; i < n; i++)
|
|
138
|
+
perm[i] = i;
|
|
154
139
|
|
|
155
|
-
|
|
156
|
-
{
|
|
157
|
-
for (size_t i = 0; i < n; i++) perm[i] = i;
|
|
158
|
-
|
|
159
|
-
RandomGenerator rng (seed);
|
|
140
|
+
RandomGenerator rng(seed);
|
|
160
141
|
|
|
161
142
|
for (size_t i = 0; i + 1 < n; i++) {
|
|
162
|
-
int i2 = i + rng.rand_int
|
|
143
|
+
int i2 = i + rng.rand_int(n - i);
|
|
163
144
|
std::swap(perm[i], perm[i2]);
|
|
164
145
|
}
|
|
165
146
|
}
|
|
166
147
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
void byte_rand (uint8_t * x, size_t n, int64_t seed)
|
|
171
|
-
{
|
|
148
|
+
void byte_rand(uint8_t* x, size_t n, int64_t seed) {
|
|
172
149
|
// only try to parallelize on large enough arrays
|
|
173
150
|
const size_t nblock = n < 1024 ? 1 : 1024;
|
|
174
151
|
|
|
175
|
-
RandomGenerator rng0
|
|
176
|
-
int a0 = rng0.rand_int
|
|
152
|
+
RandomGenerator rng0(seed);
|
|
153
|
+
int a0 = rng0.rand_int(), b0 = rng0.rand_int();
|
|
177
154
|
|
|
178
155
|
#pragma omp parallel for
|
|
179
156
|
for (int64_t j = 0; j < nblock; j++) {
|
|
180
|
-
|
|
181
|
-
RandomGenerator rng (a0 + j * b0);
|
|
157
|
+
RandomGenerator rng(a0 + j * b0);
|
|
182
158
|
|
|
183
159
|
const size_t istart = j * n / nblock;
|
|
184
160
|
const size_t iend = (j + 1) * n / nblock;
|
|
185
161
|
|
|
186
162
|
size_t i;
|
|
187
163
|
for (i = istart; i < iend; i++)
|
|
188
|
-
x[i] = rng.rand_int64
|
|
164
|
+
x[i] = rng.rand_int64();
|
|
189
165
|
}
|
|
190
166
|
}
|
|
191
167
|
|
|
@@ -13,9 +13,8 @@
|
|
|
13
13
|
|
|
14
14
|
#pragma once
|
|
15
15
|
|
|
16
|
-
#include <random>
|
|
17
16
|
#include <stdint.h>
|
|
18
|
-
|
|
17
|
+
#include <random>
|
|
19
18
|
|
|
20
19
|
namespace faiss {
|
|
21
20
|
|
|
@@ -25,36 +24,34 @@ namespace faiss {
|
|
|
25
24
|
|
|
26
25
|
/// random generator that can be used in multithreaded contexts
|
|
27
26
|
struct RandomGenerator {
|
|
28
|
-
|
|
29
27
|
std::mt19937 mt;
|
|
30
28
|
|
|
31
29
|
/// random positive integer
|
|
32
|
-
int rand_int
|
|
30
|
+
int rand_int();
|
|
33
31
|
|
|
34
32
|
/// random int64_t
|
|
35
|
-
int64_t rand_int64
|
|
33
|
+
int64_t rand_int64();
|
|
36
34
|
|
|
37
35
|
/// generate random integer between 0 and max-1
|
|
38
|
-
int rand_int
|
|
36
|
+
int rand_int(int max);
|
|
39
37
|
|
|
40
38
|
/// between 0 and 1
|
|
41
|
-
float rand_float
|
|
39
|
+
float rand_float();
|
|
42
40
|
|
|
43
|
-
double rand_double
|
|
41
|
+
double rand_double();
|
|
44
42
|
|
|
45
|
-
explicit RandomGenerator
|
|
43
|
+
explicit RandomGenerator(int64_t seed = 1234);
|
|
46
44
|
};
|
|
47
45
|
|
|
48
46
|
/* Generate an array of uniform random floats / multi-threaded implementation */
|
|
49
|
-
void float_rand
|
|
50
|
-
void float_randn
|
|
51
|
-
void int64_rand
|
|
52
|
-
void byte_rand
|
|
47
|
+
void float_rand(float* x, size_t n, int64_t seed);
|
|
48
|
+
void float_randn(float* x, size_t n, int64_t seed);
|
|
49
|
+
void int64_rand(int64_t* x, size_t n, int64_t seed);
|
|
50
|
+
void byte_rand(uint8_t* x, size_t n, int64_t seed);
|
|
53
51
|
// max is actually the maximum value + 1
|
|
54
|
-
void int64_rand_max
|
|
52
|
+
void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed);
|
|
55
53
|
|
|
56
54
|
/* random permutation */
|
|
57
|
-
void rand_perm
|
|
58
|
-
|
|
55
|
+
void rand_perm(int* perm, size_t n, int64_t seed);
|
|
59
56
|
|
|
60
57
|
} // namespace faiss
|
|
@@ -7,8 +7,6 @@
|
|
|
7
7
|
|
|
8
8
|
#pragma once
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
12
10
|
/** Abstractions for 256-bit registers
|
|
13
11
|
*
|
|
14
12
|
* The objective is to separate the different interpretations of the same
|
|
@@ -20,6 +18,10 @@
|
|
|
20
18
|
|
|
21
19
|
#include <faiss/utils/simdlib_avx2.h>
|
|
22
20
|
|
|
21
|
+
#elif defined(__aarch64__)
|
|
22
|
+
|
|
23
|
+
#include <faiss/utils/simdlib_neon.h>
|
|
24
|
+
|
|
23
25
|
#else
|
|
24
26
|
|
|
25
27
|
// emulated = all operations are implemented as scalars
|
|
@@ -7,8 +7,8 @@
|
|
|
7
7
|
|
|
8
8
|
#pragma once
|
|
9
9
|
|
|
10
|
-
#include <string>
|
|
11
10
|
#include <cstdint>
|
|
11
|
+
#include <string>
|
|
12
12
|
|
|
13
13
|
#include <immintrin.h>
|
|
14
14
|
|
|
@@ -16,7 +16,6 @@
|
|
|
16
16
|
|
|
17
17
|
namespace faiss {
|
|
18
18
|
|
|
19
|
-
|
|
20
19
|
/** Simple wrapper around the AVX 256-bit registers
|
|
21
20
|
*
|
|
22
21
|
* The objective is to separate the different interpretations of the same
|
|
@@ -27,36 +26,34 @@ namespace faiss {
|
|
|
27
26
|
|
|
28
27
|
/// 256-bit representation without interpretation as a vector
|
|
29
28
|
struct simd256bit {
|
|
30
|
-
|
|
31
|
-
union {
|
|
29
|
+
union {
|
|
32
30
|
__m256i i;
|
|
33
31
|
__m256 f;
|
|
34
32
|
};
|
|
35
33
|
|
|
36
|
-
simd256bit()
|
|
34
|
+
simd256bit() {}
|
|
37
35
|
|
|
38
|
-
explicit simd256bit(__m256i i): i(i) {}
|
|
36
|
+
explicit simd256bit(__m256i i) : i(i) {}
|
|
39
37
|
|
|
40
|
-
explicit simd256bit(__m256 f): f(f) {}
|
|
38
|
+
explicit simd256bit(__m256 f) : f(f) {}
|
|
41
39
|
|
|
42
|
-
explicit simd256bit(const void
|
|
43
|
-
|
|
44
|
-
{}
|
|
40
|
+
explicit simd256bit(const void* x)
|
|
41
|
+
: i(_mm256_load_si256((__m256i const*)x)) {}
|
|
45
42
|
|
|
46
43
|
void clear() {
|
|
47
44
|
i = _mm256_setzero_si256();
|
|
48
45
|
}
|
|
49
46
|
|
|
50
|
-
void storeu(void
|
|
51
|
-
_mm256_storeu_si256((__m256i
|
|
47
|
+
void storeu(void* ptr) const {
|
|
48
|
+
_mm256_storeu_si256((__m256i*)ptr, i);
|
|
52
49
|
}
|
|
53
50
|
|
|
54
|
-
void loadu(const void
|
|
51
|
+
void loadu(const void* ptr) {
|
|
55
52
|
i = _mm256_loadu_si256((__m256i*)ptr);
|
|
56
53
|
}
|
|
57
54
|
|
|
58
|
-
void store(void
|
|
59
|
-
_mm256_store_si256((__m256i
|
|
55
|
+
void store(void* ptr) const {
|
|
56
|
+
_mm256_store_si256((__m256i*)ptr, i);
|
|
60
57
|
}
|
|
61
58
|
|
|
62
59
|
void bin(char bits[257]) const {
|
|
@@ -73,30 +70,28 @@ struct simd256bit {
|
|
|
73
70
|
bin(bits);
|
|
74
71
|
return std::string(bits);
|
|
75
72
|
}
|
|
76
|
-
|
|
77
73
|
};
|
|
78
74
|
|
|
79
|
-
|
|
80
75
|
/// vector of 16 elements in uint16
|
|
81
|
-
struct simd16uint16: simd256bit {
|
|
76
|
+
struct simd16uint16 : simd256bit {
|
|
82
77
|
simd16uint16() {}
|
|
83
78
|
|
|
84
|
-
explicit simd16uint16(__m256i i): simd256bit(i) {}
|
|
79
|
+
explicit simd16uint16(__m256i i) : simd256bit(i) {}
|
|
85
80
|
|
|
86
|
-
explicit simd16uint16(int x): simd256bit(_mm256_set1_epi16(x)) {}
|
|
81
|
+
explicit simd16uint16(int x) : simd256bit(_mm256_set1_epi16(x)) {}
|
|
87
82
|
|
|
88
|
-
explicit simd16uint16(uint16_t x): simd256bit(_mm256_set1_epi16(x)) {}
|
|
83
|
+
explicit simd16uint16(uint16_t x) : simd256bit(_mm256_set1_epi16(x)) {}
|
|
89
84
|
|
|
90
|
-
explicit simd16uint16(simd256bit x): simd256bit(x) {}
|
|
85
|
+
explicit simd16uint16(simd256bit x) : simd256bit(x) {}
|
|
91
86
|
|
|
92
|
-
explicit simd16uint16(const uint16_t
|
|
87
|
+
explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {}
|
|
93
88
|
|
|
94
|
-
std::string elements_to_string(const char
|
|
89
|
+
std::string elements_to_string(const char* fmt) const {
|
|
95
90
|
uint16_t bytes[16];
|
|
96
91
|
storeu((void*)bytes);
|
|
97
92
|
char res[1000];
|
|
98
|
-
char
|
|
99
|
-
for(int i = 0; i < 16; i++) {
|
|
93
|
+
char* ptr = res;
|
|
94
|
+
for (int i = 0; i < 16; i++) {
|
|
100
95
|
ptr += sprintf(ptr, fmt, bytes[i]);
|
|
101
96
|
}
|
|
102
97
|
// strip last ,
|
|
@@ -117,47 +112,47 @@ struct simd16uint16: simd256bit {
|
|
|
117
112
|
}
|
|
118
113
|
|
|
119
114
|
// shift must be known at compile time
|
|
120
|
-
simd16uint16 operator
|
|
115
|
+
simd16uint16 operator>>(const int shift) const {
|
|
121
116
|
return simd16uint16(_mm256_srli_epi16(i, shift));
|
|
122
117
|
}
|
|
123
118
|
|
|
124
119
|
// shift must be known at compile time
|
|
125
|
-
simd16uint16 operator
|
|
120
|
+
simd16uint16 operator<<(const int shift) const {
|
|
126
121
|
return simd16uint16(_mm256_slli_epi16(i, shift));
|
|
127
122
|
}
|
|
128
123
|
|
|
129
|
-
simd16uint16 operator
|
|
124
|
+
simd16uint16 operator+=(simd16uint16 other) {
|
|
130
125
|
i = _mm256_add_epi16(i, other.i);
|
|
131
126
|
return *this;
|
|
132
127
|
}
|
|
133
128
|
|
|
134
|
-
simd16uint16 operator
|
|
129
|
+
simd16uint16 operator-=(simd16uint16 other) {
|
|
135
130
|
i = _mm256_sub_epi16(i, other.i);
|
|
136
131
|
return *this;
|
|
137
132
|
}
|
|
138
133
|
|
|
139
|
-
simd16uint16 operator
|
|
134
|
+
simd16uint16 operator+(simd16uint16 other) const {
|
|
140
135
|
return simd16uint16(_mm256_add_epi16(i, other.i));
|
|
141
136
|
}
|
|
142
137
|
|
|
143
|
-
simd16uint16 operator
|
|
138
|
+
simd16uint16 operator-(simd16uint16 other) const {
|
|
144
139
|
return simd16uint16(_mm256_sub_epi16(i, other.i));
|
|
145
140
|
}
|
|
146
141
|
|
|
147
|
-
simd16uint16 operator
|
|
142
|
+
simd16uint16 operator&(simd256bit other) const {
|
|
148
143
|
return simd16uint16(_mm256_and_si256(i, other.i));
|
|
149
144
|
}
|
|
150
145
|
|
|
151
|
-
simd16uint16 operator
|
|
146
|
+
simd16uint16 operator|(simd256bit other) const {
|
|
152
147
|
return simd16uint16(_mm256_or_si256(i, other.i));
|
|
153
148
|
}
|
|
154
149
|
|
|
155
150
|
// returns binary masks
|
|
156
|
-
simd16uint16 operator
|
|
151
|
+
simd16uint16 operator==(simd256bit other) const {
|
|
157
152
|
return simd16uint16(_mm256_cmpeq_epi16(i, other.i));
|
|
158
153
|
}
|
|
159
154
|
|
|
160
|
-
simd16uint16 operator
|
|
155
|
+
simd16uint16 operator~() const {
|
|
161
156
|
return simd16uint16(_mm256_xor_si256(i, _mm256_set1_epi32(-1)));
|
|
162
157
|
}
|
|
163
158
|
|
|
@@ -188,7 +183,7 @@ struct simd16uint16: simd256bit {
|
|
|
188
183
|
}
|
|
189
184
|
|
|
190
185
|
// for debugging only
|
|
191
|
-
uint16_t operator
|
|
186
|
+
uint16_t operator[](int i) const {
|
|
192
187
|
ALIGNED(32) uint16_t tab[16];
|
|
193
188
|
store(tab);
|
|
194
189
|
return tab[i];
|
|
@@ -201,7 +196,6 @@ struct simd16uint16: simd256bit {
|
|
|
201
196
|
void accu_max(simd16uint16 incoming) {
|
|
202
197
|
i = _mm256_max_epu16(i, incoming.i);
|
|
203
198
|
}
|
|
204
|
-
|
|
205
199
|
};
|
|
206
200
|
|
|
207
201
|
// not really a std::min because it returns an elementwise min
|
|
@@ -213,13 +207,10 @@ inline simd16uint16 max(simd16uint16 a, simd16uint16 b) {
|
|
|
213
207
|
return simd16uint16(_mm256_max_epu16(a.i, b.i));
|
|
214
208
|
}
|
|
215
209
|
|
|
216
|
-
|
|
217
|
-
|
|
218
210
|
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
|
|
219
211
|
// return (a0 + a1, b0 + b1)
|
|
220
212
|
// TODO find a better name
|
|
221
213
|
inline simd16uint16 combine2x2(simd16uint16 a, simd16uint16 b) {
|
|
222
|
-
|
|
223
214
|
__m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21);
|
|
224
215
|
__m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0);
|
|
225
216
|
|
|
@@ -229,7 +220,6 @@ inline simd16uint16 combine2x2(simd16uint16 a, simd16uint16 b) {
|
|
|
229
220
|
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
|
|
230
221
|
// of d0 and d1 with thr
|
|
231
222
|
inline uint32_t cmp_ge32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
|
|
232
|
-
|
|
233
223
|
__m256i max0 = _mm256_max_epu16(d0.i, thr.i);
|
|
234
224
|
__m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
|
|
235
225
|
|
|
@@ -245,9 +235,7 @@ inline uint32_t cmp_ge32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
|
|
|
245
235
|
return ge;
|
|
246
236
|
}
|
|
247
237
|
|
|
248
|
-
|
|
249
238
|
inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
|
|
250
|
-
|
|
251
239
|
__m256i max0 = _mm256_min_epu16(d0.i, thr.i);
|
|
252
240
|
__m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
|
|
253
241
|
|
|
@@ -263,29 +251,26 @@ inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
|
|
|
263
251
|
return ge;
|
|
264
252
|
}
|
|
265
253
|
|
|
266
|
-
|
|
267
254
|
// vector of 32 unsigned 8-bit integers
|
|
268
|
-
struct simd32uint8: simd256bit {
|
|
269
|
-
|
|
270
|
-
|
|
255
|
+
struct simd32uint8 : simd256bit {
|
|
271
256
|
simd32uint8() {}
|
|
272
257
|
|
|
273
|
-
explicit simd32uint8(__m256i i): simd256bit(i) {}
|
|
258
|
+
explicit simd32uint8(__m256i i) : simd256bit(i) {}
|
|
274
259
|
|
|
275
|
-
explicit simd32uint8(int x): simd256bit(_mm256_set1_epi8(x)) {}
|
|
260
|
+
explicit simd32uint8(int x) : simd256bit(_mm256_set1_epi8(x)) {}
|
|
276
261
|
|
|
277
|
-
explicit simd32uint8(uint8_t x): simd256bit(_mm256_set1_epi8(x)) {}
|
|
262
|
+
explicit simd32uint8(uint8_t x) : simd256bit(_mm256_set1_epi8(x)) {}
|
|
278
263
|
|
|
279
|
-
explicit simd32uint8(simd256bit x): simd256bit(x) {}
|
|
264
|
+
explicit simd32uint8(simd256bit x) : simd256bit(x) {}
|
|
280
265
|
|
|
281
|
-
explicit simd32uint8(const uint8_t
|
|
266
|
+
explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {}
|
|
282
267
|
|
|
283
|
-
std::string elements_to_string(const char
|
|
268
|
+
std::string elements_to_string(const char* fmt) const {
|
|
284
269
|
uint8_t bytes[32];
|
|
285
270
|
storeu((void*)bytes);
|
|
286
271
|
char res[1000];
|
|
287
|
-
char
|
|
288
|
-
for(int i = 0; i < 32; i++) {
|
|
272
|
+
char* ptr = res;
|
|
273
|
+
for (int i = 0; i < 32; i++) {
|
|
289
274
|
ptr += sprintf(ptr, fmt, bytes[i]);
|
|
290
275
|
}
|
|
291
276
|
// strip last ,
|
|
@@ -305,11 +290,11 @@ struct simd32uint8: simd256bit {
|
|
|
305
290
|
i = _mm256_set1_epi8((char)x);
|
|
306
291
|
}
|
|
307
292
|
|
|
308
|
-
simd32uint8 operator
|
|
293
|
+
simd32uint8 operator&(simd256bit other) const {
|
|
309
294
|
return simd32uint8(_mm256_and_si256(i, other.i));
|
|
310
295
|
}
|
|
311
296
|
|
|
312
|
-
simd32uint8 operator
|
|
297
|
+
simd32uint8 operator+(simd32uint8 other) const {
|
|
313
298
|
return simd32uint8(_mm256_add_epi8(i, other.i));
|
|
314
299
|
}
|
|
315
300
|
|
|
@@ -329,18 +314,17 @@ struct simd32uint8: simd256bit {
|
|
|
329
314
|
return simd16uint16(_mm256_cvtepu8_epi16(x));
|
|
330
315
|
}
|
|
331
316
|
|
|
332
|
-
simd32uint8 operator
|
|
317
|
+
simd32uint8 operator+=(simd32uint8 other) {
|
|
333
318
|
i = _mm256_add_epi8(i, other.i);
|
|
334
319
|
return *this;
|
|
335
320
|
}
|
|
336
321
|
|
|
337
322
|
// for debugging only
|
|
338
|
-
uint8_t operator
|
|
323
|
+
uint8_t operator[](int i) const {
|
|
339
324
|
ALIGNED(32) uint8_t tab[32];
|
|
340
325
|
store(tab);
|
|
341
326
|
return tab[i];
|
|
342
327
|
}
|
|
343
|
-
|
|
344
328
|
};
|
|
345
329
|
|
|
346
330
|
// convert with saturation
|
|
@@ -359,26 +343,24 @@ inline simd32uint8 blendv(simd32uint8 a, simd32uint8 b, simd32uint8 mask) {
|
|
|
359
343
|
return simd32uint8(_mm256_blendv_epi8(a.i, b.i, mask.i));
|
|
360
344
|
}
|
|
361
345
|
|
|
362
|
-
|
|
363
|
-
|
|
364
346
|
/// vector of 8 unsigned 32-bit integers
|
|
365
|
-
struct simd8uint32: simd256bit {
|
|
347
|
+
struct simd8uint32 : simd256bit {
|
|
366
348
|
simd8uint32() {}
|
|
367
349
|
|
|
368
|
-
explicit simd8uint32(__m256i i): simd256bit(i) {}
|
|
350
|
+
explicit simd8uint32(__m256i i) : simd256bit(i) {}
|
|
369
351
|
|
|
370
|
-
explicit simd8uint32(uint32_t x): simd256bit(_mm256_set1_epi32(x)) {}
|
|
352
|
+
explicit simd8uint32(uint32_t x) : simd256bit(_mm256_set1_epi32(x)) {}
|
|
371
353
|
|
|
372
|
-
explicit simd8uint32(simd256bit x): simd256bit(x) {}
|
|
354
|
+
explicit simd8uint32(simd256bit x) : simd256bit(x) {}
|
|
373
355
|
|
|
374
|
-
explicit simd8uint32(const uint8_t
|
|
356
|
+
explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {}
|
|
375
357
|
|
|
376
|
-
std::string elements_to_string(const char
|
|
358
|
+
std::string elements_to_string(const char* fmt) const {
|
|
377
359
|
uint32_t bytes[8];
|
|
378
360
|
storeu((void*)bytes);
|
|
379
361
|
char res[1000];
|
|
380
|
-
char
|
|
381
|
-
for(int i = 0; i < 8; i++) {
|
|
362
|
+
char* ptr = res;
|
|
363
|
+
for (int i = 0; i < 8; i++) {
|
|
382
364
|
ptr += sprintf(ptr, fmt, bytes[i]);
|
|
383
365
|
}
|
|
384
366
|
// strip last ,
|
|
@@ -397,31 +379,28 @@ struct simd8uint32: simd256bit {
|
|
|
397
379
|
void set1(uint32_t x) {
|
|
398
380
|
i = _mm256_set1_epi32((int)x);
|
|
399
381
|
}
|
|
400
|
-
|
|
401
382
|
};
|
|
402
383
|
|
|
403
|
-
struct simd8float32: simd256bit {
|
|
404
|
-
|
|
384
|
+
struct simd8float32 : simd256bit {
|
|
405
385
|
simd8float32() {}
|
|
406
386
|
|
|
387
|
+
explicit simd8float32(simd256bit x) : simd256bit(x) {}
|
|
407
388
|
|
|
408
|
-
explicit simd8float32(
|
|
409
|
-
|
|
410
|
-
explicit simd8float32(__m256 x): simd256bit(x) {}
|
|
389
|
+
explicit simd8float32(__m256 x) : simd256bit(x) {}
|
|
411
390
|
|
|
412
|
-
explicit simd8float32(float x): simd256bit(_mm256_set1_ps(x)) {}
|
|
391
|
+
explicit simd8float32(float x) : simd256bit(_mm256_set1_ps(x)) {}
|
|
413
392
|
|
|
414
|
-
explicit simd8float32(const float
|
|
393
|
+
explicit simd8float32(const float* x) : simd256bit(_mm256_load_ps(x)) {}
|
|
415
394
|
|
|
416
|
-
simd8float32 operator
|
|
395
|
+
simd8float32 operator*(simd8float32 other) const {
|
|
417
396
|
return simd8float32(_mm256_mul_ps(f, other.f));
|
|
418
397
|
}
|
|
419
398
|
|
|
420
|
-
simd8float32 operator
|
|
399
|
+
simd8float32 operator+(simd8float32 other) const {
|
|
421
400
|
return simd8float32(_mm256_add_ps(f, other.f));
|
|
422
401
|
}
|
|
423
402
|
|
|
424
|
-
simd8float32 operator
|
|
403
|
+
simd8float32 operator-(simd8float32 other) const {
|
|
425
404
|
return simd8float32(_mm256_sub_ps(f, other.f));
|
|
426
405
|
}
|
|
427
406
|
|
|
@@ -429,15 +408,14 @@ struct simd8float32: simd256bit {
|
|
|
429
408
|
float tab[8];
|
|
430
409
|
storeu((void*)tab);
|
|
431
410
|
char res[1000];
|
|
432
|
-
char
|
|
433
|
-
for(int i = 0; i < 8; i++) {
|
|
411
|
+
char* ptr = res;
|
|
412
|
+
for (int i = 0; i < 8; i++) {
|
|
434
413
|
ptr += sprintf(ptr, "%g,", tab[i]);
|
|
435
414
|
}
|
|
436
415
|
// strip last ,
|
|
437
416
|
ptr[-1] = 0;
|
|
438
417
|
return std::string(res);
|
|
439
418
|
}
|
|
440
|
-
|
|
441
419
|
};
|
|
442
420
|
|
|
443
421
|
inline simd8float32 hadd(simd8float32 a, simd8float32 b) {
|
|
@@ -457,5 +435,30 @@ inline simd8float32 fmadd(simd8float32 a, simd8float32 b, simd8float32 c) {
|
|
|
457
435
|
return simd8float32(_mm256_fmadd_ps(a.f, b.f, c.f));
|
|
458
436
|
}
|
|
459
437
|
|
|
438
|
+
namespace {
|
|
439
|
+
|
|
440
|
+
// get even float32's of a and b, interleaved
|
|
441
|
+
inline simd8float32 geteven(simd8float32 a, simd8float32 b) {
|
|
442
|
+
return simd8float32(
|
|
443
|
+
_mm256_shuffle_ps(a.f, b.f, 0 << 0 | 2 << 2 | 0 << 4 | 2 << 6));
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// get odd float32's of a and b, interleaved
|
|
447
|
+
inline simd8float32 getodd(simd8float32 a, simd8float32 b) {
|
|
448
|
+
return simd8float32(
|
|
449
|
+
_mm256_shuffle_ps(a.f, b.f, 1 << 0 | 3 << 2 | 1 << 4 | 3 << 6));
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// 3 cycles
|
|
453
|
+
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
|
|
454
|
+
inline simd8float32 getlow128(simd8float32 a, simd8float32 b) {
|
|
455
|
+
return simd8float32(_mm256_permute2f128_ps(a.f, b.f, 0 | 2 << 4));
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
inline simd8float32 gethigh128(simd8float32 a, simd8float32 b) {
|
|
459
|
+
return simd8float32(_mm256_permute2f128_ps(a.f, b.f, 1 | 3 << 4));
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
} // namespace
|
|
460
463
|
|
|
461
464
|
} // namespace faiss
|