redisbench-admin 0.11.54-py3-none-any.whl → 0.11.56-py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- redisbench_admin/environments/oss_cluster.py +9 -1
- redisbench_admin/run/aibench_run_inference_redisai_vision/aibench_run_inference_redisai_vision.py +4 -16
- redisbench_admin/run/asm.py +426 -0
- redisbench_admin/run/common.py +3 -0
- redisbench_admin/run/ftsb/ftsb.py +4 -16
- redisbench_admin/run/tsbs_run_queries_redistimeseries/tsbs_run_queries_redistimeseries.py +4 -16
- redisbench_admin/run_remote/standalone.py +2 -3
- redisbench_admin/utils/benchmark_config.py +11 -13
- redisbench_admin/utils/utils.py +0 -21
- {redisbench_admin-0.11.54.dist-info → redisbench_admin-0.11.56.dist-info}/METADATA +7 -4
- redisbench_admin-0.11.56.dist-info/RECORD +117 -0
- {redisbench_admin-0.11.54.dist-info → redisbench_admin-0.11.56.dist-info}/WHEEL +1 -1
- redisbench_admin/run/ann/pkg/.dockerignore +0 -2
- redisbench_admin/run/ann/pkg/.git +0 -1
- redisbench_admin/run/ann/pkg/.github/workflows/benchmarks.yml +0 -100
- redisbench_admin/run/ann/pkg/.gitignore +0 -21
- redisbench_admin/run/ann/pkg/LICENSE +0 -21
- redisbench_admin/run/ann/pkg/README.md +0 -157
- redisbench_admin/run/ann/pkg/algos.yaml +0 -1294
- redisbench_admin/run/ann/pkg/algosP.yaml +0 -67
- redisbench_admin/run/ann/pkg/ann_benchmarks/__init__.py +0 -2
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/__init__.py +0 -0
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/annoy.py +0 -26
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/balltree.py +0 -22
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/base.py +0 -36
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/bruteforce.py +0 -110
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/ckdtree.py +0 -17
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/datasketch.py +0 -29
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/definitions.py +0 -187
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/diskann.py +0 -190
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/dolphinnpy.py +0 -31
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/dummy_algo.py +0 -25
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/elasticsearch.py +0 -107
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/elastiknn.py +0 -124
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/faiss.py +0 -124
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/faiss_gpu.py +0 -61
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/faiss_hnsw.py +0 -39
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/flann.py +0 -27
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/hnswlib.py +0 -36
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/kdtree.py +0 -22
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/kgraph.py +0 -39
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/lshf.py +0 -25
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/milvus.py +0 -99
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/mrpt.py +0 -41
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/n2.py +0 -28
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/nearpy.py +0 -48
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/nmslib.py +0 -74
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/onng_ngt.py +0 -100
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/opensearchknn.py +0 -107
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/panng_ngt.py +0 -79
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/pinecone.py +0 -39
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/puffinn.py +0 -45
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/pynndescent.py +0 -115
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/qg_ngt.py +0 -102
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/redisearch.py +0 -90
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/rpforest.py +0 -20
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/scann.py +0 -34
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/sptag.py +0 -28
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/subprocess.py +0 -246
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/vald.py +0 -149
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/vecsim-hnsw.py +0 -43
- redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/vespa.py +0 -47
- redisbench_admin/run/ann/pkg/ann_benchmarks/constants.py +0 -1
- redisbench_admin/run/ann/pkg/ann_benchmarks/data.py +0 -48
- redisbench_admin/run/ann/pkg/ann_benchmarks/datasets.py +0 -620
- redisbench_admin/run/ann/pkg/ann_benchmarks/distance.py +0 -53
- redisbench_admin/run/ann/pkg/ann_benchmarks/main.py +0 -325
- redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/__init__.py +0 -2
- redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/metrics.py +0 -183
- redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/plot_variants.py +0 -17
- redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/utils.py +0 -165
- redisbench_admin/run/ann/pkg/ann_benchmarks/results.py +0 -71
- redisbench_admin/run/ann/pkg/ann_benchmarks/runner.py +0 -333
- redisbench_admin/run/ann/pkg/create_dataset.py +0 -12
- redisbench_admin/run/ann/pkg/create_hybrid_dataset.py +0 -147
- redisbench_admin/run/ann/pkg/create_text_to_image_ds.py +0 -117
- redisbench_admin/run/ann/pkg/create_website.py +0 -272
- redisbench_admin/run/ann/pkg/install/Dockerfile +0 -11
- redisbench_admin/run/ann/pkg/install/Dockerfile.annoy +0 -5
- redisbench_admin/run/ann/pkg/install/Dockerfile.datasketch +0 -4
- redisbench_admin/run/ann/pkg/install/Dockerfile.diskann +0 -29
- redisbench_admin/run/ann/pkg/install/Dockerfile.diskann_pq +0 -31
- redisbench_admin/run/ann/pkg/install/Dockerfile.dolphinn +0 -5
- redisbench_admin/run/ann/pkg/install/Dockerfile.elasticsearch +0 -45
- redisbench_admin/run/ann/pkg/install/Dockerfile.elastiknn +0 -61
- redisbench_admin/run/ann/pkg/install/Dockerfile.faiss +0 -18
- redisbench_admin/run/ann/pkg/install/Dockerfile.flann +0 -10
- redisbench_admin/run/ann/pkg/install/Dockerfile.hnswlib +0 -10
- redisbench_admin/run/ann/pkg/install/Dockerfile.kgraph +0 -6
- redisbench_admin/run/ann/pkg/install/Dockerfile.mih +0 -4
- redisbench_admin/run/ann/pkg/install/Dockerfile.milvus +0 -27
- redisbench_admin/run/ann/pkg/install/Dockerfile.mrpt +0 -4
- redisbench_admin/run/ann/pkg/install/Dockerfile.n2 +0 -5
- redisbench_admin/run/ann/pkg/install/Dockerfile.nearpy +0 -5
- redisbench_admin/run/ann/pkg/install/Dockerfile.ngt +0 -13
- redisbench_admin/run/ann/pkg/install/Dockerfile.nmslib +0 -10
- redisbench_admin/run/ann/pkg/install/Dockerfile.opensearchknn +0 -43
- redisbench_admin/run/ann/pkg/install/Dockerfile.puffinn +0 -6
- redisbench_admin/run/ann/pkg/install/Dockerfile.pynndescent +0 -4
- redisbench_admin/run/ann/pkg/install/Dockerfile.redisearch +0 -18
- redisbench_admin/run/ann/pkg/install/Dockerfile.rpforest +0 -5
- redisbench_admin/run/ann/pkg/install/Dockerfile.scann +0 -5
- redisbench_admin/run/ann/pkg/install/Dockerfile.scipy +0 -4
- redisbench_admin/run/ann/pkg/install/Dockerfile.sklearn +0 -4
- redisbench_admin/run/ann/pkg/install/Dockerfile.sptag +0 -30
- redisbench_admin/run/ann/pkg/install/Dockerfile.vald +0 -8
- redisbench_admin/run/ann/pkg/install/Dockerfile.vespa +0 -17
- redisbench_admin/run/ann/pkg/install.py +0 -70
- redisbench_admin/run/ann/pkg/logging.conf +0 -34
- redisbench_admin/run/ann/pkg/multirun.py +0 -298
- redisbench_admin/run/ann/pkg/plot.py +0 -159
- redisbench_admin/run/ann/pkg/protocol/bf-runner +0 -10
- redisbench_admin/run/ann/pkg/protocol/bf-runner.py +0 -204
- redisbench_admin/run/ann/pkg/protocol/ext-add-query-metric.md +0 -51
- redisbench_admin/run/ann/pkg/protocol/ext-batch-queries.md +0 -77
- redisbench_admin/run/ann/pkg/protocol/ext-prepared-queries.md +0 -77
- redisbench_admin/run/ann/pkg/protocol/ext-query-parameters.md +0 -47
- redisbench_admin/run/ann/pkg/protocol/specification.md +0 -194
- redisbench_admin/run/ann/pkg/requirements.txt +0 -14
- redisbench_admin/run/ann/pkg/requirements_py38.txt +0 -11
- redisbench_admin/run/ann/pkg/results/fashion-mnist-784-euclidean.png +0 -0
- redisbench_admin/run/ann/pkg/results/gist-960-euclidean.png +0 -0
- redisbench_admin/run/ann/pkg/results/glove-100-angular.png +0 -0
- redisbench_admin/run/ann/pkg/results/glove-25-angular.png +0 -0
- redisbench_admin/run/ann/pkg/results/lastfm-64-dot.png +0 -0
- redisbench_admin/run/ann/pkg/results/mnist-784-euclidean.png +0 -0
- redisbench_admin/run/ann/pkg/results/nytimes-256-angular.png +0 -0
- redisbench_admin/run/ann/pkg/results/sift-128-euclidean.png +0 -0
- redisbench_admin/run/ann/pkg/run.py +0 -12
- redisbench_admin/run/ann/pkg/run_algorithm.py +0 -3
- redisbench_admin/run/ann/pkg/templates/chartjs.template +0 -102
- redisbench_admin/run/ann/pkg/templates/detail_page.html +0 -23
- redisbench_admin/run/ann/pkg/templates/general.html +0 -58
- redisbench_admin/run/ann/pkg/templates/latex.template +0 -30
- redisbench_admin/run/ann/pkg/templates/summary.html +0 -60
- redisbench_admin/run/ann/pkg/test/__init__.py +0 -0
- redisbench_admin/run/ann/pkg/test/test-jaccard.py +0 -19
- redisbench_admin/run/ann/pkg/test/test-metrics.py +0 -99
- redisbench_admin-0.11.54.dist-info/RECORD +0 -242
- {redisbench_admin-0.11.54.dist-info → redisbench_admin-0.11.56.dist-info}/entry_points.txt +0 -0
- {redisbench_admin-0.11.54.dist-info → redisbench_admin-0.11.56.dist-info/licenses}/LICENSE +0 -0
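The file-level summary above can be reproduced by comparing the archive listings of the two wheels. The snippet below is an illustrative sketch, not part of the package: it assumes both wheel files have already been fetched (for example with pip download redisbench-admin==0.11.54 --no-deps, and likewise for 0.11.56) and that the local filenames match the hypothetical ones used here.

import zipfile

# Hypothetical local filenames; adjust to wherever the wheels were downloaded.
OLD_WHEEL = "redisbench_admin-0.11.54-py3-none-any.whl"
NEW_WHEEL = "redisbench_admin-0.11.56-py3-none-any.whl"

with zipfile.ZipFile(OLD_WHEEL) as old, zipfile.ZipFile(NEW_WHEEL) as new:
    old_names = set(old.namelist())
    new_names = set(new.namelist())

# Files dropped in 0.11.56, e.g. the vendored ann-benchmarks tree under run/ann/pkg/
for name in sorted(old_names - new_names):
    print("removed:", name)

# Files introduced in 0.11.56, e.g. redisbench_admin/run/asm.py
for name in sorted(new_names - old_names):
    print("added:  ", name)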
redisbench_admin/run/ann/pkg/ann_benchmarks/datasets.py +0 -620

@@ -1,620 +0,0 @@
-from copyreg import pickle
-import h5py
-import numpy
-import os
-import random
-
-from urllib.request import urlopen
-from urllib.request import urlretrieve
-
-from ann_benchmarks.distance import dataset_transform
-import urllib.parse
-
-
-def download(src, dst):
-    if not os.path.exists(dst):
-        # TODO: should be atomic
-        print('downloading %s -> %s...' % (src, dst))
-        urlretrieve(src, dst)
-
-
-def get_dataset_fn(dataset):
-    if not os.path.exists('data'):
-        try:
-            os.mkdir('data')
-        except FileExistsError:
-            pass # fixes race condition
-    return os.path.join('data', '%s.hdf5' % dataset)
-
-
-def get_dataset(which):
-    hdf5_fn = get_dataset_fn(which)
-    try:
-        if 'dbpedia' in which:
-            url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/dbpedia/dbpedia-768.hdf5'
-        elif 'amazon-reviews' in which:
-            url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/amazon-reviews-384.hdf5'
-        elif 'hybrid' in which:
-            url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which)
-        elif 'Text-to-Image' in which:
-            url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/big_ann/%s.hdf5' % urllib.parse.quote(which)
-        else:
-            url = 'http://ann-benchmarks.com/%s.hdf5' % which
-        download(url, hdf5_fn)
-    except:
-        print("Cannot download %s" % url)
-        if which in DATASETS:
-            print("Creating dataset locally")
-            DATASETS[which](hdf5_fn)
-    hdf5_f = h5py.File(hdf5_fn, 'r')
-
-    # here for backward compatibility, to ensure old datasets can still be used with newer versions
-    # cast to integer because the json parser (later on) cannot interpret numpy integers
-    dimension = int(hdf5_f.attrs['dimension']) if 'dimension' in hdf5_f.attrs else len(hdf5_f['train'][0])
-
-    return hdf5_f, dimension
-
-# Everything below this line is related to creating datasets
-# You probably never need to do this at home,
-# just rely on the prepared datasets at http://ann-benchmarks.com
-
-
-def write_output(train, test, fn, distance, point_type='float', count=100):
-    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
-    n = 0
-    f = h5py.File(fn, 'w')
-    f.attrs['type'] = 'dense'
-    f.attrs['distance'] = distance
-    f.attrs['dimension'] = len(train[0])
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size: %9d * %4d' % test.shape)
-    f.create_dataset('train', (len(train), len(
-        train[0])), dtype=train.dtype)[:] = train
-    f.create_dataset('test', (len(test), len(
-        test[0])), dtype=test.dtype)[:] = test
-    neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
-    distances = f.create_dataset('distances', (len(test), count), dtype='f')
-    bf = BruteForceBLAS(distance, precision=train.dtype)
-
-    bf.fit(train)
-    for i, x in enumerate(test):
-        if i % 1000 == 0:
-            print('%d/%d...' % (i, len(test)))
-        res = list(bf.query_with_distances(x, count))
-        res.sort(key=lambda t: t[-1])
-        neighbors[i] = [j for j, _ in res]
-        distances[i] = [d for _, d in res]
-    f.close()
-
-"""
-param: train and test are arrays of arrays of indices.
-"""
-def write_sparse_output(train, test, fn, distance, dimension, count=100):
-    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
-    f = h5py.File(fn, 'w')
-    f.attrs['type'] = 'sparse'
-    f.attrs['distance'] = distance
-    f.attrs['dimension'] = dimension
-    f.attrs['point_type'] = 'bit'
-    print('train size: %9d * %4d' % (train.shape[0], dimension))
-    print('test size: %9d * %4d' % (test.shape[0], dimension))
-
-    # We ensure the sets are sorted
-    train = numpy.array(list(map(sorted, train)))
-    test = numpy.array(list(map(sorted, test)))
-
-    flat_train = numpy.hstack(train.flatten())
-    flat_test = numpy.hstack(test.flatten())
-
-    f.create_dataset('train', (len(flat_train),), dtype=flat_train.dtype)[:] = flat_train
-    f.create_dataset('test', (len(flat_test),), dtype=flat_test.dtype)[:] = flat_test
-    neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
-    distances = f.create_dataset('distances', (len(test), count), dtype='f')
-
-    f.create_dataset('size_test', (len(test),), dtype='i')[:] = list(map(len, test))
-    f.create_dataset('size_train', (len(train),), dtype='i')[:] = list(map(len, train))
-
-    bf = BruteForceBLAS(distance, precision=train.dtype)
-    bf.fit(train)
-    for i, x in enumerate(test):
-        if i % 1000 == 0:
-            print('%d/%d...' % (i, len(test)))
-        res = list(bf.query_with_distances(x, count))
-        res.sort(key=lambda t: t[-1])
-        neighbors[i] = [j for j, _ in res]
-        distances[i] = [d for _, d in res]
-    f.close()
-
-def train_test_split(X, test_size=10000, dimension=None):
-    import sklearn.model_selection
-    if dimension == None:
-        dimension = X.shape[1]
-    print('Splitting %d*%d into train/test' % (X.shape[0], dimension))
-    return sklearn.model_selection.train_test_split(
-        X, test_size=test_size, random_state=1)
-
-
-def glove(out_fn, d):
-    import zipfile
-
-    url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
-    fn = os.path.join('data', 'glove.twitter.27B.zip')
-    download(url, fn)
-    with zipfile.ZipFile(fn) as z:
-        print('preparing %s' % out_fn)
-        z_fn = 'glove.twitter.27B.%dd.txt' % d
-        X = []
-        for line in z.open(z_fn):
-            v = [float(x) for x in line.strip().split()[1:]]
-            X.append(numpy.array(v))
-        X_train, X_test = train_test_split(X)
-        write_output(numpy.array(X_train), numpy.array(
-            X_test), out_fn, 'angular')
-
-
-def _load_texmex_vectors(f, n, k):
-    import struct
-
-    v = numpy.zeros((n, k))
-    for i in range(n):
-        f.read(4) # ignore vec length
-        v[i] = struct.unpack('f' * k, f.read(k * 4))
-
-    return v
-
-
-def _get_irisa_matrix(t, fn):
-    import struct
-    m = t.getmember(fn)
-    f = t.extractfile(m)
-    k, = struct.unpack('i', f.read(4))
-    n = m.size // (4 + 4 * k)
-    f.seek(0)
-    return _load_texmex_vectors(f, n, k)
-
-
-def sift(out_fn):
-    import tarfile
-
-    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
-    fn = os.path.join('data', 'sift.tar.tz')
-    download(url, fn)
-    with tarfile.open(fn, 'r:gz') as t:
-        train = _get_irisa_matrix(t, 'sift/sift_base.fvecs')
-        test = _get_irisa_matrix(t, 'sift/sift_query.fvecs')
-        write_output(train, test, out_fn, 'euclidean')
-
-
-def gist(out_fn):
-    import tarfile
-
-    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz'
-    fn = os.path.join('data', 'gist.tar.tz')
-    download(url, fn)
-    with tarfile.open(fn, 'r:gz') as t:
-        train = _get_irisa_matrix(t, 'gist/gist_base.fvecs')
-        test = _get_irisa_matrix(t, 'gist/gist_query.fvecs')
-        write_output(train, test, out_fn, 'euclidean')
-
-
-def _load_mnist_vectors(fn):
-    import gzip
-    import struct
-
-    print('parsing vectors in %s...' % fn)
-    f = gzip.open(fn)
-    type_code_info = {
-        0x08: (1, "!B"),
-        0x09: (1, "!b"),
-        0x0B: (2, "!H"),
-        0x0C: (4, "!I"),
-        0x0D: (4, "!f"),
-        0x0E: (8, "!d")
-    }
-    magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
-    assert magic == 0
-    assert type_code in type_code_info
-
-    dimensions = [struct.unpack("!I", f.read(4))[0]
-                  for i in range(dim_count)]
-
-    entry_count = dimensions[0]
-    entry_size = numpy.product(dimensions[1:])
-
-    b, format_string = type_code_info[type_code]
-    vectors = []
-    for i in range(entry_count):
-        vectors.append([struct.unpack(format_string, f.read(b))[0]
-                        for j in range(entry_size)])
-    return numpy.array(vectors)
-
-
-def mnist(out_fn):
-    download(
-        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') # noqa
-    download(
-        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') # noqa
-    train = _load_mnist_vectors('mnist-train.gz')
-    test = _load_mnist_vectors('mnist-test.gz')
-    write_output(train, test, out_fn, 'euclidean')
-
-
-def fashion_mnist(out_fn):
-    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', # noqa
-             'fashion-mnist-train.gz')
-    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', # noqa
-             'fashion-mnist-test.gz')
-    train = _load_mnist_vectors('fashion-mnist-train.gz')
-    test = _load_mnist_vectors('fashion-mnist-test.gz')
-    write_output(train, test, out_fn, 'euclidean')
-
-# Creates a 'deep image descriptor' dataset using the 'deep10M.fvecs' sample
-# from http://sites.skoltech.ru/compvision/noimi/. The download logic is adapted
-# from the script https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py.
-def deep_image(out_fn):
-    yadisk_key = 'https://yadi.sk/d/11eDCm7Dsn9GA'
-    response = urlopen('https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=' \
-        + yadisk_key + '&path=/deep10M.fvecs')
-    response_body = response.read().decode("utf-8")
-
-    dataset_url = response_body.split(',')[0][9:-1]
-    filename = os.path.join('data', 'deep-image.fvecs')
-    download(dataset_url, filename)
-
-    # In the fvecs file format, each vector is stored by first writing its
-    # length as an integer, then writing its components as floats.
-    fv = numpy.fromfile(filename, dtype=numpy.float32)
-    dim = fv.view(numpy.int32)[0]
-    fv = fv.reshape(-1, dim + 1)[:, 1:]
-
-    X_train, X_test = train_test_split(fv)
-    write_output(X_train, X_test, out_fn, 'angular')
-
-def transform_bag_of_words(filename, n_dimensions, out_fn):
-    import gzip
-    from scipy.sparse import lil_matrix
-    from sklearn.feature_extraction.text import TfidfTransformer
-    from sklearn import random_projection
-    with gzip.open(filename, 'rb') as f:
-        file_content = f.readlines()
-        entries = int(file_content[0])
-        words = int(file_content[1])
-        file_content = file_content[3:] # strip first three entries
-        print("building matrix...")
-        A = lil_matrix((entries, words))
-        for e in file_content:
-            doc, word, cnt = [int(v) for v in e.strip().split()]
-            A[doc - 1, word - 1] = cnt
-        print("normalizing matrix entries with tfidf...")
-        B = TfidfTransformer().fit_transform(A)
-        print("reducing dimensionality...")
-        C = random_projection.GaussianRandomProjection(
-            n_components=n_dimensions).fit_transform(B)
-        X_train, X_test = train_test_split(C)
-        write_output(numpy.array(X_train), numpy.array(
-            X_test), out_fn, 'angular')
-
-
-def nytimes(out_fn, n_dimensions):
-    fn = 'nytimes_%s.txt.gz' % n_dimensions
-    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) # noqa
-    transform_bag_of_words(fn, n_dimensions, out_fn)
-
-
-def random_float(out_fn, n_dims, n_samples, centers, distance):
-    import sklearn.datasets
-
-    X, _ = sklearn.datasets.make_blobs(
-        n_samples=n_samples, n_features=n_dims,
-        centers=centers, random_state=1)
-    X_train, X_test = train_test_split(X, test_size=0.1)
-    write_output(X_train, X_test, out_fn, distance)
-
-
-def random_bitstring(out_fn, n_dims, n_samples, n_queries):
-    import sklearn.datasets
-
-    Y, _ = sklearn.datasets.make_blobs(
-        n_samples=n_samples, n_features=n_dims,
-        centers=n_queries, random_state=1)
-    X = numpy.zeros((n_samples, n_dims), dtype=numpy.bool)
-    for i, vec in enumerate(Y):
-        X[i] = numpy.array([v > 0 for v in vec], dtype=numpy.bool)
-
-    X_train, X_test = train_test_split(X, test_size=n_queries)
-    write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-
-def word2bits(out_fn, path, fn):
-    import tarfile
-    local_fn = fn + '.tar.gz'
-    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % ( # noqa
-        path, fn)
-    download(url, local_fn)
-    print('parsing vectors in %s...' % local_fn)
-    with tarfile.open(local_fn, 'r:gz') as t:
-        f = t.extractfile(fn)
-        n_words, k = [int(z) for z in next(f).strip().split()]
-        X = numpy.zeros((n_words, k), dtype=numpy.bool)
-        for i in range(n_words):
-            X[i] = numpy.array([float(z) > 0 for z in next(
-                f).strip().split()[1:]], dtype=numpy.bool)
-
-    X_train, X_test = train_test_split(X, test_size=1000)
-    write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-
-def sift_hamming(out_fn, fn):
-    import tarfile
-    local_fn = fn + '.tar.gz'
-    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
-    download(url, local_fn)
-    print('parsing vectors in %s...' % local_fn)
-    with tarfile.open(local_fn, 'r:gz') as t:
-        f = t.extractfile(fn)
-        lines = f.readlines()
-        X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
-        for i, line in enumerate(lines):
-            X[i] = numpy.array(
-                [int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
-        X_train, X_test = train_test_split(X, test_size=1000)
-        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-def kosarak(out_fn):
-    import gzip
-    local_fn = 'kosarak.dat.gz'
-    # only consider sets with at least min_elements many elements
-    min_elements = 20
-    url = 'http://fimi.uantwerpen.be/data/%s' % local_fn
-    download(url, local_fn)
-
-    X = []
-    dimension = 0
-    with gzip.open('kosarak.dat.gz', 'r') as f:
-        content = f.readlines()
-        # preprocess data to find sets with more than 20 elements
-        # keep track of used ids for reenumeration
-        for line in content:
-            if len(line.split()) >= min_elements:
-                X.append(list(map(int, line.split())))
-                dimension = max(dimension, max(X[-1]) + 1)
-
-    X_train, X_test = train_test_split(numpy.array(X), test_size=500, dimension=dimension)
-    write_sparse_output(X_train, X_test, out_fn, 'jaccard', dimension)
-
-def random_jaccard(out_fn, n=10000, size=50, universe=80):
-    random.seed(1)
-    l = list(range(universe))
-    X = []
-    for i in range(n):
-        X.append(random.sample(l, size))
-
-    X_train, X_test = train_test_split(numpy.array(X), test_size=100, dimension=universe)
-    write_sparse_output(X_train, X_test, out_fn, 'jaccard', universe)
-
-
-
-def lastfm(out_fn, n_dimensions, test_size=50000):
-    # This tests out ANN methods for retrieval on simple matrix factorization
-    # based recommendation algorithms. The idea being that the query/test
-    # vectors are user factors and the train set are item factors from
-    # the matrix factorization model.
-
-    # Since the predictor is a dot product, we transform the factors first
-    # as described in this
-    # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf # noqa
-    # This hopefully replicates the experiments done in this post:
-    # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ # noqa
-
-    # The dataset is from "Last.fm Dataset - 360K users":
-    # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html # noqa
-
-    # This requires the implicit package to generate the factors
-    # (on my desktop/gpu this only takes 4-5 seconds to train - but
-    # could take 1-2 minutes on a laptop)
-    from implicit.datasets.lastfm import get_lastfm
-    from implicit.approximate_als import augment_inner_product_matrix
-    import implicit
-
-    # train an als model on the lastfm data
-    _, _, play_counts = get_lastfm()
-    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
-    model.fit(implicit.nearest_neighbours.bm25_weight(
-        play_counts, K1=100, B=0.8))
-
-    # transform item factors so that each one has the same norm,
-    # and transform the user factors such by appending a 0 column
-    _, item_factors = augment_inner_product_matrix(model.item_factors)
-    user_factors = numpy.append(model.user_factors,
-                                numpy.zeros((model.user_factors.shape[0], 1)),
-                                axis=1)
-
-    # only query the first 50k users (speeds things up signficantly
-    # without changing results)
-    user_factors = user_factors[:test_size]
-
-    # after that transformation a cosine lookup will return the same results
-    # as the inner product on the untransformed data
-    write_output(item_factors, user_factors, out_fn, 'angular')
-
-def parse_dbpedia_data(source_file, max_docs: int):
-    import re
-    """
-    Parses the input file of abstracts and returns an iterable
-    :param max_docs: maximum number of input documents to process; -1 for no limit
-    :param source_file: input file
-    :return: yields document by document to the consumer
-    """
-    global VERBOSE
-    count = 0
-    max_tokens = 0
-
-    if -1 < max_docs < 50:
-        VERBOSE = True
-
-    percent = 0.1
-    bulk_size = (percent / 100) * max_docs
-
-    print(f"bulk_size={bulk_size}")
-
-    if bulk_size <= 0:
-        bulk_size = 1000
-
-    for line in source_file:
-        line = line.decode("utf-8")
-
-        # skip commented out lines
-        comment_regex = '^#'
-        if re.search(comment_regex, line):
-            continue
-
-        token_size = len(line.split())
-        if token_size > max_tokens:
-            max_tokens = token_size
-
-        # skip lines with 20 tokens or less, because they tend to contain noise
-        # (this may vary in your dataset)
-        if token_size <= 20:
-            continue
-
-        first_url_regex = '^<([^\>]+)>\s*'
-
-        x = re.search(first_url_regex, line)
-        if x:
-            url = x.group(1)
-            # also remove the url from the string
-            line = re.sub(first_url_regex, '', line)
-        else:
-            url = ''
-
-        # remove the second url from the string: we don't need to capture it, because it is repetitive across
-        # all abstracts
-        second_url_regex = '^<[^\>]+>\s*'
-        line = re.sub(second_url_regex, '', line)
-
-        # remove some strange line ending, that occurs in many abstracts
-        language_at_ending_regex = '@en \.\n$'
-        line = re.sub(language_at_ending_regex, '', line)
-
-        # form the input object for this abstract
-        doc = {
-            "_text_": line,
-            "url": url,
-            "id": count+1
-        }
-
-        yield doc
-        count += 1
-
-        if count % bulk_size == 0:
-            print(f"Processed {count} documents", end="\r")
-
-        if count == max_docs:
-            break
-
-    source_file.close()
-    print("Maximum tokens observed per abstract: {}".format(max_tokens))
-
-def dbpedia(out_fn):
-    import bz2
-    from sentence_transformers import SentenceTransformer
-    import torch
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    print(device)
-    local_fn = "long_abstracts_en.ttl.bz2"
-    url = "http://downloads.dbpedia.org/2016-10/core-i18n/en/long_abstracts_en.ttl.bz2"
-    download(url, local_fn)
-    source_file = bz2.BZ2File(local_fn, "r")
-    docs_iter = parse_dbpedia_data(source_file=source_file, max_docs=1000000)
-    text = []
-    for doc in docs_iter:
-        text.append(doc['_text_'])
-    model = SentenceTransformer('bert-base-nli-mean-tokens')
-    model.to(device)
-    sentence_embeddings = model.encode(text, show_progress_bar=True)
-    write_output(sentence_embeddings, sentence_embeddings[:10000], out_fn, 'angular')
-
-
-def amazon_reviews(out_fn):
-    import os
-    import math
-    import pickle
-    import numpy as np
-    subsets = ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01', 'Books_v1_01', 'Books_v1_02']
-    train_set = None
-    test_set = None
-    for i, subset in enumerate(subsets):
-        url = f'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/{subset}_embeddings'
-        local_fn = f'{subset}_embeddings'
-        download(url, local_fn)
-        subset_embeddings = pickle.load(open(local_fn, "rb"))
-        if i==0:
-            train_set = subset_embeddings
-            test_set = subset_embeddings[:math.ceil(10000/len(subsets))]
-        else:
-            train_set = np.append(train_set, subset_embeddings, axis =0)
-            test_set = np.append(test_set, subset_embeddings[:math.ceil(10000/len(subsets))], axis=0)
-        print(subset_embeddings.shape)
-        print(train_set.shape)
-        print(test_set.shape)
-        os.remove(local_fn)
-    write_output(train_set, test_set[:10000], out_fn, 'angular')
-
-
-DATASETS = {
-    'deep-image-96-angular': deep_image,
-    'fashion-mnist-784-euclidean': fashion_mnist,
-    'gist-960-euclidean': gist,
-    'glove-25-angular': lambda out_fn: glove(out_fn, 25),
-    'glove-50-angular': lambda out_fn: glove(out_fn, 50),
-    'glove-100-angular': lambda out_fn: glove(out_fn, 100),
-    'glove-200-angular': lambda out_fn: glove(out_fn, 200),
-    'mnist-784-euclidean': mnist,
-    'random-xs-20-euclidean': lambda out_fn: random_float(out_fn, 20, 10000, 100,
-                                                          'euclidean'),
-    'random-s-100-euclidean': lambda out_fn: random_float(out_fn, 100, 100000, 1000,
-                                                          'euclidean'),
-    'random-xs-20-angular': lambda out_fn: random_float(out_fn, 20, 10000, 100,
-                                                        'angular'),
-    'random-s-100-angular': lambda out_fn: random_float(out_fn, 100, 100000, 1000,
-                                                        'angular'),
-    'random-xs-16-hamming': lambda out_fn: random_bitstring(out_fn, 16, 10000,
-                                                            100),
-    'random-s-128-hamming': lambda out_fn: random_bitstring(out_fn, 128,
-                                                            50000, 1000),
-    'random-l-256-hamming': lambda out_fn: random_bitstring(out_fn, 256,
-                                                            100000, 1000),
-    'random-s-jaccard': lambda out_fn: random_jaccard(out_fn, n=10000,
-                                                      size=20, universe=40),
-    'random-l-jaccard': lambda out_fn: random_jaccard(out_fn, n=100000,
-                                                      size=70, universe=100),
-    'sift-128-euclidean': sift,
-    'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
-    'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
-    'word2bits-800-hamming': lambda out_fn: word2bits(
-        out_fn, '400K',
-        'w2b_bitlevel1_size800_vocab400K'),
-    'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
-    'sift-256-hamming': lambda out_fn: sift_hamming(
-        out_fn, 'sift.hamming.256'),
-    'kosarak-jaccard': lambda out_fn: kosarak(out_fn),
-    'dbpedia-768' : lambda out_fn: dbpedia(out_fn),
-    'amazon-reviews-384': lambda out_fn: amazon_reviews(out_fn),
-}
-
-
-
-
-big_ann_datasets = [f'Text-to-Image-{x}' for x in ['10M', '20M', '30M', '40M', '50M', '60M', '70M', '80M', '90M', '100M']]
-for dataset in big_ann_datasets:
-    DATASETS[dataset] = lambda fn: ()
-
-
-hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular', 'fashion-mnist-784-euclidean']
-hybrid_datasets.extend(big_ann_datasets)
-percentiles= ['0.5', '1', '2', '5', '10', '20', '50']
-for dataset in hybrid_datasets:
-    for percentile in percentiles:
-        DATASETS[f'{dataset}-hybrid-{percentile}'] = lambda fn: ()
-
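The hunk above removes the dataset-handling half of the vendored ann-benchmarks code. Its entry point, get_dataset(), downloads (or, via the DATASETS registry, builds) an HDF5 file whose 'train', 'test', 'neighbors' and 'distances' datasets and 'distance'/'dimension'/'point_type' attributes are produced by write_output(). The snippet below is a minimal consumption sketch, not part of the package: it assumes redisbench-admin 0.11.54 (where the module still exists) and that the vendored redisbench_admin/run/ann/pkg directory is on sys.path so the package imports as ann_benchmarks.

# Illustrative sketch only; relies on the vendored ann_benchmarks package
# that 0.11.56 removes, so it only applies to 0.11.54 and earlier.
from ann_benchmarks.datasets import get_dataset

# Downloads data/glove-25-angular.hdf5 on first use (falling back to building
# it locally through DATASETS if the download fails), then opens it read-only.
hdf5_f, dimension = get_dataset('glove-25-angular')

train = hdf5_f['train']          # base vectors, one row per point
test = hdf5_f['test']            # query vectors
neighbors = hdf5_f['neighbors']  # ground-truth neighbor ids, 100 per query by default
distances = hdf5_f['distances']  # matching ground-truth distances

print(hdf5_f.attrs['distance'], dimension, train.shape, neighbors.shape)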
redisbench_admin/run/ann/pkg/ann_benchmarks/distance.py +0 -53

@@ -1,53 +0,0 @@
-from __future__ import absolute_import
-from scipy.spatial.distance import pdist as scipy_pdist
-import itertools
-import numpy as np
-
-def pdist(a, b, metric):
-    return scipy_pdist([a, b], metric=metric)[0]
-
-# Need own implementation of jaccard because scipy's
-# implementation is different
-
-def jaccard(a, b):
-    if len(a) == 0 or len(b) == 0:
-        return 0
-    intersect = len(set(a) & set(b))
-    return intersect / (float)(len(a) + len(b) - intersect)
-
-metrics = {
-    'hamming': {
-        'distance': lambda a, b: pdist(a, b, "hamming"),
-        'distance_valid': lambda a: True
-    },
-    # return 1 - jaccard similarity, because smaller distances are better.
-    'jaccard': {
-        'distance': lambda a, b: 1 - jaccard(a, b),
-        'distance_valid': lambda a: a < 1 - 1e-5
-    },
-    'euclidean': {
-        'distance': lambda a, b: pdist(a, b, "euclidean"),
-        'distance_valid': lambda a: True
-    },
-    'angular': {
-        'distance': lambda a, b: pdist(a, b, "cosine"),
-        'distance_valid': lambda a: True
-    }
-}
-
-def sparse_to_lists(data, lengths):
-    X = []
-    index = 0
-    for l in lengths:
-        X.append(data[index:index+l])
-        index += l
-
-    return X
-
-def dataset_transform(dataset):
-    if dataset.attrs.get('type', 'dense') != 'sparse':
-        return np.array(dataset['train']), np.array(dataset['test'])
-
-    # we store the dataset as a list of integers, accompanied by a list of lengths in hdf5
-    # so we transform it back to the format expected by the algorithms here (array of array of ints)
-    return sparse_to_lists(dataset['train'], dataset['size_train']), sparse_to_lists(dataset['test'], dataset['size_test'])
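The second hunk removes the companion distance module: metrics maps each distance name stored in a dataset's 'distance' attribute to a distance callable plus a validity predicate, and dataset_transform() turns the flattened sparse layout written by write_sparse_output() back into per-point index lists. The short usage sketch below is illustrative only and rests on the same assumption that the vendored ann_benchmarks package from 0.11.54 is importable.

# Illustrative sketch only; uses the ann_benchmarks.distance module removed in 0.11.56.
from ann_benchmarks.distance import metrics

# Dense metrics delegate to scipy's pdist; 'angular' is cosine distance.
print(metrics['euclidean']['distance']([0.0, 0.0], [3.0, 4.0]))  # 5.0

# Jaccard uses the module's own set-based implementation, reported as 1 - similarity.
d = metrics['jaccard']['distance']([1, 2, 3], [2, 3, 4])         # 1 - 2/4 = 0.5
print(metrics['jaccard']['distance_valid'](d))                   # True: counts as a valid neighbor

# For an HDF5 file opened via get_dataset() above, dataset_transform() returns
# (train, test) as numpy arrays for dense data, or lists of index lists for sparse data.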