redisbench-admin 0.11.66__py3-none-any.whl → 0.11.68__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. redisbench_admin/run/args.py +1 -0
  2. redisbench_admin/run/cluster.py +1 -3
  3. redisbench_admin/run_remote/remote_db.py +3 -1
  4. redisbench_admin/run_remote/remote_helpers.py +27 -11
  5. redisbench_admin/run_remote/run_remote.py +11 -8
  6. redisbench_admin/run_remote/standalone.py +6 -2
  7. redisbench_admin/utils/benchmark_config.py +6 -2
  8. redisbench_admin/utils/local.py +4 -2
  9. redisbench_admin/utils/remote.py +81 -33
  10. {redisbench_admin-0.11.66.dist-info → redisbench_admin-0.11.68.dist-info}/METADATA +5 -2
  11. redisbench_admin-0.11.68.dist-info/RECORD +117 -0
  12. {redisbench_admin-0.11.66.dist-info → redisbench_admin-0.11.68.dist-info}/WHEEL +1 -1
  13. redisbench_admin/run/ann/pkg/.dockerignore +0 -2
  14. redisbench_admin/run/ann/pkg/.git +0 -1
  15. redisbench_admin/run/ann/pkg/.github/workflows/benchmarks.yml +0 -100
  16. redisbench_admin/run/ann/pkg/.gitignore +0 -21
  17. redisbench_admin/run/ann/pkg/LICENSE +0 -21
  18. redisbench_admin/run/ann/pkg/README.md +0 -157
  19. redisbench_admin/run/ann/pkg/algos.yaml +0 -1294
  20. redisbench_admin/run/ann/pkg/algosP.yaml +0 -67
  21. redisbench_admin/run/ann/pkg/ann_benchmarks/__init__.py +0 -2
  22. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/__init__.py +0 -0
  23. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/annoy.py +0 -26
  24. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/balltree.py +0 -22
  25. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/base.py +0 -36
  26. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/bruteforce.py +0 -110
  27. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/ckdtree.py +0 -17
  28. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/datasketch.py +0 -29
  29. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/definitions.py +0 -187
  30. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/diskann.py +0 -190
  31. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/dolphinnpy.py +0 -31
  32. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/dummy_algo.py +0 -25
  33. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/elasticsearch.py +0 -107
  34. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/elastiknn.py +0 -124
  35. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/faiss.py +0 -124
  36. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/faiss_gpu.py +0 -61
  37. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/faiss_hnsw.py +0 -39
  38. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/flann.py +0 -27
  39. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/hnswlib.py +0 -36
  40. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/kdtree.py +0 -22
  41. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/kgraph.py +0 -39
  42. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/lshf.py +0 -25
  43. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/milvus.py +0 -99
  44. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/mrpt.py +0 -41
  45. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/n2.py +0 -28
  46. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/nearpy.py +0 -48
  47. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/nmslib.py +0 -74
  48. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/onng_ngt.py +0 -100
  49. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/opensearchknn.py +0 -107
  50. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/panng_ngt.py +0 -79
  51. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/pinecone.py +0 -39
  52. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/puffinn.py +0 -45
  53. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/pynndescent.py +0 -115
  54. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/qg_ngt.py +0 -102
  55. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/redisearch.py +0 -90
  56. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/rpforest.py +0 -20
  57. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/scann.py +0 -34
  58. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/sptag.py +0 -28
  59. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/subprocess.py +0 -246
  60. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/vald.py +0 -149
  61. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/vecsim-hnsw.py +0 -43
  62. redisbench_admin/run/ann/pkg/ann_benchmarks/algorithms/vespa.py +0 -47
  63. redisbench_admin/run/ann/pkg/ann_benchmarks/constants.py +0 -1
  64. redisbench_admin/run/ann/pkg/ann_benchmarks/data.py +0 -48
  65. redisbench_admin/run/ann/pkg/ann_benchmarks/datasets.py +0 -620
  66. redisbench_admin/run/ann/pkg/ann_benchmarks/distance.py +0 -53
  67. redisbench_admin/run/ann/pkg/ann_benchmarks/main.py +0 -325
  68. redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/__init__.py +0 -2
  69. redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/metrics.py +0 -183
  70. redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/plot_variants.py +0 -17
  71. redisbench_admin/run/ann/pkg/ann_benchmarks/plotting/utils.py +0 -165
  72. redisbench_admin/run/ann/pkg/ann_benchmarks/results.py +0 -71
  73. redisbench_admin/run/ann/pkg/ann_benchmarks/runner.py +0 -333
  74. redisbench_admin/run/ann/pkg/create_dataset.py +0 -12
  75. redisbench_admin/run/ann/pkg/create_hybrid_dataset.py +0 -147
  76. redisbench_admin/run/ann/pkg/create_text_to_image_ds.py +0 -117
  77. redisbench_admin/run/ann/pkg/create_website.py +0 -272
  78. redisbench_admin/run/ann/pkg/install/Dockerfile +0 -11
  79. redisbench_admin/run/ann/pkg/install/Dockerfile.annoy +0 -5
  80. redisbench_admin/run/ann/pkg/install/Dockerfile.datasketch +0 -4
  81. redisbench_admin/run/ann/pkg/install/Dockerfile.diskann +0 -29
  82. redisbench_admin/run/ann/pkg/install/Dockerfile.diskann_pq +0 -31
  83. redisbench_admin/run/ann/pkg/install/Dockerfile.dolphinn +0 -5
  84. redisbench_admin/run/ann/pkg/install/Dockerfile.elasticsearch +0 -45
  85. redisbench_admin/run/ann/pkg/install/Dockerfile.elastiknn +0 -61
  86. redisbench_admin/run/ann/pkg/install/Dockerfile.faiss +0 -18
  87. redisbench_admin/run/ann/pkg/install/Dockerfile.flann +0 -10
  88. redisbench_admin/run/ann/pkg/install/Dockerfile.hnswlib +0 -10
  89. redisbench_admin/run/ann/pkg/install/Dockerfile.kgraph +0 -6
  90. redisbench_admin/run/ann/pkg/install/Dockerfile.mih +0 -4
  91. redisbench_admin/run/ann/pkg/install/Dockerfile.milvus +0 -27
  92. redisbench_admin/run/ann/pkg/install/Dockerfile.mrpt +0 -4
  93. redisbench_admin/run/ann/pkg/install/Dockerfile.n2 +0 -5
  94. redisbench_admin/run/ann/pkg/install/Dockerfile.nearpy +0 -5
  95. redisbench_admin/run/ann/pkg/install/Dockerfile.ngt +0 -13
  96. redisbench_admin/run/ann/pkg/install/Dockerfile.nmslib +0 -10
  97. redisbench_admin/run/ann/pkg/install/Dockerfile.opensearchknn +0 -43
  98. redisbench_admin/run/ann/pkg/install/Dockerfile.puffinn +0 -6
  99. redisbench_admin/run/ann/pkg/install/Dockerfile.pynndescent +0 -4
  100. redisbench_admin/run/ann/pkg/install/Dockerfile.redisearch +0 -18
  101. redisbench_admin/run/ann/pkg/install/Dockerfile.rpforest +0 -5
  102. redisbench_admin/run/ann/pkg/install/Dockerfile.scann +0 -5
  103. redisbench_admin/run/ann/pkg/install/Dockerfile.scipy +0 -4
  104. redisbench_admin/run/ann/pkg/install/Dockerfile.sklearn +0 -4
  105. redisbench_admin/run/ann/pkg/install/Dockerfile.sptag +0 -30
  106. redisbench_admin/run/ann/pkg/install/Dockerfile.vald +0 -8
  107. redisbench_admin/run/ann/pkg/install/Dockerfile.vespa +0 -17
  108. redisbench_admin/run/ann/pkg/install.py +0 -70
  109. redisbench_admin/run/ann/pkg/logging.conf +0 -34
  110. redisbench_admin/run/ann/pkg/multirun.py +0 -298
  111. redisbench_admin/run/ann/pkg/plot.py +0 -159
  112. redisbench_admin/run/ann/pkg/protocol/bf-runner +0 -10
  113. redisbench_admin/run/ann/pkg/protocol/bf-runner.py +0 -204
  114. redisbench_admin/run/ann/pkg/protocol/ext-add-query-metric.md +0 -51
  115. redisbench_admin/run/ann/pkg/protocol/ext-batch-queries.md +0 -77
  116. redisbench_admin/run/ann/pkg/protocol/ext-prepared-queries.md +0 -77
  117. redisbench_admin/run/ann/pkg/protocol/ext-query-parameters.md +0 -47
  118. redisbench_admin/run/ann/pkg/protocol/specification.md +0 -194
  119. redisbench_admin/run/ann/pkg/requirements.txt +0 -14
  120. redisbench_admin/run/ann/pkg/requirements_py38.txt +0 -11
  121. redisbench_admin/run/ann/pkg/results/fashion-mnist-784-euclidean.png +0 -0
  122. redisbench_admin/run/ann/pkg/results/gist-960-euclidean.png +0 -0
  123. redisbench_admin/run/ann/pkg/results/glove-100-angular.png +0 -0
  124. redisbench_admin/run/ann/pkg/results/glove-25-angular.png +0 -0
  125. redisbench_admin/run/ann/pkg/results/lastfm-64-dot.png +0 -0
  126. redisbench_admin/run/ann/pkg/results/mnist-784-euclidean.png +0 -0
  127. redisbench_admin/run/ann/pkg/results/nytimes-256-angular.png +0 -0
  128. redisbench_admin/run/ann/pkg/results/sift-128-euclidean.png +0 -0
  129. redisbench_admin/run/ann/pkg/run.py +0 -12
  130. redisbench_admin/run/ann/pkg/run_algorithm.py +0 -3
  131. redisbench_admin/run/ann/pkg/templates/chartjs.template +0 -102
  132. redisbench_admin/run/ann/pkg/templates/detail_page.html +0 -23
  133. redisbench_admin/run/ann/pkg/templates/general.html +0 -58
  134. redisbench_admin/run/ann/pkg/templates/latex.template +0 -30
  135. redisbench_admin/run/ann/pkg/templates/summary.html +0 -60
  136. redisbench_admin/run/ann/pkg/test/__init__.py +0 -0
  137. redisbench_admin/run/ann/pkg/test/test-jaccard.py +0 -19
  138. redisbench_admin/run/ann/pkg/test/test-metrics.py +0 -99
  139. redisbench_admin-0.11.66.dist-info/RECORD +0 -243
  140. {redisbench_admin-0.11.66.dist-info → redisbench_admin-0.11.68.dist-info}/entry_points.txt +0 -0
  141. {redisbench_admin-0.11.66.dist-info → redisbench_admin-0.11.68.dist-info/licenses}/LICENSE +0 -0
redisbench_admin/run/ann/pkg/ann_benchmarks/datasets.py (removed)
@@ -1,620 +0,0 @@
- from copyreg import pickle
- import h5py
- import numpy
- import os
- import random
-
- from urllib.request import urlopen
- from urllib.request import urlretrieve
-
- from ann_benchmarks.distance import dataset_transform
- import urllib.parse
-
-
- def download(src, dst):
-     if not os.path.exists(dst):
-         # TODO: should be atomic
-         print('downloading %s -> %s...' % (src, dst))
-         urlretrieve(src, dst)
-
-
- def get_dataset_fn(dataset):
-     if not os.path.exists('data'):
-         try:
-             os.mkdir('data')
-         except FileExistsError:
-             pass # fixes race condition
-     return os.path.join('data', '%s.hdf5' % dataset)
-
-
- def get_dataset(which):
-     hdf5_fn = get_dataset_fn(which)
-     try:
-         if 'dbpedia' in which:
-             url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/dbpedia/dbpedia-768.hdf5'
-         elif 'amazon-reviews' in which:
-             url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/amazon-reviews-384.hdf5'
-         elif 'hybrid' in which:
-             url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which)
-         elif 'Text-to-Image' in which:
-             url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/big_ann/%s.hdf5' % urllib.parse.quote(which)
-         else:
-             url = 'http://ann-benchmarks.com/%s.hdf5' % which
-         download(url, hdf5_fn)
-     except:
-         print("Cannot download %s" % url)
-         if which in DATASETS:
-             print("Creating dataset locally")
-             DATASETS[which](hdf5_fn)
-     hdf5_f = h5py.File(hdf5_fn, 'r')
-
-     # here for backward compatibility, to ensure old datasets can still be used with newer versions
-     # cast to integer because the json parser (later on) cannot interpret numpy integers
-     dimension = int(hdf5_f.attrs['dimension']) if 'dimension' in hdf5_f.attrs else len(hdf5_f['train'][0])
-
-     return hdf5_f, dimension
-
- # Everything below this line is related to creating datasets
- # You probably never need to do this at home,
- # just rely on the prepared datasets at http://ann-benchmarks.com
-
-
- def write_output(train, test, fn, distance, point_type='float', count=100):
-     from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
-     n = 0
-     f = h5py.File(fn, 'w')
-     f.attrs['type'] = 'dense'
-     f.attrs['distance'] = distance
-     f.attrs['dimension'] = len(train[0])
-     f.attrs['point_type'] = point_type
-     print('train size: %9d * %4d' % train.shape)
-     print('test size: %9d * %4d' % test.shape)
-     f.create_dataset('train', (len(train), len(
-         train[0])), dtype=train.dtype)[:] = train
-     f.create_dataset('test', (len(test), len(
-         test[0])), dtype=test.dtype)[:] = test
-     neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
-     distances = f.create_dataset('distances', (len(test), count), dtype='f')
-     bf = BruteForceBLAS(distance, precision=train.dtype)
-
-     bf.fit(train)
-     for i, x in enumerate(test):
-         if i % 1000 == 0:
-             print('%d/%d...' % (i, len(test)))
-         res = list(bf.query_with_distances(x, count))
-         res.sort(key=lambda t: t[-1])
-         neighbors[i] = [j for j, _ in res]
-         distances[i] = [d for _, d in res]
-     f.close()
-
- """
- param: train and test are arrays of arrays of indices.
- """
- def write_sparse_output(train, test, fn, distance, dimension, count=100):
-     from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
-     f = h5py.File(fn, 'w')
-     f.attrs['type'] = 'sparse'
-     f.attrs['distance'] = distance
-     f.attrs['dimension'] = dimension
-     f.attrs['point_type'] = 'bit'
-     print('train size: %9d * %4d' % (train.shape[0], dimension))
-     print('test size: %9d * %4d' % (test.shape[0], dimension))
-
-     # We ensure the sets are sorted
-     train = numpy.array(list(map(sorted, train)))
-     test = numpy.array(list(map(sorted, test)))
-
-     flat_train = numpy.hstack(train.flatten())
-     flat_test = numpy.hstack(test.flatten())
-
-     f.create_dataset('train', (len(flat_train),), dtype=flat_train.dtype)[:] = flat_train
-     f.create_dataset('test', (len(flat_test),), dtype=flat_test.dtype)[:] = flat_test
-     neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
-     distances = f.create_dataset('distances', (len(test), count), dtype='f')
-
-     f.create_dataset('size_test', (len(test),), dtype='i')[:] = list(map(len, test))
-     f.create_dataset('size_train', (len(train),), dtype='i')[:] = list(map(len, train))
-
-     bf = BruteForceBLAS(distance, precision=train.dtype)
-     bf.fit(train)
-     for i, x in enumerate(test):
-         if i % 1000 == 0:
-             print('%d/%d...' % (i, len(test)))
-         res = list(bf.query_with_distances(x, count))
-         res.sort(key=lambda t: t[-1])
-         neighbors[i] = [j for j, _ in res]
-         distances[i] = [d for _, d in res]
-     f.close()
-
- def train_test_split(X, test_size=10000, dimension=None):
-     import sklearn.model_selection
-     if dimension == None:
-         dimension = X.shape[1]
-     print('Splitting %d*%d into train/test' % (X.shape[0], dimension))
-     return sklearn.model_selection.train_test_split(
-         X, test_size=test_size, random_state=1)
-
-
- def glove(out_fn, d):
-     import zipfile
-
-     url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
-     fn = os.path.join('data', 'glove.twitter.27B.zip')
-     download(url, fn)
-     with zipfile.ZipFile(fn) as z:
-         print('preparing %s' % out_fn)
-         z_fn = 'glove.twitter.27B.%dd.txt' % d
-         X = []
-         for line in z.open(z_fn):
-             v = [float(x) for x in line.strip().split()[1:]]
-             X.append(numpy.array(v))
-         X_train, X_test = train_test_split(X)
-         write_output(numpy.array(X_train), numpy.array(
-             X_test), out_fn, 'angular')
-
-
- def _load_texmex_vectors(f, n, k):
-     import struct
-
-     v = numpy.zeros((n, k))
-     for i in range(n):
-         f.read(4) # ignore vec length
-         v[i] = struct.unpack('f' * k, f.read(k * 4))
-
-     return v
-
-
- def _get_irisa_matrix(t, fn):
-     import struct
-     m = t.getmember(fn)
-     f = t.extractfile(m)
-     k, = struct.unpack('i', f.read(4))
-     n = m.size // (4 + 4 * k)
-     f.seek(0)
-     return _load_texmex_vectors(f, n, k)
-
-
- def sift(out_fn):
-     import tarfile
-
-     url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
-     fn = os.path.join('data', 'sift.tar.tz')
-     download(url, fn)
-     with tarfile.open(fn, 'r:gz') as t:
-         train = _get_irisa_matrix(t, 'sift/sift_base.fvecs')
-         test = _get_irisa_matrix(t, 'sift/sift_query.fvecs')
-         write_output(train, test, out_fn, 'euclidean')
-
-
- def gist(out_fn):
-     import tarfile
-
-     url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz'
-     fn = os.path.join('data', 'gist.tar.tz')
-     download(url, fn)
-     with tarfile.open(fn, 'r:gz') as t:
-         train = _get_irisa_matrix(t, 'gist/gist_base.fvecs')
-         test = _get_irisa_matrix(t, 'gist/gist_query.fvecs')
-         write_output(train, test, out_fn, 'euclidean')
-
-
- def _load_mnist_vectors(fn):
-     import gzip
-     import struct
-
-     print('parsing vectors in %s...' % fn)
-     f = gzip.open(fn)
-     type_code_info = {
-         0x08: (1, "!B"),
-         0x09: (1, "!b"),
-         0x0B: (2, "!H"),
-         0x0C: (4, "!I"),
-         0x0D: (4, "!f"),
-         0x0E: (8, "!d")
-     }
-     magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
-     assert magic == 0
-     assert type_code in type_code_info
-
-     dimensions = [struct.unpack("!I", f.read(4))[0]
-                   for i in range(dim_count)]
-
-     entry_count = dimensions[0]
-     entry_size = numpy.product(dimensions[1:])
-
-     b, format_string = type_code_info[type_code]
-     vectors = []
-     for i in range(entry_count):
-         vectors.append([struct.unpack(format_string, f.read(b))[0]
-                         for j in range(entry_size)])
-     return numpy.array(vectors)
-
-
- def mnist(out_fn):
-     download(
-         'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') # noqa
-     download(
-         'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') # noqa
-     train = _load_mnist_vectors('mnist-train.gz')
-     test = _load_mnist_vectors('mnist-test.gz')
-     write_output(train, test, out_fn, 'euclidean')
-
-
- def fashion_mnist(out_fn):
-     download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', # noqa
-              'fashion-mnist-train.gz')
-     download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', # noqa
-              'fashion-mnist-test.gz')
-     train = _load_mnist_vectors('fashion-mnist-train.gz')
-     test = _load_mnist_vectors('fashion-mnist-test.gz')
-     write_output(train, test, out_fn, 'euclidean')
-
- # Creates a 'deep image descriptor' dataset using the 'deep10M.fvecs' sample
- # from http://sites.skoltech.ru/compvision/noimi/. The download logic is adapted
- # from the script https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py.
- def deep_image(out_fn):
-     yadisk_key = 'https://yadi.sk/d/11eDCm7Dsn9GA'
-     response = urlopen('https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=' \
-                        + yadisk_key + '&path=/deep10M.fvecs')
-     response_body = response.read().decode("utf-8")
-
-     dataset_url = response_body.split(',')[0][9:-1]
-     filename = os.path.join('data', 'deep-image.fvecs')
-     download(dataset_url, filename)
-
-     # In the fvecs file format, each vector is stored by first writing its
-     # length as an integer, then writing its components as floats.
-     fv = numpy.fromfile(filename, dtype=numpy.float32)
-     dim = fv.view(numpy.int32)[0]
-     fv = fv.reshape(-1, dim + 1)[:, 1:]
-
-     X_train, X_test = train_test_split(fv)
-     write_output(X_train, X_test, out_fn, 'angular')
-
- def transform_bag_of_words(filename, n_dimensions, out_fn):
-     import gzip
-     from scipy.sparse import lil_matrix
-     from sklearn.feature_extraction.text import TfidfTransformer
-     from sklearn import random_projection
-     with gzip.open(filename, 'rb') as f:
-         file_content = f.readlines()
-         entries = int(file_content[0])
-         words = int(file_content[1])
-         file_content = file_content[3:] # strip first three entries
-         print("building matrix...")
-         A = lil_matrix((entries, words))
-         for e in file_content:
-             doc, word, cnt = [int(v) for v in e.strip().split()]
-             A[doc - 1, word - 1] = cnt
-         print("normalizing matrix entries with tfidf...")
-         B = TfidfTransformer().fit_transform(A)
-         print("reducing dimensionality...")
-         C = random_projection.GaussianRandomProjection(
-             n_components=n_dimensions).fit_transform(B)
-         X_train, X_test = train_test_split(C)
-         write_output(numpy.array(X_train), numpy.array(
-             X_test), out_fn, 'angular')
-
-
- def nytimes(out_fn, n_dimensions):
-     fn = 'nytimes_%s.txt.gz' % n_dimensions
-     download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) # noqa
-     transform_bag_of_words(fn, n_dimensions, out_fn)
-
-
- def random_float(out_fn, n_dims, n_samples, centers, distance):
-     import sklearn.datasets
-
-     X, _ = sklearn.datasets.make_blobs(
-         n_samples=n_samples, n_features=n_dims,
-         centers=centers, random_state=1)
-     X_train, X_test = train_test_split(X, test_size=0.1)
-     write_output(X_train, X_test, out_fn, distance)
-
-
- def random_bitstring(out_fn, n_dims, n_samples, n_queries):
-     import sklearn.datasets
-
-     Y, _ = sklearn.datasets.make_blobs(
-         n_samples=n_samples, n_features=n_dims,
-         centers=n_queries, random_state=1)
-     X = numpy.zeros((n_samples, n_dims), dtype=numpy.bool)
-     for i, vec in enumerate(Y):
-         X[i] = numpy.array([v > 0 for v in vec], dtype=numpy.bool)
-
-     X_train, X_test = train_test_split(X, test_size=n_queries)
-     write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-
- def word2bits(out_fn, path, fn):
-     import tarfile
-     local_fn = fn + '.tar.gz'
-     url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % ( # noqa
-         path, fn)
-     download(url, local_fn)
-     print('parsing vectors in %s...' % local_fn)
-     with tarfile.open(local_fn, 'r:gz') as t:
-         f = t.extractfile(fn)
-         n_words, k = [int(z) for z in next(f).strip().split()]
-         X = numpy.zeros((n_words, k), dtype=numpy.bool)
-         for i in range(n_words):
-             X[i] = numpy.array([float(z) > 0 for z in next(
-                 f).strip().split()[1:]], dtype=numpy.bool)
-
-         X_train, X_test = train_test_split(X, test_size=1000)
-         write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-
- def sift_hamming(out_fn, fn):
-     import tarfile
-     local_fn = fn + '.tar.gz'
-     url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
-     download(url, local_fn)
-     print('parsing vectors in %s...' % local_fn)
-     with tarfile.open(local_fn, 'r:gz') as t:
-         f = t.extractfile(fn)
-         lines = f.readlines()
-         X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
-         for i, line in enumerate(lines):
-             X[i] = numpy.array(
-                 [int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
-         X_train, X_test = train_test_split(X, test_size=1000)
-         write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
- def kosarak(out_fn):
-     import gzip
-     local_fn = 'kosarak.dat.gz'
-     # only consider sets with at least min_elements many elements
-     min_elements = 20
-     url = 'http://fimi.uantwerpen.be/data/%s' % local_fn
-     download(url, local_fn)
-
-     X = []
-     dimension = 0
-     with gzip.open('kosarak.dat.gz', 'r') as f:
-         content = f.readlines()
-         # preprocess data to find sets with more than 20 elements
-         # keep track of used ids for reenumeration
-         for line in content:
-             if len(line.split()) >= min_elements:
-                 X.append(list(map(int, line.split())))
-                 dimension = max(dimension, max(X[-1]) + 1)
-
-     X_train, X_test = train_test_split(numpy.array(X), test_size=500, dimension=dimension)
-     write_sparse_output(X_train, X_test, out_fn, 'jaccard', dimension)
-
- def random_jaccard(out_fn, n=10000, size=50, universe=80):
-     random.seed(1)
-     l = list(range(universe))
-     X = []
-     for i in range(n):
-         X.append(random.sample(l, size))
-
-     X_train, X_test = train_test_split(numpy.array(X), test_size=100, dimension=universe)
-     write_sparse_output(X_train, X_test, out_fn, 'jaccard', universe)
-
-
-
- def lastfm(out_fn, n_dimensions, test_size=50000):
-     # This tests out ANN methods for retrieval on simple matrix factorization
-     # based recommendation algorithms. The idea being that the query/test
-     # vectors are user factors and the train set are item factors from
-     # the matrix factorization model.
-
-     # Since the predictor is a dot product, we transform the factors first
-     # as described in this
-     # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf # noqa
-     # This hopefully replicates the experiments done in this post:
-     # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ # noqa
-
-     # The dataset is from "Last.fm Dataset - 360K users":
-     # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html # noqa
-
-     # This requires the implicit package to generate the factors
-     # (on my desktop/gpu this only takes 4-5 seconds to train - but
-     # could take 1-2 minutes on a laptop)
-     from implicit.datasets.lastfm import get_lastfm
-     from implicit.approximate_als import augment_inner_product_matrix
-     import implicit
-
-     # train an als model on the lastfm data
-     _, _, play_counts = get_lastfm()
-     model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
-     model.fit(implicit.nearest_neighbours.bm25_weight(
-         play_counts, K1=100, B=0.8))
-
-     # transform item factors so that each one has the same norm,
-     # and transform the user factors such by appending a 0 column
-     _, item_factors = augment_inner_product_matrix(model.item_factors)
-     user_factors = numpy.append(model.user_factors,
-                                 numpy.zeros((model.user_factors.shape[0], 1)),
-                                 axis=1)
-
-     # only query the first 50k users (speeds things up signficantly
-     # without changing results)
-     user_factors = user_factors[:test_size]
-
-     # after that transformation a cosine lookup will return the same results
-     # as the inner product on the untransformed data
-     write_output(item_factors, user_factors, out_fn, 'angular')
-
- def parse_dbpedia_data(source_file, max_docs: int):
-     import re
-     """
-     Parses the input file of abstracts and returns an iterable
-     :param max_docs: maximum number of input documents to process; -1 for no limit
-     :param source_file: input file
-     :return: yields document by document to the consumer
-     """
-     global VERBOSE
-     count = 0
-     max_tokens = 0
-
-     if -1 < max_docs < 50:
-         VERBOSE = True
-
-     percent = 0.1
-     bulk_size = (percent / 100) * max_docs
-
-     print(f"bulk_size={bulk_size}")
-
-     if bulk_size <= 0:
-         bulk_size = 1000
-
-     for line in source_file:
-         line = line.decode("utf-8")
-
-         # skip commented out lines
-         comment_regex = '^#'
-         if re.search(comment_regex, line):
-             continue
-
-         token_size = len(line.split())
-         if token_size > max_tokens:
-             max_tokens = token_size
-
-         # skip lines with 20 tokens or less, because they tend to contain noise
-         # (this may vary in your dataset)
-         if token_size <= 20:
-             continue
-
-         first_url_regex = '^<([^\>]+)>\s*'
-
-         x = re.search(first_url_regex, line)
-         if x:
-             url = x.group(1)
-             # also remove the url from the string
-             line = re.sub(first_url_regex, '', line)
-         else:
-             url = ''
-
-         # remove the second url from the string: we don't need to capture it, because it is repetitive across
-         # all abstracts
-         second_url_regex = '^<[^\>]+>\s*'
-         line = re.sub(second_url_regex, '', line)
-
-         # remove some strange line ending, that occurs in many abstracts
-         language_at_ending_regex = '@en \.\n$'
-         line = re.sub(language_at_ending_regex, '', line)
-
-         # form the input object for this abstract
-         doc = {
-             "_text_": line,
-             "url": url,
-             "id": count+1
-         }
-
-         yield doc
-         count += 1
-
-         if count % bulk_size == 0:
-             print(f"Processed {count} documents", end="\r")
-
-         if count == max_docs:
-             break
-
-     source_file.close()
-     print("Maximum tokens observed per abstract: {}".format(max_tokens))
-
- def dbpedia(out_fn):
-     import bz2
-     from sentence_transformers import SentenceTransformer
-     import torch
-     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-     print(device)
-     local_fn = "long_abstracts_en.ttl.bz2"
-     url = "http://downloads.dbpedia.org/2016-10/core-i18n/en/long_abstracts_en.ttl.bz2"
-     download(url, local_fn)
-     source_file = bz2.BZ2File(local_fn, "r")
-     docs_iter = parse_dbpedia_data(source_file=source_file, max_docs=1000000)
-     text = []
-     for doc in docs_iter:
-         text.append(doc['_text_'])
-     model = SentenceTransformer('bert-base-nli-mean-tokens')
-     model.to(device)
-     sentence_embeddings = model.encode(text, show_progress_bar=True)
-     write_output(sentence_embeddings, sentence_embeddings[:10000], out_fn, 'angular')
-
-
- def amazon_reviews(out_fn):
-     import os
-     import math
-     import pickle
-     import numpy as np
-     subsets = ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01', 'Books_v1_01', 'Books_v1_02']
-     train_set = None
-     test_set = None
-     for i, subset in enumerate(subsets):
-         url = f'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/{subset}_embeddings'
-         local_fn = f'{subset}_embeddings'
-         download(url, local_fn)
-         subset_embeddings = pickle.load(open(local_fn, "rb"))
-         if i==0:
-             train_set = subset_embeddings
-             test_set = subset_embeddings[:math.ceil(10000/len(subsets))]
-         else:
-             train_set = np.append(train_set, subset_embeddings, axis =0)
-             test_set = np.append(test_set, subset_embeddings[:math.ceil(10000/len(subsets))], axis=0)
-         print(subset_embeddings.shape)
-         print(train_set.shape)
-         print(test_set.shape)
-         os.remove(local_fn)
-     write_output(train_set, test_set[:10000], out_fn, 'angular')
-
-
- DATASETS = {
-     'deep-image-96-angular': deep_image,
-     'fashion-mnist-784-euclidean': fashion_mnist,
-     'gist-960-euclidean': gist,
-     'glove-25-angular': lambda out_fn: glove(out_fn, 25),
-     'glove-50-angular': lambda out_fn: glove(out_fn, 50),
-     'glove-100-angular': lambda out_fn: glove(out_fn, 100),
-     'glove-200-angular': lambda out_fn: glove(out_fn, 200),
-     'mnist-784-euclidean': mnist,
-     'random-xs-20-euclidean': lambda out_fn: random_float(out_fn, 20, 10000, 100,
-                                                           'euclidean'),
-     'random-s-100-euclidean': lambda out_fn: random_float(out_fn, 100, 100000, 1000,
-                                                           'euclidean'),
-     'random-xs-20-angular': lambda out_fn: random_float(out_fn, 20, 10000, 100,
-                                                         'angular'),
-     'random-s-100-angular': lambda out_fn: random_float(out_fn, 100, 100000, 1000,
-                                                         'angular'),
-     'random-xs-16-hamming': lambda out_fn: random_bitstring(out_fn, 16, 10000,
-                                                             100),
-     'random-s-128-hamming': lambda out_fn: random_bitstring(out_fn, 128,
-                                                             50000, 1000),
-     'random-l-256-hamming': lambda out_fn: random_bitstring(out_fn, 256,
-                                                             100000, 1000),
-     'random-s-jaccard': lambda out_fn: random_jaccard(out_fn, n=10000,
-                                                       size=20, universe=40),
-     'random-l-jaccard': lambda out_fn: random_jaccard(out_fn, n=100000,
-                                                       size=70, universe=100),
-     'sift-128-euclidean': sift,
-     'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
-     'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
-     'word2bits-800-hamming': lambda out_fn: word2bits(
-         out_fn, '400K',
-         'w2b_bitlevel1_size800_vocab400K'),
-     'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
-     'sift-256-hamming': lambda out_fn: sift_hamming(
-         out_fn, 'sift.hamming.256'),
-     'kosarak-jaccard': lambda out_fn: kosarak(out_fn),
-     'dbpedia-768' : lambda out_fn: dbpedia(out_fn),
-     'amazon-reviews-384': lambda out_fn: amazon_reviews(out_fn),
- }
-
-
-
-
- big_ann_datasets = [f'Text-to-Image-{x}' for x in ['10M', '20M', '30M', '40M', '50M', '60M', '70M', '80M', '90M', '100M']]
- for dataset in big_ann_datasets:
-     DATASETS[dataset] = lambda fn: ()
-
-
- hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular', 'fashion-mnist-784-euclidean']
- hybrid_datasets.extend(big_ann_datasets)
- percentiles= ['0.5', '1', '2', '5', '10', '20', '50']
- for dataset in hybrid_datasets:
-     for percentile in percentiles:
-         DATASETS[f'{dataset}-hybrid-{percentile}'] = lambda fn: ()
-
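For context, the module removed above was the dataset loader of the bundled ann-benchmarks harness. The sketch below is an illustration only, not part of the diff: the dataset name and the print call are arbitrary, and it assumes the module is still importable as ann_benchmarks.datasets (as it was in 0.11.66). It shows how get_dataset was typically driven: it downloads the HDF5 file into data/ if missing, opens it, and returns the file handle plus the vector dimension.

# Hypothetical usage sketch of the deleted datasets module (no longer shipped in 0.11.68)
from ann_benchmarks.datasets import get_dataset

hdf5_f, dimension = get_dataset('glove-25-angular')  # fetches data/glove-25-angular.hdf5 if absent
train = hdf5_f['train']          # base vectors, as written by write_output()
test = hdf5_f['test']            # query vectors
neighbors = hdf5_f['neighbors']  # ground-truth neighbor ids, 100 per query by default
print(dimension, train.shape, test.shape)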
redisbench_admin/run/ann/pkg/ann_benchmarks/distance.py (removed)
@@ -1,53 +0,0 @@
- from __future__ import absolute_import
- from scipy.spatial.distance import pdist as scipy_pdist
- import itertools
- import numpy as np
-
- def pdist(a, b, metric):
-     return scipy_pdist([a, b], metric=metric)[0]
-
- # Need own implementation of jaccard because scipy's
- # implementation is different
-
- def jaccard(a, b):
-     if len(a) == 0 or len(b) == 0:
-         return 0
-     intersect = len(set(a) & set(b))
-     return intersect / (float)(len(a) + len(b) - intersect)
-
- metrics = {
-     'hamming': {
-         'distance': lambda a, b: pdist(a, b, "hamming"),
-         'distance_valid': lambda a: True
-     },
-     # return 1 - jaccard similarity, because smaller distances are better.
-     'jaccard': {
-         'distance': lambda a, b: 1 - jaccard(a, b),
-         'distance_valid': lambda a: a < 1 - 1e-5
-     },
-     'euclidean': {
-         'distance': lambda a, b: pdist(a, b, "euclidean"),
-         'distance_valid': lambda a: True
-     },
-     'angular': {
-         'distance': lambda a, b: pdist(a, b, "cosine"),
-         'distance_valid': lambda a: True
-     }
- }
-
- def sparse_to_lists(data, lengths):
-     X = []
-     index = 0
-     for l in lengths:
-         X.append(data[index:index+l])
-         index += l
-
-     return X
-
- def dataset_transform(dataset):
-     if dataset.attrs.get('type', 'dense') != 'sparse':
-         return np.array(dataset['train']), np.array(dataset['test'])
-
-     # we store the dataset as a list of integers, accompanied by a list of lengths in hdf5
-     # so we transform it back to the format expected by the algorithms here (array of array of ints)
-     return sparse_to_lists(dataset['train'], dataset['size_train']), sparse_to_lists(dataset['test'], dataset['size_test'])
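The removed distance.py above is small enough to read in full: each entry in its metrics table pairs a distance function with a validity predicate, and the dense metrics all route through the two-point pdist wrapper. A minimal, standalone restatement of that wrapper (illustration only; the input vectors below are made up and not from the diff) is:

# Illustration: exercising the two-point pdist wrapper defined in the deleted module.
import numpy as np
from scipy.spatial.distance import pdist as scipy_pdist

def pdist(a, b, metric):
    # distance between exactly two points, as in the removed distance.py
    return scipy_pdist([a, b], metric=metric)[0]

a = np.array([1.0, 0.0, 0.0])
b = np.array([0.0, 1.0, 0.0])
print(pdist(a, b, "euclidean"))  # sqrt(2) ~= 1.4142, the 'euclidean' entry
print(pdist(a, b, "cosine"))     # 1.0, which the 'angular' entry maps to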