vectordb-bench 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +55 -45
- vectordb_bench/backend/clients/__init__.py +85 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +1 -2
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +3 -4
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +112 -77
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +6 -7
- vectordb_bench/backend/clients/alloydb/alloydb.py +59 -84
- vectordb_bench/backend/clients/alloydb/cli.py +51 -34
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +13 -24
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +50 -54
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +39 -40
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +24 -26
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +67 -58
- vectordb_bench/backend/clients/milvus/cli.py +41 -83
- vectordb_bench/backend/clients/milvus/config.py +18 -8
- vectordb_bench/backend/clients/milvus/milvus.py +19 -39
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +56 -77
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +34 -43
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +98 -104
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +39 -49
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +15 -25
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +41 -35
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +95 -62
- vectordb_bench/backend/clients/test/cli.py +2 -3
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +5 -9
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +37 -26
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +18 -14
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +56 -23
- vectordb_bench/backend/runner/rate_runner.py +30 -19
- vectordb_bench/backend/runner/read_write_runner.py +46 -22
- vectordb_bench/backend/runner/serial_runner.py +81 -46
- vectordb_bench/backend/runner/util.py +4 -3
- vectordb_bench/backend/task_runner.py +92 -92
- vectordb_bench/backend/utils.py +17 -10
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +51 -84
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +45 -24
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/METADATA +22 -15
- vectordb_bench-0.0.21.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.19.dist-info/RECORD +0 -135
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/runner/read_write_runner.py

```diff
@@ -1,16 +1,18 @@
+import concurrent
 import logging
-
+import math
 import multiprocessing as mp
-import
+from collections.abc import Iterable
+
 import numpy as np
-import math
 
-from .mp_runner import MultiProcessingSearchRunner
-from .serial_runner import SerialSearchRunner
-from .rate_runner import RatedMultiThreadingInsertRunner
 from vectordb_bench.backend.clients import api
 from vectordb_bench.backend.dataset import DatasetManager
 
+from .mp_runner import MultiProcessingSearchRunner
+from .rate_runner import RatedMultiThreadingInsertRunner
+from .serial_runner import SerialSearchRunner
+
 log = logging.getLogger(__name__)
 
 
@@ -24,8 +26,14 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
         k: int = 100,
         filters: dict | None = None,
         concurrencies: Iterable[int] = (1, 15, 50),
-        search_stage: Iterable[float] = (
-
+        search_stage: Iterable[float] = (
+            0.5,
+            0.6,
+            0.7,
+            0.8,
+            0.9,
+        ),  # search from insert portion, 0.0 means search from the start
+        read_dur_after_write: int = 300,  # seconds, search duration when insertion is done
         timeout: float | None = None,
     ):
         self.insert_rate = insert_rate
@@ -36,7 +44,10 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
         self.search_stage = sorted(search_stage)
         self.read_dur_after_write = read_dur_after_write
 
-        log.info(
+        log.info(
+            f"Init runner, concurencys={concurrencies}, search_stage={search_stage}, "
+            f"stage_search_dur={read_dur_after_write}"
+        )
 
         test_emb = np.stack(dataset.test_data["emb"])
         if normalize:
@@ -69,14 +80,17 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
         """Optimize needs to run in differenct process for pymilvus schema recursion problem"""
         with self.db.init():
             log.info("Search after write - Optimize start")
-            self.db.optimize()
+            self.db.optimize(data_size=self.data_volume)
             log.info("Search after write - Optimize finished")
 
     def run_search(self):
         log.info("Search after write - Serial search start")
         res, ssearch_dur = self.serial_search_runner.run()
         recall, ndcg, p99_latency = res
-        log.info(
+        log.info(
+            f"Search after write - Serial search - recall={recall}, ndcg={ndcg}, p99={p99_latency}, "
+            f"dur={ssearch_dur:.4f}",
+        )
         log.info(f"Search after wirte - Conc search start, dur for each conc={self.read_dur_after_write}")
         max_qps = self.run_by_dur(self.read_dur_after_write)
         log.info(f"Search after wirte - Conc search finished, max_qps={max_qps}")
@@ -86,7 +100,10 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
     def run_read_write(self):
         with mp.Manager() as m:
             q = m.Queue()
-            with concurrent.futures.ProcessPoolExecutor(
+            with concurrent.futures.ProcessPoolExecutor(
+                mp_context=mp.get_context("spawn"),
+                max_workers=2,
+            ) as executor:
                 read_write_futures = []
                 read_write_futures.append(executor.submit(self.run_with_rate, q))
                 read_write_futures.append(executor.submit(self.run_search_by_sig, q))
@@ -107,10 +124,10 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
                 except Exception as e:
                     log.warning(f"Read and write error: {e}")
                     executor.shutdown(wait=True, cancel_futures=True)
-                    raise e
+                    raise e from e
         log.info("Concurrent read write all done")
 
-    def run_search_by_sig(self, q):
+    def run_search_by_sig(self, q: mp.Queue):
         """
         Args:
             q: multiprocessing queue
@@ -122,15 +139,14 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
         total_batch = math.ceil(self.data_volume / self.insert_rate)
         recall, ndcg, p99_latency = None, None, None
 
-        def wait_next_target(start, target_batch) -> bool:
+        def wait_next_target(start: int, target_batch: int) -> bool:
            """Return False when receive True or None"""
            while start < target_batch:
                sig = q.get(block=True)

                if sig is None or sig is True:
                    return False
-
-                start += 1
+                start += 1
            return True

        for idx, stage in enumerate(self.search_stage):
@@ -140,18 +156,21 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
             got = wait_next_target(start_batch, target_batch)
             if got is False:
                 log.warning(f"Abnormal exit, target_batch={target_batch}, start_batch={start_batch}")
-                return
+                return None
 
             log.info(f"Insert {perc}% done, total batch={total_batch}")
             log.info(f"[{target_batch}/{total_batch}] Serial search - {perc}% start")
             res, ssearch_dur = self.serial_search_runner.run()
             recall, ndcg, p99_latency = res
-            log.info(
+            log.info(
+                f"[{target_batch}/{total_batch}] Serial search - {perc}% done, recall={recall}, "
+                f"ndcg={ndcg}, p99={p99_latency}, dur={ssearch_dur:.4f}"
+            )
 
             # Search duration for non-last search stage is carefully calculated.
             # If duration for each concurrency is less than 30s, runner will raise error.
             if idx < len(self.search_stage) - 1:
-                total_dur_between_stages = self.data_volume
+                total_dur_between_stages = self.data_volume * (self.search_stage[idx + 1] - stage) // self.insert_rate
                 csearch_dur = total_dur_between_stages - ssearch_dur
 
                 # Try to leave room for init process executors
@@ -159,14 +178,19 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
 
                 each_conc_search_dur = csearch_dur / len(self.concurrencies)
                 if each_conc_search_dur < 30:
-                    warning_msg =
+                    warning_msg = (
+                        f"Results might be inaccurate, duration[{csearch_dur:.4f}] left for conc-search is too short, "
+                        f"total available dur={total_dur_between_stages}, serial_search_cost={ssearch_dur}."
+                    )
                     log.warning(warning_msg)
 
             # The last stage
             else:
                 each_conc_search_dur = 60
 
-            log.info(
+            log.info(
+                f"[{target_batch}/{total_batch}] Concurrent search - {perc}% start, dur={each_conc_search_dur:.4f}"
+            )
             max_qps = self.run_by_dur(each_conc_search_dur)
             result.append((perc, max_qps, recall, ndcg, p99_latency))
 
```
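The reworked `run_search_by_sig` stage logic budgets concurrent-search time from the insert rate: the window between two stages is the number of rows inserted in that interval divided by `insert_rate`, minus the serial-search cost, and a warning is logged when less than 30 s per concurrency remains. A minimal sketch of that arithmetic with illustrative numbers (only `data_volume`, `insert_rate`, `search_stage`, and `concurrencies` mirror names from the diff):

```python
# Illustrative stand-ins; the real runner takes these from the dataset and config.
data_volume = 1_000_000          # total rows to insert
insert_rate = 500                # rows inserted per second
search_stage = [0.5, 0.6, 0.7, 0.8, 0.9]
concurrencies = [1, 15, 50]
serial_search_dur = 120.0        # placeholder for the measured serial-search cost (seconds)

for idx, stage in enumerate(search_stage[:-1]):
    # Seconds of insertion between this stage and the next; the concurrent search
    # for the current stage has to fit inside that window.
    total_dur_between_stages = data_volume * (search_stage[idx + 1] - stage) // insert_rate
    csearch_dur = total_dur_between_stages - serial_search_dur
    each_conc_search_dur = csearch_dur / len(concurrencies)
    if each_conc_search_dur < 30:
        print(f"stage {stage}: only {each_conc_search_dur:.1f}s per concurrency, results may be inaccurate")
```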
vectordb_bench/backend/runner/serial_runner.py

```diff
@@ -1,20 +1,21 @@
-import time
-import logging
-import traceback
 import concurrent
-import
+import logging
 import math
-import
+import multiprocessing as mp
+import time
+import traceback
 
 import numpy as np
 import pandas as pd
+import psutil
 
-from
+from vectordb_bench.backend.dataset import DatasetManager
+
+from ... import config
 from ...metric import calc_ndcg, calc_recall, get_ideal_dcg
 from ...models import LoadTimeoutError, PerformanceTimeoutError
 from .. import utils
-from
-from vectordb_bench.backend.dataset import DatasetManager
+from ..clients import api
 
 NUM_PER_BATCH = config.NUM_PER_BATCH
 LOAD_MAX_TRY_COUNT = 10
@@ -22,9 +23,16 @@ WAITTING_TIME = 60
 
 log = logging.getLogger(__name__)
 
+
 class SerialInsertRunner:
-    def __init__(
-    self
+    def __init__(
+        self,
+        db: api.VectorDB,
+        dataset: DatasetManager,
+        normalize: bool,
+        timeout: float | None = None,
+    ):
+        self.timeout = timeout if isinstance(timeout, int | float) else None
         self.dataset = dataset
         self.db = db
         self.normalize = normalize
@@ -35,15 +43,15 @@ class SerialInsertRunner:
         log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
         start = time.perf_counter()
         for data_df in self.dataset:
-            all_metadata = data_df[
+            all_metadata = data_df["id"].tolist()
 
-            emb_np = np.stack(data_df[
+            emb_np = np.stack(data_df["emb"])
             if self.normalize:
                 log.debug("normalize the 100k train data")
                 all_embeddings = (emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]).tolist()
             else:
                 all_embeddings = emb_np.tolist()
-            del
+            del emb_np
             log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}")
 
             insert_count, error = self.db.insert_embeddings(
@@ -58,28 +66,37 @@ class SerialInsertRunner:
             if count % 100_000 == 0:
                 log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")
 
-        log.info(
+        log.info(
+            f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, "
+            f"dur={time.perf_counter() - start}"
+        )
         return count
 
-    def endless_insert_data(self, all_embeddings, all_metadata, left_id: int = 0) -> int:
+    def endless_insert_data(self, all_embeddings: list, all_metadata: list, left_id: int = 0) -> int:
         with self.db.init():
             # unique id for endlessness insertion
-            all_metadata = [i+left_id for i in all_metadata]
+            all_metadata = [i + left_id for i in all_metadata]
 
-
-            log.info(
+            num_batches = math.ceil(len(all_embeddings) / NUM_PER_BATCH)
+            log.info(
+                f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} "
+                f"embeddings in batch {NUM_PER_BATCH}"
+            )
             count = 0
-            for batch_id in range(
+            for batch_id in range(num_batches):
                 retry_count = 0
                 already_insert_count = 0
-                metadata = all_metadata[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
-                embeddings = all_embeddings[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
+                metadata = all_metadata[batch_id * NUM_PER_BATCH : (batch_id + 1) * NUM_PER_BATCH]
+                embeddings = all_embeddings[batch_id * NUM_PER_BATCH : (batch_id + 1) * NUM_PER_BATCH]
 
-                log.debug(
+                log.debug(
+                    f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
+                    f"Start inserting {len(metadata)} embeddings"
+                )
                 while retry_count < LOAD_MAX_TRY_COUNT:
                     insert_count, error = self.db.insert_embeddings(
-                        embeddings=embeddings[already_insert_count
-                        metadata=metadata[already_insert_count
+                        embeddings=embeddings[already_insert_count:],
+                        metadata=metadata[already_insert_count:],
                     )
                     already_insert_count += insert_count
                     if error is not None:
@@ -91,17 +108,26 @@ class SerialInsertRunner:
                         raise error
                     else:
                         break
-                log.debug(
+                log.debug(
+                    f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
+                    f"Finish inserting {len(metadata)} embeddings"
+                )
 
                 assert already_insert_count == len(metadata)
                 count += already_insert_count
-        log.info(
+        log.info(
+            f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in "
+            f"batch {NUM_PER_BATCH}"
+        )
         return count
 
     @utils.time_it
     def _insert_all_batches(self) -> int:
         """Performance case only"""
-        with concurrent.futures.ProcessPoolExecutor(
+        with concurrent.futures.ProcessPoolExecutor(
+            mp_context=mp.get_context("spawn"),
+            max_workers=1,
+        ) as executor:
             future = executor.submit(self.task)
             try:
                 count = future.result(timeout=self.timeout)
@@ -121,27 +147,36 @@
         """run forever util DB raises exception or crash"""
         # datasets for load tests are quite small, can fit into memory
         # only 1 file
-        data_df =
-        all_embeddings, all_metadata =
+        data_df = next(iter(self.dataset))
+        all_embeddings, all_metadata = (
+            np.stack(data_df["emb"]).tolist(),
+            data_df["id"].tolist(),
+        )
 
         start_time = time.perf_counter()
         max_load_count, times = 0, 0
         try:
-            with self.db.init():
-                self.db.ready_to_load()
             while time.perf_counter() - start_time < self.timeout:
-                count = self.endless_insert_data(
+                count = self.endless_insert_data(
+                    all_embeddings,
+                    all_metadata,
+                    left_id=max_load_count,
+                )
                 max_load_count += count
                 times += 1
-                log.info(
+                log.info(
+                    f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, "
+                    f"{max_load_count}"
+                )
         except Exception as e:
-            log.info(
+            log.info(
+                f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, "
+                f"{max_load_count}, err={e}"
+            )
             traceback.print_exc()
             return max_load_count
         else:
-
-            log.info(msg)
-            raise LoadTimeoutError(msg)
+            raise LoadTimeoutError(self.timeout)
 
     def run(self) -> int:
         count, dur = self._insert_all_batches()
@@ -193,13 +228,15 @@ class SerialSearchRunner:
 
             latencies.append(time.perf_counter() - s)
 
-            gt = ground_truth[
-            recalls.append(calc_recall(self.k, gt[:self.k], results))
-            ndcgs.append(calc_ndcg(gt[:self.k], results, ideal_dcg))
-
+            gt = ground_truth["neighbors_id"][idx]
+            recalls.append(calc_recall(self.k, gt[: self.k], results))
+            ndcgs.append(calc_ndcg(gt[: self.k], results, ideal_dcg))
 
             if len(latencies) % 100 == 0:
-                log.debug(
+                log.debug(
+                    f"({mp.current_process().name:14}) search_count={len(latencies):3}, "
+                    f"latest_latency={latencies[-1]}, latest recall={recalls[-1]}"
+                )
 
         avg_latency = round(np.mean(latencies), 4)
         avg_recall = round(np.mean(recalls), 4)
@@ -214,15 +251,13 @@
             f"avg_ndcg={avg_ndcg},"
             f"avg_latency={avg_latency}, "
             f"p99={p99}"
-
+        )
         return (avg_recall, avg_ndcg, p99)
 
-
     def _run_in_subprocess(self) -> tuple[float, float]:
         with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
             future = executor.submit(self.search, (self.test_data, self.ground_truth))
-
-            return result
+            return future.result()
 
     @utils.time_it
     def run(self) -> tuple[float, float, float]:
```
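Both runners now build their worker pools with an explicit spawn context (`mp_context=mp.get_context("spawn")`) and bound the submitted work with `future.result(timeout=...)`. A self-contained sketch of that pattern, with a hypothetical `task()` standing in for `SerialInsertRunner.task`:

```python
import concurrent.futures
import multiprocessing as mp


def task() -> int:
    """Stand-in for the insert work; must be picklable for a spawn-context pool."""
    return 42


def run_with_timeout(timeout: float | None = 60.0) -> int:
    # Spawn context: each worker starts a fresh interpreter instead of forking the parent.
    with concurrent.futures.ProcessPoolExecutor(
        mp_context=mp.get_context("spawn"),
        max_workers=1,
    ) as executor:
        future = executor.submit(task)
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            # Mirror the cleanup seen in the diff before propagating the failure.
            executor.shutdown(wait=True, cancel_futures=True)
            raise


if __name__ == "__main__":
    print(run_with_timeout())
```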
vectordb_bench/backend/runner/util.py

```diff
@@ -1,13 +1,14 @@
 import logging
 
-from pandas import DataFrame
 import numpy as np
+from pandas import DataFrame
 
 log = logging.getLogger(__name__)
 
+
 def get_data(data_df: DataFrame, normalize: bool) -> tuple[list[list[float]], list[str]]:
-    all_metadata = data_df[
-    emb_np = np.stack(data_df[
+    all_metadata = data_df["id"].tolist()
+    emb_np = np.stack(data_df["emb"])
     if normalize:
         log.debug("normalize the 100k train data")
         all_embeddings = (emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]).tolist()
```
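For reference, the `normalize` branch of `get_data` scales each embedding row to unit L2 norm before converting back to lists; a small sketch with made-up data:

```python
import numpy as np

# Two made-up embeddings; only the np.linalg.norm expression comes from the diff.
emb_np = np.array([[3.0, 4.0], [1.0, 0.0]])
normalized = emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]
print(normalized.tolist())  # [[0.6, 0.8], [1.0, 0.0]] (each row now has length 1)
```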
|