vectordb-bench 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. vectordb_bench/__init__.py +49 -24
  2. vectordb_bench/__main__.py +4 -3
  3. vectordb_bench/backend/assembler.py +12 -13
  4. vectordb_bench/backend/cases.py +55 -45
  5. vectordb_bench/backend/clients/__init__.py +85 -14
  6. vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +1 -2
  7. vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +3 -4
  8. vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +112 -77
  9. vectordb_bench/backend/clients/aliyun_opensearch/config.py +6 -7
  10. vectordb_bench/backend/clients/alloydb/alloydb.py +59 -84
  11. vectordb_bench/backend/clients/alloydb/cli.py +51 -34
  12. vectordb_bench/backend/clients/alloydb/config.py +30 -30
  13. vectordb_bench/backend/clients/api.py +13 -24
  14. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +50 -54
  15. vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
  16. vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
  17. vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
  18. vectordb_bench/backend/clients/chroma/chroma.py +39 -40
  19. vectordb_bench/backend/clients/chroma/config.py +4 -2
  20. vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
  21. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +24 -26
  22. vectordb_bench/backend/clients/memorydb/cli.py +8 -8
  23. vectordb_bench/backend/clients/memorydb/config.py +2 -2
  24. vectordb_bench/backend/clients/memorydb/memorydb.py +67 -58
  25. vectordb_bench/backend/clients/milvus/cli.py +41 -83
  26. vectordb_bench/backend/clients/milvus/config.py +18 -8
  27. vectordb_bench/backend/clients/milvus/milvus.py +19 -39
  28. vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
  29. vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
  30. vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +56 -77
  31. vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
  32. vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
  33. vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +34 -43
  34. vectordb_bench/backend/clients/pgvector/cli.py +40 -31
  35. vectordb_bench/backend/clients/pgvector/config.py +63 -73
  36. vectordb_bench/backend/clients/pgvector/pgvector.py +98 -104
  37. vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
  38. vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
  39. vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +39 -49
  40. vectordb_bench/backend/clients/pinecone/config.py +1 -0
  41. vectordb_bench/backend/clients/pinecone/pinecone.py +15 -25
  42. vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
  43. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +41 -35
  44. vectordb_bench/backend/clients/redis/cli.py +6 -12
  45. vectordb_bench/backend/clients/redis/config.py +7 -5
  46. vectordb_bench/backend/clients/redis/redis.py +95 -62
  47. vectordb_bench/backend/clients/test/cli.py +2 -3
  48. vectordb_bench/backend/clients/test/config.py +2 -2
  49. vectordb_bench/backend/clients/test/test.py +5 -9
  50. vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
  51. vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
  52. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +37 -26
  53. vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
  54. vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
  55. vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
  56. vectordb_bench/backend/data_source.py +18 -14
  57. vectordb_bench/backend/dataset.py +47 -27
  58. vectordb_bench/backend/result_collector.py +2 -3
  59. vectordb_bench/backend/runner/__init__.py +4 -6
  60. vectordb_bench/backend/runner/mp_runner.py +56 -23
  61. vectordb_bench/backend/runner/rate_runner.py +30 -19
  62. vectordb_bench/backend/runner/read_write_runner.py +46 -22
  63. vectordb_bench/backend/runner/serial_runner.py +81 -46
  64. vectordb_bench/backend/runner/util.py +4 -3
  65. vectordb_bench/backend/task_runner.py +92 -92
  66. vectordb_bench/backend/utils.py +17 -10
  67. vectordb_bench/base.py +0 -1
  68. vectordb_bench/cli/cli.py +65 -60
  69. vectordb_bench/cli/vectordbbench.py +6 -7
  70. vectordb_bench/frontend/components/check_results/charts.py +8 -19
  71. vectordb_bench/frontend/components/check_results/data.py +4 -16
  72. vectordb_bench/frontend/components/check_results/filters.py +8 -16
  73. vectordb_bench/frontend/components/check_results/nav.py +4 -4
  74. vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
  75. vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
  76. vectordb_bench/frontend/components/concurrent/charts.py +12 -12
  77. vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
  78. vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
  79. vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
  80. vectordb_bench/frontend/components/custom/initStyle.py +1 -1
  81. vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
  82. vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
  83. vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
  84. vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
  85. vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
  86. vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
  87. vectordb_bench/frontend/components/tables/data.py +3 -6
  88. vectordb_bench/frontend/config/dbCaseConfigs.py +51 -84
  89. vectordb_bench/frontend/pages/concurrent.py +3 -5
  90. vectordb_bench/frontend/pages/custom.py +30 -9
  91. vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
  92. vectordb_bench/frontend/pages/run_test.py +3 -7
  93. vectordb_bench/frontend/utils.py +1 -1
  94. vectordb_bench/frontend/vdb_benchmark.py +4 -6
  95. vectordb_bench/interface.py +45 -24
  96. vectordb_bench/log_util.py +59 -64
  97. vectordb_bench/metric.py +10 -11
  98. vectordb_bench/models.py +26 -43
  99. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/METADATA +22 -15
  100. vectordb_bench-0.0.21.dist-info/RECORD +135 -0
  101. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/WHEEL +1 -1
  102. vectordb_bench-0.0.19.dist-info/RECORD +0 -135
  103. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/LICENSE +0 -0
  104. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/entry_points.txt +0 -0
  105. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/runner/read_write_runner.py

@@ -1,16 +1,18 @@
+ import concurrent
  import logging
- from typing import Iterable
+ import math
  import multiprocessing as mp
- import concurrent
+ from collections.abc import Iterable
+
  import numpy as np
- import math

- from .mp_runner import MultiProcessingSearchRunner
- from .serial_runner import SerialSearchRunner
- from .rate_runner import RatedMultiThreadingInsertRunner
  from vectordb_bench.backend.clients import api
  from vectordb_bench.backend.dataset import DatasetManager

+ from .mp_runner import MultiProcessingSearchRunner
+ from .rate_runner import RatedMultiThreadingInsertRunner
+ from .serial_runner import SerialSearchRunner
+
  log = logging.getLogger(__name__)

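The import hunk above also swaps `from typing import Iterable` for `from collections.abc import Iterable`; the typing aliases for the ABCs have been deprecated since Python 3.9 in favor of collections.abc. A minimal sketch of the same annotation style (illustrative only, not code from the package):

    from collections.abc import Iterable

    def total_duration(durations: Iterable[float]) -> float:
        # Accepts any iterable of floats (list, tuple, generator), not just lists.
        return sum(durations)

    print(total_duration((1.0, 2.5, 3.5)))  # 7.0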
@@ -24,8 +26,14 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
          k: int = 100,
          filters: dict | None = None,
          concurrencies: Iterable[int] = (1, 15, 50),
-         search_stage: Iterable[float] = (0.5, 0.6, 0.7, 0.8, 0.9), # search from insert portion, 0.0 means search from the start
-         read_dur_after_write: int = 300, # seconds, search duration when insertion is done
+         search_stage: Iterable[float] = (
+             0.5,
+             0.6,
+             0.7,
+             0.8,
+             0.9,
+         ), # search from insert portion, 0.0 means search from the start
+         read_dur_after_write: int = 300, # seconds, search duration when insertion is done
          timeout: float | None = None,
      ):
          self.insert_rate = insert_rate
@@ -36,7 +44,10 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
          self.search_stage = sorted(search_stage)
          self.read_dur_after_write = read_dur_after_write

-         log.info(f"Init runner, concurencys={concurrencies}, search_stage={search_stage}, stage_search_dur={read_dur_after_write}")
+         log.info(
+             f"Init runner, concurencys={concurrencies}, search_stage={search_stage}, "
+             f"stage_search_dur={read_dur_after_write}"
+         )

          test_emb = np.stack(dataset.test_data["emb"])
          if normalize:
@@ -69,14 +80,17 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
          """Optimize needs to run in differenct process for pymilvus schema recursion problem"""
          with self.db.init():
              log.info("Search after write - Optimize start")
-             self.db.optimize()
+             self.db.optimize(data_size=self.data_volume)
              log.info("Search after write - Optimize finished")

      def run_search(self):
          log.info("Search after write - Serial search start")
          res, ssearch_dur = self.serial_search_runner.run()
          recall, ndcg, p99_latency = res
-         log.info(f"Search after write - Serial search - recall={recall}, ndcg={ndcg}, p99={p99_latency}, dur={ssearch_dur:.4f}")
+         log.info(
+             f"Search after write - Serial search - recall={recall}, ndcg={ndcg}, p99={p99_latency}, "
+             f"dur={ssearch_dur:.4f}",
+         )
          log.info(f"Search after wirte - Conc search start, dur for each conc={self.read_dur_after_write}")
          max_qps = self.run_by_dur(self.read_dur_after_write)
          log.info(f"Search after wirte - Conc search finished, max_qps={max_qps}")
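The optimize call above now passes the expected row count: `self.db.optimize(data_size=self.data_volume)` instead of `self.db.optimize()`. A hedged sketch of a client-side signature that would be compatible with that call (hypothetical class, for illustration only; the real interface is the VectorDB API imported above from vectordb_bench.backend.clients):

    class ExampleClient:
        """Hypothetical client, only to illustrate the new keyword argument."""

        def optimize(self, data_size: int | None = None) -> None:
            # data_size hints how many vectors were inserted, so post-load work
            # (index building, compaction, merges) can be sized accordingly.
            if data_size is not None:
                print(f"optimizing index for roughly {data_size} vectors")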
@@ -86,7 +100,10 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
      def run_read_write(self):
          with mp.Manager() as m:
              q = m.Queue()
-             with concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context("spawn"), max_workers=2) as executor:
+             with concurrent.futures.ProcessPoolExecutor(
+                 mp_context=mp.get_context("spawn"),
+                 max_workers=2,
+             ) as executor:
                  read_write_futures = []
                  read_write_futures.append(executor.submit(self.run_with_rate, q))
                  read_write_futures.append(executor.submit(self.run_search_by_sig, q))
@@ -107,10 +124,10 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
                  except Exception as e:
                      log.warning(f"Read and write error: {e}")
                      executor.shutdown(wait=True, cancel_futures=True)
-                     raise e
+                     raise e from e
          log.info("Concurrent read write all done")

-     def run_search_by_sig(self, q):
+     def run_search_by_sig(self, q: mp.Queue):
          """
          Args:
              q: multiprocessing queue
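In the hunk above, `raise e` becomes `raise e from e`, which sets the exception's __cause__ explicitly; lint rules such as flake8-bugbear B904 (also enforced by Ruff) expect `raise ... from err` or `raise ... from None` inside an except block. A standalone illustration of the chaining pattern (not taken from the package):

    def wait_all(futures: list) -> None:
        try:
            for f in futures:
                f.result()
        except Exception as e:
            # "from e" records the original exception as __cause__, so the
            # traceback shows both the original failure and this re-raise.
            raise RuntimeError("read/write worker failed") from e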
@@ -122,15 +139,14 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
          total_batch = math.ceil(self.data_volume / self.insert_rate)
          recall, ndcg, p99_latency = None, None, None

-         def wait_next_target(start, target_batch) -> bool:
+         def wait_next_target(start: int, target_batch: int) -> bool:
              """Return False when receive True or None"""
              while start < target_batch:
                  sig = q.get(block=True)

                  if sig is None or sig is True:
                      return False
-                 else:
-                     start += 1
+                 start += 1
              return True

          for idx, stage in enumerate(self.search_stage):
@@ -140,18 +156,21 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn
              got = wait_next_target(start_batch, target_batch)
              if got is False:
                  log.warning(f"Abnormal exit, target_batch={target_batch}, start_batch={start_batch}")
-                 return
+                 return None

              log.info(f"Insert {perc}% done, total batch={total_batch}")
              log.info(f"[{target_batch}/{total_batch}] Serial search - {perc}% start")
              res, ssearch_dur = self.serial_search_runner.run()
              recall, ndcg, p99_latency = res
-             log.info(f"[{target_batch}/{total_batch}] Serial search - {perc}% done, recall={recall}, ndcg={ndcg}, p99={p99_latency}, dur={ssearch_dur:.4f}")
+             log.info(
+                 f"[{target_batch}/{total_batch}] Serial search - {perc}% done, recall={recall}, "
+                 f"ndcg={ndcg}, p99={p99_latency}, dur={ssearch_dur:.4f}"
+             )

              # Search duration for non-last search stage is carefully calculated.
              # If duration for each concurrency is less than 30s, runner will raise error.
              if idx < len(self.search_stage) - 1:
-                 total_dur_between_stages = self.data_volume * (self.search_stage[idx + 1] - stage) // self.insert_rate
+                 total_dur_between_stages = self.data_volume * (self.search_stage[idx + 1] - stage) // self.insert_rate
                  csearch_dur = total_dur_between_stages - ssearch_dur

                  # Try to leave room for init process executors
@@ -159,14 +178,19 @@ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunn

                  each_conc_search_dur = csearch_dur / len(self.concurrencies)
                  if each_conc_search_dur < 30:
-                     warning_msg = f"Results might be inaccurate, duration[{csearch_dur:.4f}] left for conc-search is too short, total available dur={total_dur_between_stages}, serial_search_cost={ssearch_dur}."
+                     warning_msg = (
+                         f"Results might be inaccurate, duration[{csearch_dur:.4f}] left for conc-search is too short, "
+                         f"total available dur={total_dur_between_stages}, serial_search_cost={ssearch_dur}."
+                     )
                      log.warning(warning_msg)

              # The last stage
              else:
                  each_conc_search_dur = 60

-             log.info(f"[{target_batch}/{total_batch}] Concurrent search - {perc}% start, dur={each_conc_search_dur:.4f}")
+             log.info(
+                 f"[{target_batch}/{total_batch}] Concurrent search - {perc}% start, dur={each_conc_search_dur:.4f}"
+             )
              max_qps = self.run_by_dur(each_conc_search_dur)
              result.append((perc, max_qps, recall, ndcg, p99_latency))

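The hunks above only rewrap the stage-duration bookkeeping, but the arithmetic is worth spelling out: between two consecutive search stages the runner has data_volume * (next_stage - stage) // insert_rate seconds, subtracts the serial-search time, and splits the remainder across the configured concurrencies, warning whenever a slice falls below 30 s. A worked example with made-up numbers (not the package defaults; the margin the code reserves for starting process executors is omitted):

    data_volume = 1_000_000        # total rows to insert
    insert_rate = 500              # rows inserted per second
    search_stage = (0.50, 0.75)    # serial search at 50% and 75% of the insert
    concurrencies = (1, 15, 50)
    ssearch_dur = 80.0             # assume the serial search took 80 s

    total_dur_between_stages = data_volume * (search_stage[1] - search_stage[0]) // insert_rate  # 500.0 s
    csearch_dur = total_dur_between_stages - ssearch_dur                                         # 420.0 s
    each_conc_search_dur = csearch_dur / len(concurrencies)                                      # 140.0 s
    assert each_conc_search_dur >= 30  # below 30 s the runner logs the "results might be inaccurate" warning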
vectordb_bench/backend/runner/serial_runner.py

@@ -1,20 +1,21 @@
- import time
- import logging
- import traceback
  import concurrent
- import multiprocessing as mp
+ import logging
  import math
- import psutil
+ import multiprocessing as mp
+ import time
+ import traceback

  import numpy as np
  import pandas as pd
+ import psutil

- from ..clients import api
+ from vectordb_bench.backend.dataset import DatasetManager
+
+ from ... import config
  from ...metric import calc_ndcg, calc_recall, get_ideal_dcg
  from ...models import LoadTimeoutError, PerformanceTimeoutError
  from .. import utils
- from ... import config
- from vectordb_bench.backend.dataset import DatasetManager
+ from ..clients import api

  NUM_PER_BATCH = config.NUM_PER_BATCH
  LOAD_MAX_TRY_COUNT = 10
@@ -22,9 +23,16 @@ WAITTING_TIME = 60

  log = logging.getLogger(__name__)

+
  class SerialInsertRunner:
-     def __init__(self, db: api.VectorDB, dataset: DatasetManager, normalize: bool, timeout: float | None = None):
-         self.timeout = timeout if isinstance(timeout, (int, float)) else None
+     def __init__(
+         self,
+         db: api.VectorDB,
+         dataset: DatasetManager,
+         normalize: bool,
+         timeout: float | None = None,
+     ):
+         self.timeout = timeout if isinstance(timeout, int | float) else None
          self.dataset = dataset
          self.db = db
          self.normalize = normalize
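The rewritten __init__ uses `isinstance(timeout, int | float)`: since Python 3.10, isinstance accepts a PEP 604 union (X | Y) directly, which is equivalent to the older tuple form. A small self-contained illustration (not from the package):

    timeout = 30.0
    assert isinstance(timeout, (int, float))  # classic tuple form
    assert isinstance(timeout, int | float)   # PEP 604 union form, Python 3.10+

    # Mirrors the runner's guard: non-numeric values fall back to None.
    raw = "not-a-number"
    effective_timeout = raw if isinstance(raw, int | float) else None
    assert effective_timeout is None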
@@ -35,15 +43,15 @@ class SerialInsertRunner:
          log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
          start = time.perf_counter()
          for data_df in self.dataset:
-             all_metadata = data_df['id'].tolist()
+             all_metadata = data_df["id"].tolist()

-             emb_np = np.stack(data_df['emb'])
+             emb_np = np.stack(data_df["emb"])
              if self.normalize:
                  log.debug("normalize the 100k train data")
                  all_embeddings = (emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]).tolist()
              else:
                  all_embeddings = emb_np.tolist()
-             del(emb_np)
+             del emb_np
              log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}")

              insert_count, error = self.db.insert_embeddings(
@@ -58,28 +66,37 @@ class SerialInsertRunner:
              if count % 100_000 == 0:
                  log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")

-         log.info(f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, dur={time.perf_counter()-start}")
+         log.info(
+             f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, "
+             f"dur={time.perf_counter() - start}"
+         )
          return count

-     def endless_insert_data(self, all_embeddings, all_metadata, left_id: int = 0) -> int:
+     def endless_insert_data(self, all_embeddings: list, all_metadata: list, left_id: int = 0) -> int:
          with self.db.init():
              # unique id for endlessness insertion
-             all_metadata = [i+left_id for i in all_metadata]
+             all_metadata = [i + left_id for i in all_metadata]

-             NUM_BATCHES = math.ceil(len(all_embeddings)/NUM_PER_BATCH)
-             log.info(f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
+             num_batches = math.ceil(len(all_embeddings) / NUM_PER_BATCH)
+             log.info(
+                 f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} "
+                 f"embeddings in batch {NUM_PER_BATCH}"
+             )
              count = 0
-             for batch_id in range(NUM_BATCHES):
+             for batch_id in range(num_batches):
                  retry_count = 0
                  already_insert_count = 0
-                 metadata = all_metadata[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
-                 embeddings = all_embeddings[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
+                 metadata = all_metadata[batch_id * NUM_PER_BATCH : (batch_id + 1) * NUM_PER_BATCH]
+                 embeddings = all_embeddings[batch_id * NUM_PER_BATCH : (batch_id + 1) * NUM_PER_BATCH]

-                 log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Start inserting {len(metadata)} embeddings")
+                 log.debug(
+                     f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
+                     f"Start inserting {len(metadata)} embeddings"
+                 )
                  while retry_count < LOAD_MAX_TRY_COUNT:
                      insert_count, error = self.db.insert_embeddings(
-                         embeddings=embeddings[already_insert_count :],
-                         metadata=metadata[already_insert_count :],
+                         embeddings=embeddings[already_insert_count:],
+                         metadata=metadata[already_insert_count:],
                      )
                      already_insert_count += insert_count
                      if error is not None:
@@ -91,17 +108,26 @@ class SerialInsertRunner:
                              raise error
                      else:
                          break
-                 log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Finish inserting {len(metadata)} embeddings")
+                 log.debug(
+                     f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
+                     f"Finish inserting {len(metadata)} embeddings"
+                 )

                  assert already_insert_count == len(metadata)
                  count += already_insert_count
-             log.info(f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
+             log.info(
+                 f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in "
+                 f"batch {NUM_PER_BATCH}"
+             )
              return count

      @utils.time_it
      def _insert_all_batches(self) -> int:
          """Performance case only"""
-         with concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context('spawn'), max_workers=1) as executor:
+         with concurrent.futures.ProcessPoolExecutor(
+             mp_context=mp.get_context("spawn"),
+             max_workers=1,
+         ) as executor:
              future = executor.submit(self.task)
              try:
                  count = future.result(timeout=self.timeout)
@@ -121,27 +147,36 @@ class SerialInsertRunner:
          """run forever util DB raises exception or crash"""
          # datasets for load tests are quite small, can fit into memory
          # only 1 file
-         data_df = [data_df for data_df in self.dataset][0]
-         all_embeddings, all_metadata = np.stack(data_df["emb"]).tolist(), data_df['id'].tolist()
+         data_df = next(iter(self.dataset))
+         all_embeddings, all_metadata = (
+             np.stack(data_df["emb"]).tolist(),
+             data_df["id"].tolist(),
+         )

          start_time = time.perf_counter()
          max_load_count, times = 0, 0
          try:
-             with self.db.init():
-                 self.db.ready_to_load()
              while time.perf_counter() - start_time < self.timeout:
-                 count = self.endless_insert_data(all_embeddings, all_metadata, left_id=max_load_count)
+                 count = self.endless_insert_data(
+                     all_embeddings,
+                     all_metadata,
+                     left_id=max_load_count,
+                 )
                  max_load_count += count
                  times += 1
-                 log.info(f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, {max_load_count}")
+                 log.info(
+                     f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, "
+                     f"{max_load_count}"
+                 )
          except Exception as e:
-             log.info(f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, {max_load_count}, err={e}")
+             log.info(
+                 f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, "
+                 f"{max_load_count}, err={e}"
+             )
              traceback.print_exc()
              return max_load_count
          else:
-             msg = f"capacity case load timeout in {self.timeout}s"
-             log.info(msg)
-             raise LoadTimeoutError(msg)
+             raise LoadTimeoutError(self.timeout)

      def run(self) -> int:
          count, dur = self._insert_all_batches()
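`data_df = next(iter(self.dataset))` above replaces the list-comprehension-then-index pattern: it pulls only the first chunk from the dataset iterator instead of materializing every chunk just to keep element zero. The same idiom in isolation (illustrative data, not the real dataset chunks):

    def first(iterable):
        # Consume only the first item; the rest of the iterable is never touched.
        return next(iter(iterable))

    chunks = ({"id": [1, 2]}, {"id": [3, 4]})  # stand-in for per-file dataset chunks
    assert first(chunks) == {"id": [1, 2]}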
@@ -193,13 +228,15 @@ class SerialSearchRunner:

              latencies.append(time.perf_counter() - s)

-             gt = ground_truth['neighbors_id'][idx]
-             recalls.append(calc_recall(self.k, gt[:self.k], results))
-             ndcgs.append(calc_ndcg(gt[:self.k], results, ideal_dcg))
-
+             gt = ground_truth["neighbors_id"][idx]
+             recalls.append(calc_recall(self.k, gt[: self.k], results))
+             ndcgs.append(calc_ndcg(gt[: self.k], results, ideal_dcg))

              if len(latencies) % 100 == 0:
-                 log.debug(f"({mp.current_process().name:14}) search_count={len(latencies):3}, latest_latency={latencies[-1]}, latest recall={recalls[-1]}")
+                 log.debug(
+                     f"({mp.current_process().name:14}) search_count={len(latencies):3}, "
+                     f"latest_latency={latencies[-1]}, latest recall={recalls[-1]}"
+                 )

          avg_latency = round(np.mean(latencies), 4)
          avg_recall = round(np.mean(recalls), 4)
@@ -214,15 +251,13 @@ class SerialSearchRunner:
              f"avg_ndcg={avg_ndcg},"
              f"avg_latency={avg_latency}, "
              f"p99={p99}"
-             )
+         )
          return (avg_recall, avg_ndcg, p99)

-
      def _run_in_subprocess(self) -> tuple[float, float]:
          with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
              future = executor.submit(self.search, (self.test_data, self.ground_truth))
-             result = future.result()
-             return result
+             return future.result()

      @utils.time_it
      def run(self) -> tuple[float, float, float]:
vectordb_bench/backend/runner/util.py

@@ -1,13 +1,14 @@
  import logging

- from pandas import DataFrame
  import numpy as np
+ from pandas import DataFrame

  log = logging.getLogger(__name__)

+
  def get_data(data_df: DataFrame, normalize: bool) -> tuple[list[list[float]], list[str]]:
-     all_metadata = data_df['id'].tolist()
-     emb_np = np.stack(data_df['emb'])
+     all_metadata = data_df["id"].tolist()
+     emb_np = np.stack(data_df["emb"])
      if normalize:
          log.debug("normalize the 100k train data")
          all_embeddings = (emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]).tolist()
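The normalize branch in get_data divides each embedding row by its L2 norm, so every vector ends up with unit length (which makes inner-product and cosine rankings agree). A standalone numpy sketch of the same row-wise normalization (illustrative values):

    import numpy as np

    emb_np = np.array([[3.0, 4.0], [0.0, 2.0]])
    normalized = emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]
    # Each row now has unit length: [[0.6, 0.8], [0.0, 1.0]]
    assert np.allclose(np.linalg.norm(normalized, axis=1), 1.0)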