vectordb-bench 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/backend/clients/__init__.py +4 -4
- vectordb_bench/backend/clients/api.py +1 -0
- vectordb_bench/backend/clients/chroma/chroma.py +2 -14
- vectordb_bench/backend/clients/milvus/config.py +19 -0
- vectordb_bench/backend/clients/pgvecto_rs/config.py +44 -32
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +16 -16
- vectordb_bench/backend/clients/pgvector/config.py +63 -12
- vectordb_bench/backend/clients/pgvector/pgvector.py +105 -77
- vectordb_bench/backend/clients/qdrant_cloud/config.py +19 -6
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +11 -7
- vectordb_bench/backend/clients/zilliz_cloud/config.py +4 -0
- vectordb_bench/backend/data_source.py +13 -64
- vectordb_bench/backend/dataset.py +45 -67
- vectordb_bench/backend/runner/serial_runner.py +1 -1
- vectordb_bench/backend/task_runner.py +2 -2
- vectordb_bench/backend/utils.py +30 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +1 -1
- vectordb_bench/frontend/const/dbCaseConfigs.py +41 -77
- vectordb_bench/models.py +1 -0
- vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json +8 -0
- vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json +9 -3
- vectordb_bench/results/ZillizCloud/{result_20240105_beta_202401_zillizcloud.json → result_20240105_standard_202401_zillizcloud.json} +365 -41
- vectordb_bench/results/getLeaderboardData.py +1 -1
- vectordb_bench/results/leaderboard.json +1 -1
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/METADATA +15 -2
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/RECORD +30 -30
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/WHEEL +1 -1
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/clients/pgvector/pgvector.py

@@ -1,28 +1,14 @@
 """Wrapper around the Pgvector vector database over VectorDB"""

+import io
 import logging
 from contextlib import contextmanager
 from typing import Any
+import pandas as pd
+import psycopg2
+import psycopg2.extras

-from ..api import VectorDB, DBCaseConfig
-from pgvector.sqlalchemy import Vector
-from sqlalchemy import (
-    MetaData,
-    create_engine,
-    insert,
-    select,
-    Index,
-    Table,
-    text,
-    Column,
-    Float,
-    Integer
-)
-from sqlalchemy.orm import (
-    declarative_base,
-    mapped_column,
-    Session
-)
+from ..api import IndexType, VectorDB, DBCaseConfig

 log = logging.getLogger(__name__)

@@ -37,6 +23,7 @@ class PgVector(VectorDB):
         drop_old: bool = False,
         **kwargs,
     ):
+        self.name = "PgVector"
         self.db_config = db_config
         self.case_config = db_case_config
         self.table_name = collection_name

@@ -47,22 +34,26 @@ class PgVector(VectorDB):
         self._vector_field = "embedding"

         # construct basic units
-
-
-
-        pq_metadata.reflect(pg_engine)
+        self.conn = psycopg2.connect(**self.db_config)
+        self.conn.autocommit = False
+        self.cursor = self.conn.cursor()

         # create vector extension
-
-
-        conn.commit()
+        self.cursor.execute('CREATE EXTENSION IF NOT EXISTS vector')
+        self.conn.commit()

-
-        if drop_old and self.table_name in pq_metadata.tables:
+        if drop_old :
             log.info(f"Pgvector client drop table : {self.table_name}")
             # self.pg_table.drop(pg_engine, checkfirst=True)
-
-            self.
+            self._drop_index()
+            self._drop_table()
+            self._create_table(dim)
+            self._create_index()
+
+        self.cursor.close()
+        self.conn.close()
+        self.cursor = None
+        self.conn = None

     @contextmanager
     def init(self) -> None:
@@ -72,53 +63,70 @@ class PgVector(VectorDB):
         >>> self.insert_embeddings()
         >>> self.search_embedding()
         """
-        self.
-
-
-
-
-
-
-
-
-
-
-
+        self.conn = psycopg2.connect(**self.db_config)
+        self.conn.autocommit = False
+        self.cursor = self.conn.cursor()
+
+        try:
+            yield
+        finally:
+            self.cursor.close()
+            self.conn.close()
+            self.cursor = None
+            self.conn = None
+
+    def _drop_table(self):
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
+
+        self.cursor.execute(f'DROP TABLE IF EXISTS public."{self.table_name}"')
+        self.conn.commit()

     def ready_to_load(self):
         pass

     def optimize(self):
         pass
+
+    def _post_insert(self):
+        log.info(f"{self.name} post insert before optimize")
+        self._drop_index()
+        self._create_index()

     def ready_to_search(self):
         pass
-
-    def
-
-
-
-
-
-            extend_existing=True
-        )
+
+    def _drop_index(self):
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
+
+        self.cursor.execute(f'DROP INDEX IF EXISTS "{self._index_name}"')
+        self.conn.commit()

-    def _create_index(self
+    def _create_index(self):
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
+
         index_param = self.case_config.index_param()
-
-
-
-
-
-
-
-
-
+        if self.case_config.index == IndexType.HNSW:
+            log.debug(f'Creating HNSW index. m={index_param["m"]}, ef_construction={index_param["ef_construction"]}')
+            self.cursor.execute(f'CREATE INDEX IF NOT EXISTS {self._index_name} ON public."{self.table_name}" USING hnsw (embedding {index_param["metric"]}) WITH (m={index_param["m"]}, ef_construction={index_param["ef_construction"]});')
+        elif self.case_config.index == IndexType.IVFFlat:
+            log.debug(f'Creating IVFFLAT index. list={index_param["lists"]}')
+            self.cursor.execute(f'CREATE INDEX IF NOT EXISTS {self._index_name} ON public."{self.table_name}" USING ivfflat (embedding {index_param["metric"]}) WITH (lists={index_param["lists"]});')
+        else:
+            assert "Invalid index type {self.case_config.index}"
+        self.conn.commit()
+
+    def _create_table(self, dim : int):
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
+
         try:
             # create table
-            self.
-
-            self.
+            self.cursor.execute(f'CREATE TABLE IF NOT EXISTS public."{self.table_name}" (id BIGINT PRIMARY KEY, embedding vector({dim}));')
+            self.cursor.execute(f'ALTER TABLE public."{self.table_name}" ALTER COLUMN embedding SET STORAGE PLAIN;')
+            self.conn.commit()
         except Exception as e:
             log.warning(f"Failed to create pgvector table: {self.table_name} error: {e}")
             raise e from None
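Taken together, the hunks above replace the old SQLAlchemy setup with DDL issued directly through psycopg2: create the extension, create a plain `(id, vector)` table with in-line storage, and build either an HNSW or IVFFlat index from `index_param()`. Below is a minimal, self-contained sketch of that sequence, assuming a local Postgres with pgvector 0.5+ available; the connection parameters, table name, index name, dimension, operator class, and HNSW parameters are hypothetical examples, not values taken from the package.

```python
# Sketch only: the DDL path of the 0.0.8 PgVector client, reduced to a script.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="vectordb", user="postgres", password="postgres")
conn.autocommit = False
cursor = conn.cursor()

dim = 768
table = "pg_vector_collection"      # stands in for self.table_name
index = "pg_vector_collection_idx"  # stands in for self._index_name

cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
cursor.execute(
    f'CREATE TABLE IF NOT EXISTS public."{table}" (id BIGINT PRIMARY KEY, embedding vector({dim}));'
)
# Keep vectors stored in-line, matching the ALTER COLUMN ... SET STORAGE PLAIN statement above.
cursor.execute(f'ALTER TABLE public."{table}" ALTER COLUMN embedding SET STORAGE PLAIN;')
# HNSW branch of _create_index(); the IVFFlat branch uses "USING ivfflat ... WITH (lists=...)".
cursor.execute(
    f'CREATE INDEX IF NOT EXISTS {index} ON public."{table}" '
    f"USING hnsw (embedding vector_cosine_ops) WITH (m=16, ef_construction=64);"
)
conn.commit()
cursor.close()
conn.close()
```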
@@ -129,10 +137,24 @@ class PgVector(VectorDB):
         metadata: list[int],
         **kwargs: Any,
     ) -> (int, Exception):
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
+
         try:
-            items =
-
-
+            items = {
+                "id": metadata,
+                "embedding": embeddings
+            }
+            df = pd.DataFrame(items)
+            csv_buffer = io.StringIO()
+            df.to_csv(csv_buffer, index=False, header=False)
+            csv_buffer.seek(0)
+            self.cursor.copy_expert(f"COPY public.\"{self.table_name}\" FROM STDIN WITH (FORMAT CSV)", csv_buffer)
+            self.conn.commit()
+
+            if kwargs.get("last_batch"):
+                self._post_insert()
+
             return len(metadata), None
         except Exception as e:
             log.warning(f"Failed to insert data into pgvector table ({self.table_name}), error: {e}")
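The new `insert_embeddings()` stages each batch as an in-memory CSV and streams it through `COPY` instead of issuing per-row `INSERT`s, which is considerably cheaper for bulk loads. A standalone sketch of the same pattern; the connection details, table name, and data below are hypothetical examples.

```python
# Sketch only: COPY-based bulk load as used by insert_embeddings() above.
import io

import pandas as pd
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="vectordb", user="postgres", password="postgres")
cursor = conn.cursor()

ids = [1, 2, 3]
embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]

# pandas writes the list column as its Python repr (e.g. "[0.1, 0.2, 0.3]"),
# which pgvector's COPY input accepts as a vector literal.
df = pd.DataFrame({"id": ids, "embedding": embeddings})
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False, header=False)
csv_buffer.seek(0)

# Stream the whole batch through COPY instead of one INSERT per row.
cursor.copy_expert('COPY public."example_table" FROM STDIN WITH (FORMAT CSV)', csv_buffer)
conn.commit()
cursor.close()
conn.close()
```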
@@ -145,15 +167,21 @@ class PgVector(VectorDB):
         filters: dict | None = None,
         timeout: int | None = None,
     ) -> list[int]:
-        assert self.
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
+
         search_param =self.case_config.search_param()
-
-
-
-
-
-
-
-
-
+
+        if self.case_config.index == IndexType.HNSW:
+            self.cursor.execute(f'SET hnsw.ef_search = {search_param["ef"]}')
+            self.cursor.execute(f"SELECT id FROM public.\"{self.table_name}\" ORDER BY embedding {search_param['metric_fun_op']} '{query}' LIMIT {k};")
+        elif self.case_config.index == IndexType.IVFFlat:
+            self.cursor.execute(f'SET ivfflat.probes = {search_param["probes"]}')
+            self.cursor.execute(f"SELECT id FROM public.\"{self.table_name}\" ORDER BY embedding {search_param['metric_fun_op']} '{query}' LIMIT {k};")
+        else:
+            assert "Invalid index type {self.case_config.index}"
+        self.conn.commit()
+        result = self.cursor.fetchall()
+
+        return [int(i[0]) for i in result]

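The reworked `search_embedding()` sets the index-specific session knob (`hnsw.ef_search` or `ivfflat.probes`) and then runs a plain `ORDER BY <distance op> ... LIMIT k` scan. A minimal sketch of that query path; the connection details, table name, query vector, and `ef_search` value are hypothetical, and the cosine-distance operator `<=>` stands in for whatever `metric_fun_op` resolves to.

```python
# Sketch only: k-NN query with a per-session HNSW search parameter.
import psycopg2

conn = psycopg2.connect(host="localhost", dbname="vectordb", user="postgres", password="postgres")
cursor = conn.cursor()

k = 10
query_vector = [0.1, 0.2, 0.3]

cursor.execute("SET hnsw.ef_search = 128")  # the IVFFlat branch sets ivfflat.probes instead
cursor.execute(
    f'SELECT id FROM public."example_table" '
    f"ORDER BY embedding <=> '{query_vector}' LIMIT {k};"
)
ids = [int(row[0]) for row in cursor.fetchall()]
print(ids)
```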
vectordb_bench/backend/clients/qdrant_cloud/config.py

@@ -1,18 +1,31 @@
 from pydantic import BaseModel, SecretStr

 from ..api import DBConfig, DBCaseConfig, MetricType
+from pydantic import validator

-
+# Allowing `api_key` to be left empty, to ensure compatibility with the open-source Qdrant.
 class QdrantConfig(DBConfig):
     url: SecretStr
     api_key: SecretStr

     def to_dict(self) -> dict:
-
-
-
-
-
+        api_key = self.api_key.get_secret_value()
+        if len(api_key) > 0:
+            return {
+                "url": self.url.get_secret_value(),
+                "api_key": self.api_key.get_secret_value(),
+                "prefer_grpc": True,
+            }
+        else:
+            return {"url": self.url.get_secret_value(),}
+
+    @validator("*")
+    def not_empty_field(cls, v, field):
+        if field.name in ["api_key", "db_label"]:
+            return v
+        if isinstance(v, (str, SecretStr)) and len(v) == 0:
+            raise ValueError("Empty string!")
+        return v

 class QdrantIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None
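The practical effect of the new `to_dict()` is that an empty `api_key` yields a URL-only client config, which is what a self-hosted open-source Qdrant expects, while a non-empty key keeps the Qdrant Cloud settings including `prefer_grpc`. A small usage sketch with made-up values; only the `url` and `api_key` fields shown in this diff are passed, and `db_label` is assumed to default in the shared `DBConfig` base.

```python
# Sketch only: how the two branches of QdrantConfig.to_dict() behave.
from pydantic import SecretStr

from vectordb_bench.backend.clients.qdrant_cloud.config import QdrantConfig

cloud = QdrantConfig(url=SecretStr("https://example.cloud.qdrant.io"), api_key=SecretStr("my-api-key"))
print(cloud.to_dict())
# {'url': 'https://example.cloud.qdrant.io', 'api_key': 'my-api-key', 'prefer_grpc': True}

local = QdrantConfig(url=SecretStr("http://localhost:6333"), api_key=SecretStr(""))
print(local.to_dict())
# {'url': 'http://localhost:6333'}
```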
vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py

@@ -43,8 +43,7 @@ class QdrantCloud(VectorDB):
         if drop_old:
             log.info(f"QdrantCloud client drop_old collection: {self.collection_name}")
             tmp_client.delete_collection(self.collection_name)
-
-            self._create_collection(dim, tmp_client)
+        self._create_collection(dim, tmp_client)
         tmp_client = None

         @contextmanager

@@ -110,13 +109,18 @@ class QdrantCloud(VectorDB):
     ) -> (int, Exception):
         """Insert embeddings into Milvus. should call self.init() first"""
         assert self.qdrant_client is not None
+        QDRANT_BATCH_SIZE = 500
         try:
             # TODO: counts
-
-
-
-
-
+            for offset in range(0, len(embeddings), QDRANT_BATCH_SIZE):
+                vectors = embeddings[offset: offset + QDRANT_BATCH_SIZE]
+                ids = metadata[offset: offset + QDRANT_BATCH_SIZE]
+                payloads=[{self._primary_field: v} for v in ids]
+                _ = self.qdrant_client.upsert(
+                    collection_name=self.collection_name,
+                    wait=True,
+                    points=Batch(ids=ids, payloads=payloads, vectors=vectors),
+                )
         except Exception as e:
             log.info(f"Failed to insert data, {e}")
             return 0, e
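The insert path now chunks the upload into fixed batches of 500 points and sends each chunk as a `Batch` through `upsert(wait=True)`, rather than pushing everything in one call. A standalone sketch of that pattern with the qdrant-client library; the URL, collection name, payload key, batch size, and sample data are hypothetical stand-ins.

```python
# Sketch only: batched upsert into a Qdrant collection.
from qdrant_client import QdrantClient
from qdrant_client.http.models import Batch

QDRANT_BATCH_SIZE = 500


def upsert_in_batches(client: QdrantClient, collection: str,
                      ids: list[int], vectors: list[list[float]]) -> None:
    for offset in range(0, len(vectors), QDRANT_BATCH_SIZE):
        batch_ids = ids[offset: offset + QDRANT_BATCH_SIZE]
        batch_vectors = vectors[offset: offset + QDRANT_BATCH_SIZE]
        payloads = [{"id": i} for i in batch_ids]
        client.upsert(
            collection_name=collection,
            wait=True,  # block until the batch is persisted, as in the diff
            points=Batch(ids=batch_ids, payloads=payloads, vectors=batch_vectors),
        )


client = QdrantClient(url="http://localhost:6333")
upsert_in_batches(client, "example_collection",
                  ids=[1, 2, 3],
                  vectors=[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
```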
vectordb_bench/backend/clients/zilliz_cloud/config.py

@@ -19,6 +19,7 @@ class ZillizCloudConfig(DBConfig):

 class AutoIndexConfig(MilvusIndexConfig, DBCaseConfig):
     index: IndexType = IndexType.AUTOINDEX
+    level: int = 1

     def index_param(self) -> dict:
         return {

@@ -30,6 +31,9 @@ class AutoIndexConfig(MilvusIndexConfig, DBCaseConfig):
     def search_param(self) -> dict:
         return {
             "metric_type": self.parse_metric(),
+            "params": {
+                "level": self.level,
+            }
         }

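The new `level` field defaults to 1 and is now surfaced under `params` in `search_param()`, so Zilliz Cloud searches carry an explicit accuracy level. A small illustrative sketch; the import paths follow this package's layout, and the metric and level values are arbitrary examples.

```python
# Sketch only: the `level` knob now shows up in the search parameters.
from vectordb_bench.backend.clients.api import MetricType
from vectordb_bench.backend.clients.zilliz_cloud.config import AutoIndexConfig

config = AutoIndexConfig(metric_type=MetricType.L2, level=3)
print(config.search_param())
# {'metric_type': <string from parse_metric()>, 'params': {'level': 3}}
```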
vectordb_bench/backend/data_source.py

@@ -3,7 +3,6 @@ import pathlib
 import typing
 from enum import Enum
 from tqdm import tqdm
-from hashlib import md5
 import os
 from abc import ABC, abstractmethod

@@ -32,14 +31,13 @@ class DatasetReader(ABC):
     remote_root: str

     @abstractmethod
-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         """read dataset files from remote_root to local_ds_root,

         Args:
             dataset(str): for instance "sift_small_500k"
             files(list[str]): all filenames of the dataset
             local_ds_root(pathlib.Path): whether to write the remote data.
-            check_etag(bool): whether to check the etag
         """
         pass

@@ -56,7 +54,7 @@ class AliyunOSSReader(DatasetReader):
         import oss2
         self.bucket = oss2.Bucket(oss2.AnonymousAuth(), self.remote_root, "benchmark", True)

-    def validate_file(self, remote: pathlib.Path, local: pathlib.Path
+    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
         info = self.bucket.get_object_meta(remote.as_posix())

         # check size equal

@@ -65,26 +63,21 @@ class AliyunOSSReader(DatasetReader):
             log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
             return False

-        # check etag equal
-        if check_etag:
-            return match_etag(info.etag.strip('"').lower(), local)
-
-
         return True

-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         downloads = []
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
             local_ds_root.mkdir(parents=True)
-            downloads = [(pathlib.
+            downloads = [(pathlib.PurePosixPath("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]

         else:
             for file in files:
-                remote_file = pathlib.
+                remote_file = pathlib.PurePosixPath("benchmark", dataset, file)
                 local_file = local_ds_root.joinpath(file)

-                if (not local_file.exists()) or (not self.validate_file(remote_file, local_file
+                if (not local_file.exists()) or (not self.validate_file(remote_file, local_file)):
                     log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                     downloads.append((remote_file, local_file))

@@ -93,8 +86,8 @@ class AliyunOSSReader(DatasetReader):

         log.info(f"Start to downloading files, total count: {len(downloads)}")
         for remote_file, local_file in tqdm(downloads):
-            log.debug(f"downloading file {remote_file} to {
-            self.bucket.get_object_to_file(remote_file.as_posix(), local_file.
+            log.debug(f"downloading file {remote_file} to {local_file}")
+            self.bucket.get_object_to_file(remote_file.as_posix(), local_file.absolute())

         log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")

@@ -120,19 +113,19 @@ class AwsS3Reader(DatasetReader):
         return names


-    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path
+    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
         downloads = []
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
             local_ds_root.mkdir(parents=True)
-            downloads = [pathlib.
+            downloads = [pathlib.PurePosixPath(self.remote_root, dataset, f) for f in files]

         else:
             for file in files:
-                remote_file = pathlib.
+                remote_file = pathlib.PurePosixPath(self.remote_root, dataset, file)
                 local_file = local_ds_root.joinpath(file)

-                if (not local_file.exists()) or (not self.validate_file(remote_file, local_file
+                if (not local_file.exists()) or (not self.validate_file(remote_file, local_file)):
                     log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                     downloads.append(remote_file)

@@ -147,7 +140,7 @@ class AwsS3Reader(DatasetReader):
         log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")


-    def validate_file(self, remote: pathlib.Path, local: pathlib.Path
+    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
         # info() uses ls() inside, maybe we only need to ls once
         info = self.fs.info(remote)

@@ -157,48 +150,4 @@ class AwsS3Reader(DatasetReader):
             log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
             return False

-        # check etag equal
-        if check_etag:
-            return match_etag(info.get('ETag', "").strip('"'), local)
-
         return True
-
-
-def match_etag(expected_etag: str, local_file) -> bool:
-    """Check if local files' etag match with S3"""
-    def factor_of_1MB(filesize, num_parts):
-        x = filesize / int(num_parts)
-        y = x % 1048576
-        return int(x + 1048576 - y)
-
-    def calc_etag(inputfile, partsize):
-        md5_digests = []
-        with open(inputfile, 'rb') as f:
-            for chunk in iter(lambda: f.read(partsize), b''):
-                md5_digests.append(md5(chunk).digest())
-        return md5(b''.join(md5_digests)).hexdigest() + '-' + str(len(md5_digests))
-
-    def possible_partsizes(filesize, num_parts):
-        return lambda partsize: partsize < filesize and (float(filesize) / float(partsize)) <= num_parts
-
-    filesize = os.path.getsize(local_file)
-    le = ""
-    if '-' not in expected_etag: # no spliting uploading
-        with open(local_file, 'rb') as f:
-            le = md5(f.read()).hexdigest()
-        log.debug(f"calculated local etag {le}, expected etag: {expected_etag}")
-        return expected_etag == le
-    else:
-        num_parts = int(expected_etag.split('-')[-1])
-        partsizes = [ ## Default Partsizes Map
-            8388608, # aws_cli/boto3
-            15728640, # s3cmd
-            factor_of_1MB(filesize, num_parts) # Used by many clients to upload large files
-        ]
-
-        for partsize in filter(possible_partsizes(filesize, num_parts), partsizes):
-            le = calc_etag(local_file, partsize)
-            log.debug(f"calculated local etag {le}, expected etag: {expected_etag}")
-            if expected_etag == le:
-                return True
-        return False