chdb 3.6.0 (cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of chdb might be problematic.
- chdb/__init__.py +134 -0
- chdb/__main__.py +38 -0
- chdb/_chdb.abi3.so +0 -0
- chdb/dataframe/__init__.py +19 -0
- chdb/dataframe/query.py +356 -0
- chdb/dbapi/__init__.py +79 -0
- chdb/dbapi/connections.py +100 -0
- chdb/dbapi/constants/FIELD_TYPE.py +31 -0
- chdb/dbapi/constants/__init__.py +0 -0
- chdb/dbapi/converters.py +293 -0
- chdb/dbapi/cursors.py +351 -0
- chdb/dbapi/err.py +61 -0
- chdb/dbapi/times.py +20 -0
- chdb/libpybind11nonlimitedapi_chdb_3.10.so +0 -0
- chdb/libpybind11nonlimitedapi_chdb_3.11.so +0 -0
- chdb/libpybind11nonlimitedapi_chdb_3.12.so +0 -0
- chdb/libpybind11nonlimitedapi_chdb_3.13.so +0 -0
- chdb/libpybind11nonlimitedapi_chdb_3.8.so +0 -0
- chdb/libpybind11nonlimitedapi_chdb_3.9.so +0 -0
- chdb/libpybind11nonlimitedapi_stubs.so +0 -0
- chdb/rwabc.py +65 -0
- chdb/session/__init__.py +3 -0
- chdb/session/state.py +124 -0
- chdb/state/__init__.py +3 -0
- chdb/state/sqlitelike.py +505 -0
- chdb/udf/__init__.py +3 -0
- chdb/udf/udf.py +106 -0
- chdb/utils/__init__.py +9 -0
- chdb/utils/trace.py +74 -0
- chdb/utils/types.py +234 -0
- chdb-3.6.0.dist-info/LICENSE.txt +203 -0
- chdb-3.6.0.dist-info/METADATA +554 -0
- chdb-3.6.0.dist-info/RECORD +36 -0
- chdb-3.6.0.dist-info/WHEEL +6 -0
- chdb-3.6.0.dist-info/top_level.txt +2 -0
- chdb.libs/libpybind11nonlimitedapi_stubs-18c482a6.so +0 -0
chdb/__init__.py
ADDED
@@ -0,0 +1,134 @@
+import sys
+import os
+import threading
+
+
+class ChdbError(Exception):
+    """Base class for exceptions in this module."""
+
+
+_arrow_format = set({"dataframe", "arrowtable"})
+_process_result_format_funs = {
+    "dataframe": lambda x: to_df(x),
+    "arrowtable": lambda x: to_arrowTable(x),
+}
+
+# If any UDF is defined, the path of the UDF will be set to this variable
+# and the path will be deleted when the process exits
+# UDF config path will be f"{g_udf_path}/udf_config.xml"
+# UDF script path will be f"{g_udf_path}/{func_name}.py"
+g_udf_path = ""
+
+chdb_version = ('3', '6', '0')
+if sys.version_info[:2] >= (3, 7):
+    # get the path of the current file
+    current_path = os.path.dirname(os.path.abspath(__file__))
+    # change the current working directory to the path of the current file
+    # and import _chdb then change the working directory back
+    cwd = os.getcwd()
+    os.chdir(current_path)
+    from . import _chdb  # noqa
+
+    os.chdir(cwd)
+    conn = _chdb.connect()
+    engine_version = str(conn.query("SELECT version();", "CSV").bytes())[3:-4]
+    conn.close()
+else:
+    raise NotImplementedError("Python 3.6 or lower version is not supported")
+
+try:
+    # Change here if project is renamed and does not equal the package name
+    dist_name = __name__
+    __version__ = ".".join(map(str, chdb_version))
+except:  # noqa
+    __version__ = "unknown"
+
+
+# return pyarrow table
+def to_arrowTable(res):
+    """convert res to arrow table"""
+    # try import pyarrow and pandas, if failed, raise ImportError with suggestion
+    try:
+        import pyarrow as pa  # noqa
+        import pandas as pd  # noqa
+    except ImportError as e:
+        print(f"ImportError: {e}")
+        print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
+        raise ImportError("Failed to import pyarrow or pandas") from None
+    if len(res) == 0:
+        return pa.Table.from_batches([], schema=pa.schema([]))
+    return pa.RecordBatchFileReader(res.bytes()).read_all()
+
+
+# return pandas dataframe
+def to_df(r):
+    """convert arrow table to Dataframe"""
+    t = to_arrowTable(r)
+    return t.to_pandas(use_threads=True)
+
+
+# global connection lock, for multi-threading use of legacy chdb.query()
+g_conn_lock = threading.Lock()
+
+
+# wrap _chdb functions
+def query(sql, output_format="CSV", path="", udf_path=""):
+    global g_udf_path
+    if udf_path != "":
+        g_udf_path = udf_path
+    conn_str = ""
+    if path == "":
+        conn_str = ":memory:"
+    else:
+        conn_str = f"{path}"
+    if g_udf_path != "":
+        if "?" in conn_str:
+            conn_str = f"{conn_str}&udf_path={g_udf_path}"
+        else:
+            conn_str = f"{conn_str}?udf_path={g_udf_path}"
+    if output_format == "Debug":
+        output_format = "CSV"
+        if "?" in conn_str:
+            conn_str = f"{conn_str}&verbose&log-level=test"
+        else:
+            conn_str = f"{conn_str}?verbose&log-level=test"
+
+    lower_output_format = output_format.lower()
+    result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
+    if lower_output_format in _arrow_format:
+        output_format = "Arrow"
+
+    with g_conn_lock:
+        conn = _chdb.connect(conn_str)
+        res = conn.query(sql, output_format)
+        if res.has_error():
+            conn.close()
+            raise ChdbError(res.error_message())
+        conn.close()
+    return result_func(res)
+
+
+# alias for query
+sql = query
+
+PyReader = _chdb.PyReader
+
+from . import dbapi, session, udf, utils  # noqa: E402
+from .state import connect  # noqa: E402
+
+__all__ = [
+    "_chdb",
+    "PyReader",
+    "ChdbError",
+    "query",
+    "sql",
+    "chdb_version",
+    "engine_version",
+    "to_df",
+    "to_arrowTable",
+    "dbapi",
+    "session",
+    "udf",
+    "utils",
+    "connect",
+]
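For orientation, a hedged usage sketch of the query()/sql entry points defined above (assumes chdb is installed together with pandas and pyarrow for the "dataframe" output format; the printed values are illustrative only):

import chdb

# Default output format is CSV; the result object exposes bytes() and data().
res = chdb.query("SELECT version()", "CSV")
print(res.bytes())

# "dataframe" routes the result through to_df() and returns a pandas DataFrame.
df = chdb.query("SELECT number FROM system.numbers LIMIT 3", "dataframe")
print(df)

# chdb.sql is an alias for chdb.query.
print(chdb.sql("SELECT 1 + 1", "CSV").data())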
chdb/__main__.py
ADDED
@@ -0,0 +1,38 @@
+import argparse
+from .__init__ import query
+
+
+def main():
+    prog = 'python -m chdb'
+    custom_usage = "%(prog)s [-h] \"SELECT 1\" [format]"
+    description = ('''A simple command line interface for chdb
+                   to run SQL and output in specified format''')
+    parser = argparse.ArgumentParser(prog=prog,
+                                     usage=custom_usage,
+                                     description=description)
+    parser.add_argument('sql', nargs=1,
+                        type=str,
+                        help='sql, e.g: select 1112222222,555')
+    parser.add_argument('format', nargs='?',
+                        type=str,
+                        help='''sql result output format,
+                        e.g: CSV, Dataframe, JSON etc,
+                        more format checkout on
+                        https://clickhouse.com/docs/en/interfaces/formats''',
+                        default="CSV")
+    options = parser.parse_args()
+    sql = options.sql[0]
+    output_format = options.format
+    res = query(sql, output_format)
+    try:
+        if output_format.lower() in ("dataframe", "arrowtable"):
+            temp = res
+        else:
+            temp = res.data()
+        print(temp, end="")
+    except UnicodeDecodeError:
+        print(repr(res.bytes()))
+
+
+if __name__ == '__main__':
+    main()
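As a companion illustration, the CLI above can also be driven from another Python process; a minimal sketch (the output in the comment is only indicative):

import subprocess
import sys

# Equivalent to running: python -m chdb "SELECT 1, 'hello'" CSV
out = subprocess.run(
    [sys.executable, "-m", "chdb", "SELECT 1, 'hello'", "CSV"],
    capture_output=True,
    text=True,
    check=True,
)
print(out.stdout)  # e.g. 1,"hello"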
chdb/_chdb.abi3.so
ADDED
Binary file
chdb/dataframe/__init__.py
ADDED
@@ -0,0 +1,19 @@
+# try import pyarrow and pandas, if failed, raise ImportError with suggestion
+try:
+    import pyarrow as pa  # noqa
+    import pandas as pd  # noqa
+except ImportError as e:
+    print(f'ImportError: {e}')
+    print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
+    raise ImportError('Failed to import pyarrow or pandas') from None
+
+# check if pandas version >= 2.0.0
+if pd.__version__[0] < '2':
+    print('Please upgrade pandas to version 2.0.0 or higher to have better performance')
+
+from .query import Table, pandas_read_parquet  # noqa: C0413
+
+query = Table.queryStatic
+sql = Table.queryStatic
+
+__all__ = ["Table", "query", "sql", "pandas_read_parquet"]
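Because query and sql are bound to Table.queryStatic (defined in query.py below), pandas DataFrames can be queried by keyword name; a small hedged sketch, assuming pandas and pyarrow are available:

import pandas as pd
from chdb.dataframe import query as df_query

# A placeholder such as __tbl__ is substituted with the keyword argument of the same name.
tbl = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
res = df_query("SELECT b, sum(a) AS s FROM __tbl__ GROUP BY b ORDER BY b", tbl=tbl)
print(res.to_pandas())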
chdb/dataframe/query.py
ADDED
@@ -0,0 +1,356 @@
+import os
+import tempfile
+from io import BytesIO
+import re
+import pandas as pd
+import pyarrow as pa
+from chdb import query as chdb_query
+
+
+class Table:
+    """
+    Table is a wrapper of multiple formats of data buffer, including parquet file path,
+    parquet bytes, and pandas dataframe.
+    if use_memfd is True, will try using memfd_create to create a temp file in memory, which is
+    only available on Linux. If failed, will fallback to use tempfile.mkstemp to create a temp file
+    """
+
+    def __init__(
+        self,
+        parquet_path: str = None,
+        temp_parquet_path: str = None,
+        parquet_memoryview: memoryview = None,
+        dataframe: pd.DataFrame = None,
+        arrow_table: pa.Table = None,
+        use_memfd: bool = False,
+    ):
+        """
+        Initialize a Table object with one of parquet file path, parquet bytes, pandas dataframe or
+        parquet table.
+        """
+        self._parquet_path = parquet_path
+        self._temp_parquet_path = temp_parquet_path
+        self._parquet_memoryview = parquet_memoryview
+        self._dataframe = dataframe
+        self._arrow_table = arrow_table
+        self.use_memfd = use_memfd
+        self._rows_read = 0
+        self._bytes_read = 0
+        self._elapsed = 0
+
+    def __del__(self):
+        if self._temp_parquet_path is not None:
+            try:
+                os.remove(self._temp_parquet_path)
+            except OSError:
+                pass
+
+    def rows_read(self):
+        return self._rows_read
+
+    def bytes_read(self):
+        return self._bytes_read
+
+    def elapsed(self):
+        return self._elapsed
+
+    def to_pandas(self) -> pd.DataFrame:
+        if self._dataframe is None:
+            if self._arrow_table is not None:
+                return self._arrow_table.to_pandas()
+            elif self._parquet_memoryview is not None:
+                # wrap bytes to ReadBuffer
+                pq_reader = BytesIO(self._parquet_memoryview.tobytes())
+                return pandas_read_parquet(pq_reader)
+            elif self._parquet_path is not None:
+                return pandas_read_parquet(self._parquet_path)
+            elif self._temp_parquet_path is not None:
+                return pandas_read_parquet(self._temp_parquet_path)
+            else:
+                raise ValueError("No data buffer in Table object")
+        return self._dataframe
+
+    def flush_to_disk(self):
+        """
+        Flush the data in memory to disk.
+        """
+        if self._parquet_path is not None or self._temp_parquet_path is not None:
+            return
+
+        if self._dataframe is not None:
+            self._df_to_disk(self._dataframe)
+            self._dataframe = None
+        elif self._arrow_table is not None:
+            self._arrow_table_to_disk(self._arrow_table)
+            self._arrow_table = None
+        elif self._parquet_memoryview is not None:
+            self._memoryview_to_disk(self._parquet_memoryview)
+            self._parquet_memoryview = None
+        else:
+            raise ValueError("No data in Table object")
+
+    def _df_to_disk(self, df):
+        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+            df.to_parquet(tmp)
+            self._temp_parquet_path = tmp.name
+
+    def _arrow_table_to_disk(self, arrow_table):
+        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+            pa.parquet.write_table(arrow_table, tmp.name)
+            self._temp_parquet_path = tmp.name
+
+    def _memoryview_to_disk(self, memoryview):
+        # copy memoryview to temp file
+        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+            tmp.write(memoryview.tobytes())
+            self._temp_parquet_path = tmp.name
+
+    def __repr__(self):
+        return repr(self.to_pandas())
+
+    def __str__(self):
+        return str(self.to_pandas())
+
+    def query(self, sql: str, **kwargs) -> "Table":
+        """
+        Query on current Table object, return a new Table object.
+        The `FROM` table name in SQL should always be `__table__`. eg:
+            `SELECT * FROM __table__ WHERE ...`
+        """
+        self._validate_sql(sql)
+
+        if (
+            self._parquet_path is not None
+        ):  # if we have parquet file path, run chdb query on it directly is faster
+            return self._query_on_path(self._parquet_path, sql, **kwargs)
+        elif self._temp_parquet_path is not None:
+            return self._query_on_path(self._temp_parquet_path, sql, **kwargs)
+        elif self._parquet_memoryview is not None:
+            return self.queryParquetBuffer(sql, **kwargs)
+        elif self._dataframe is not None:
+            return self.queryDF(sql, **kwargs)
+        elif self._arrow_table is not None:
+            return self.queryArrowTable(sql, **kwargs)
+        else:
+            raise ValueError("Table object is not initialized correctly")
+
+    # alias sql = query
+    sql = query
+
+    def show(self):
+        print(self.to_pandas())
+
+    def _query_on_path(self, path, sql, **kwargs):
+        new_sql = sql.replace("__table__", f'file("{path}", Parquet)')
+        res = chdb_query(new_sql, "Parquet", **kwargs)
+        tbl = Table(parquet_memoryview=res.get_memview())
+        tbl._rows_read = res.rows_read()
+        tbl._bytes_read = res.bytes_read()
+        tbl._elapsed = res.elapsed()
+        return tbl
+
+    def _validate_sql(self, sql):
+        if "__table__" not in sql:
+            raise ValueError("SQL should always contain `FROM __table__`")
+
+    def queryParquetBuffer(self, sql: str, **kwargs) -> "Table":
+        if "__table__" not in sql:
+            raise ValueError("SQL should always contain `FROM __table__`")
+        if self._parquet_memoryview is None:
+            raise ValueError("Parquet buffer is None")
+
+        temp_path = None
+        parquet_fd = -1
+        if self.use_memfd:
+            parquet_fd = memfd_create("parquet_buffer")
+        # if memfd_create failed, use tempfile to create a file descriptor for the memoryview
+        if parquet_fd == -1:
+            parquet_fd, temp_path = tempfile.mkstemp()
+        ffd = os.fdopen(parquet_fd, "wb")
+        ffd.write(self._parquet_memoryview.tobytes())
+        ffd.flush()
+        ret = self._run_on_temp(parquet_fd, temp_path, sql=sql, fmt="Parquet", **kwargs)
+        ffd.close()
+        if temp_path is not None:
+            os.remove(temp_path)
+        return ret
+
+    def queryArrowTable(self, sql: str, **kwargs) -> "Table":
+        if "__table__" not in sql:
+            raise ValueError("SQL should always contain `FROM __table__`")
+        if self._arrow_table is None:
+            raise ValueError("Arrow table is None")
+
+        temp_path = None
+        arrow_fd = -1
+        if self.use_memfd:
+            arrow_fd = memfd_create("arrow_buffer")
+        if arrow_fd == -1:
+            arrow_fd, temp_path = tempfile.mkstemp()
+        ffd = os.fdopen(arrow_fd, "wb")
+        with pa.RecordBatchFileWriter(ffd, self._arrow_table.schema) as writer:
+            writer.write_table(self._arrow_table)
+        ffd.flush()
+        ret = self._run_on_temp(arrow_fd, temp_path, sql=sql, fmt="Arrow", **kwargs)
+        ffd.close()
+        if temp_path is not None:
+            os.remove(temp_path)
+        return ret
+
+    def queryDF(self, sql: str, **kwargs) -> "Table":
+        if "__table__" not in sql:
+            raise ValueError("SQL should always contain `FROM __table__`")
+        if self._dataframe is None:
+            raise ValueError("Dataframe is None")
+
+        temp_path = None
+        parquet_fd = -1
+        if self.use_memfd:
+            parquet_fd = memfd_create()
+        if parquet_fd == -1:
+            parquet_fd, temp_path = tempfile.mkstemp()
+        ffd = os.fdopen(parquet_fd, "wb")
+        self._dataframe.to_parquet(ffd, engine="pyarrow", compression=None)
+        ffd.flush()
+        ret = self._run_on_temp(parquet_fd, temp_path, sql=sql, fmt="Parquet", **kwargs)
+        ffd.close()
+        if temp_path is not None:
+            os.remove(temp_path)
+        return ret
+
+    @staticmethod
+    def queryStatic(sql: str, **kwargs) -> "Table":
+        """
+        Query on multiple Tables, use Table variables as the table name in SQL
+        eg.
+            table1 = Table(...)
+            table2 = Table(...)
+            query("SELECT * FROM __table1__ JOIN __table2__ ON ...", table1=table1, table2=table2)
+        """
+        ansiTablePattern = re.compile(r"__([a-zA-Z][a-zA-Z0-9_]*)__")
+        temp_paths = []
+        ffds = []
+
+        def replace_table_name(match):
+            tableName = match.group(1)
+            if tableName not in kwargs:
+                raise ValueError(f"Table {tableName} should be passed as a parameter")
+
+            tbl = kwargs[tableName]
+            # if tbl is DataFrame, convert it to Table
+            if isinstance(tbl, pd.DataFrame):
+                tbl = Table(dataframe=tbl)
+            elif not isinstance(tbl, Table):
+                raise ValueError(
+                    f"Table {tableName} should be an instance of Table or DataFrame")
+
+            if tbl._parquet_path is not None:
+                return f'file("{tbl._parquet_path}", Parquet)'
+
+            if tbl._temp_parquet_path is not None:
+                return f'file("{tbl._temp_parquet_path}", Parquet)'
+
+            temp_path = None
+            data_fd = -1
+
+            if tbl.use_memfd:
+                data_fd = memfd_create()
+
+            if data_fd == -1:
+                data_fd, temp_path = tempfile.mkstemp()
+                temp_paths.append(temp_path)
+
+            ffd = os.fdopen(data_fd, "wb")
+            ffds.append(ffd)
+
+            if tbl._parquet_memoryview is not None:
+                ffd.write(tbl._parquet_memoryview.tobytes())
+                ffd.flush()
+                os.lseek(data_fd, 0, os.SEEK_SET)
+                return f'file("/dev/fd/{data_fd}", Parquet)'
+
+            if tbl._dataframe is not None:
+                ffd.write(tbl._dataframe.to_parquet(engine="pyarrow", compression=None))
+                ffd.flush()
+                os.lseek(data_fd, 0, os.SEEK_SET)
+                return f'file("/dev/fd/{data_fd}", Parquet)'
+
+            if tbl._arrow_table is not None:
+                with pa.RecordBatchFileWriter(ffd, tbl._arrow_table.schema) as writer:
+                    writer.write_table(tbl._arrow_table)
+                ffd.flush()
+                os.lseek(data_fd, 0, os.SEEK_SET)
+                return f'file("/dev/fd/{data_fd}", Arrow)'
+
+            raise ValueError(f"Table {tableName} is not initialized correctly")
+
+        sql = ansiTablePattern.sub(replace_table_name, sql)
+        res = chdb_query(sql, "Parquet")
+
+        for fd in ffds:
+            fd.close()
+
+        for tmp_path in temp_paths:
+            os.remove(tmp_path)
+
+        tbl = Table(parquet_memoryview=res.get_memview())
+        tbl._rows_read = res.rows_read()
+        tbl._bytes_read = res.bytes_read()
+        tbl._elapsed = res.elapsed()
+        return tbl
+
+    def _run_on_temp(
+        self,
+        fd: int,
+        temp_path: str = None,
+        sql: str = None,
+        fmt: str = "Parquet",
+        **kwargs,
+    ) -> "Table":
+        # replace "__table__" with file("temp_path", Parquet) or file("/dev/fd/{parquet_fd}", Parquet)
+        if temp_path is not None:
+            new_sql = sql.replace("__table__", f'file("{temp_path}", {fmt})')
+        else:
+            os.lseek(fd, 0, os.SEEK_SET)
+            new_sql = sql.replace("__table__", f'file("/dev/fd/{fd}", {fmt})')
+        res = chdb_query(new_sql, "Parquet", **kwargs)
+        tbl = Table(parquet_memoryview=res.get_memview())
+        tbl._rows_read = res.rows_read()
+        tbl._bytes_read = res.bytes_read()
+        tbl._elapsed = res.elapsed()
+        return tbl
+
+
+def pandas_read_parquet(path) -> pd.DataFrame:
+    return pd.read_parquet(path)
+
+
+def memfd_create(name: str = None) -> int:
+    """
+    Try to use memfd_create(2) to create a file descriptor with memory.
+    Only available on Linux 3.17 or newer with glibc 2.27 or newer.
+    """
+    if hasattr(os, "memfd_create"):
+        try:
+            fd = os.memfd_create(name, flags=os.MFD_CLOEXEC)
+            return fd
+        except:  # noqa
+            return -1
+    return -1


+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run SQL on parquet file")
+    parser.add_argument("parquet_path", type=str, help="path to parquet file")
+    parser.add_argument("sql", type=str, help="SQL to run")
+    parser.add_argument(
+        "--use-memfd",
+        action="store_true",
+        help="use memfd_create to create file descriptor",
+    )
+    args = parser.parse_args()
+
+    table = Table(parquet_path=args.parquet_path, use_memfd=args.use_memfd)
+    print(table.query(args.sql))
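To illustrate the instance-level path, a hedged sketch of chaining Table.query() calls; "data.parquet" is a hypothetical local file, and each call returns a new Table backed by a Parquet buffer:

from chdb.dataframe import Table

t = Table(parquet_path="data.parquet")            # hypothetical input file
top = t.query("SELECT * FROM __table__ ORDER BY 1 LIMIT 100")
agg = top.query("SELECT count(*) AS n FROM __table__")
print(agg)                                        # __repr__ shows the pandas view
print(agg.rows_read(), agg.bytes_read(), agg.elapsed())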
chdb/dbapi/__init__.py
ADDED
@@ -0,0 +1,79 @@
+from .constants import FIELD_TYPE
+from . import connections as _orig_conn
+from .. import chdb_version
+
+if len(chdb_version) > 3 and chdb_version[3] is not None:
+    VERSION_STRING = "%s.%s.%s_%s" % chdb_version
+else:
+    VERSION_STRING = "%s.%s.%s" % chdb_version[:3]
+
+threadsafety = 1
+apilevel = "2.0"
+paramstyle = "format"
+
+
+class DBAPISet(frozenset):
+
+    def __ne__(self, other):
+        if isinstance(other, set):
+            return frozenset.__ne__(self, other)
+        else:
+            return other not in self
+
+    def __eq__(self, other):
+        if isinstance(other, frozenset):
+            return frozenset.__eq__(self, other)
+        else:
+            return other in self
+
+    def __hash__(self):
+        return frozenset.__hash__(self)
+
+
+# TODO it's in pep249 find out meaning and usage of this
+# https://www.python.org/dev/peps/pep-0249/#string
+STRING = DBAPISet([FIELD_TYPE.ENUM, FIELD_TYPE.STRING,
+                   FIELD_TYPE.VAR_STRING])
+BINARY = DBAPISet([FIELD_TYPE.BLOB, FIELD_TYPE.LONG_BLOB,
+                   FIELD_TYPE.MEDIUM_BLOB, FIELD_TYPE.TINY_BLOB])
+NUMBER = DBAPISet([FIELD_TYPE.DECIMAL, FIELD_TYPE.DOUBLE, FIELD_TYPE.FLOAT,
+                   FIELD_TYPE.INT24, FIELD_TYPE.LONG, FIELD_TYPE.LONGLONG,
+                   FIELD_TYPE.TINY, FIELD_TYPE.YEAR])
+DATE = DBAPISet([FIELD_TYPE.DATE, FIELD_TYPE.NEWDATE])
+TIME = DBAPISet([FIELD_TYPE.TIME])
+TIMESTAMP = DBAPISet([FIELD_TYPE.TIMESTAMP, FIELD_TYPE.DATETIME])
+DATETIME = TIMESTAMP
+ROWID = DBAPISet()
+
+
+def Binary(x):
+    """Return x as a binary type."""
+    return bytes(x)
+
+
+def Connect(*args, **kwargs):
+    """
+    Connect to the database; see connections.Connection.__init__() for
+    more information.
+    """
+    from .connections import Connection
+    return Connection(*args, **kwargs)
+
+
+if _orig_conn.Connection.__init__.__doc__ is not None:
+    Connect.__doc__ = _orig_conn.Connection.__init__.__doc__
+del _orig_conn
+
+
+def get_client_info():  # for MySQLdb compatibility
+    version = chdb_version
+    if len(chdb_version) > 3 and chdb_version[3] is None:
+        version = chdb_version[:3]
+    return '.'.join(map(str, version))
+
+
+connect = Connection = Connect
+
+NULL = "NULL"
+
+__version__ = get_client_info()
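The module advertises apilevel "2.0", so the expected usage is the standard DB-API pattern; a hedged sketch, assuming the Connection and Cursor classes in connections.py and cursors.py follow the usual cursor()/execute()/fetchall() interface (those files are not shown in this excerpt):

from chdb import dbapi

print("client info:", dbapi.get_client_info())

conn = dbapi.connect()          # connect = Connection = Connect, as defined above
cur = conn.cursor()             # assumed PEP 249-style cursor
cur.execute("SELECT 1 AS one, 'abc' AS s")
print(cur.fetchall())
cur.close()
conn.close()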