chdb 3.4.1 (cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chdb might be problematic.
- chdb/__init__.py +134 -0
- chdb/__main__.py +35 -0
- chdb/_chdb.cpython-310-x86_64-linux-gnu.so +0 -0
- chdb/dataframe/__init__.py +19 -0
- chdb/dataframe/query.py +356 -0
- chdb/dbapi/__init__.py +79 -0
- chdb/dbapi/connections.py +100 -0
- chdb/dbapi/constants/FIELD_TYPE.py +31 -0
- chdb/dbapi/constants/__init__.py +0 -0
- chdb/dbapi/converters.py +293 -0
- chdb/dbapi/cursors.py +247 -0
- chdb/dbapi/err.py +61 -0
- chdb/dbapi/times.py +20 -0
- chdb/rwabc.py +65 -0
- chdb/session/__init__.py +3 -0
- chdb/session/state.py +123 -0
- chdb/state/__init__.py +3 -0
- chdb/state/sqlitelike.py +336 -0
- chdb/udf/__init__.py +3 -0
- chdb/udf/udf.py +106 -0
- chdb/utils/__init__.py +9 -0
- chdb/utils/trace.py +74 -0
- chdb/utils/types.py +234 -0
- chdb-3.4.1.dist-info/METADATA +532 -0
- chdb-3.4.1.dist-info/RECORD +28 -0
- chdb-3.4.1.dist-info/WHEEL +6 -0
- chdb-3.4.1.dist-info/licenses/LICENSE.txt +203 -0
- chdb-3.4.1.dist-info/top_level.txt +2 -0
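
The top-level `chdb/__init__.py` above exposes the stateless query API. A minimal smoke test of this wheel (assuming it was installed with `pip install chdb==3.4.1`):

import chdb

# Stateless, one-shot query; the second argument selects the output format.
print(chdb.query("SELECT version()", "CSV"))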
chdb/dbapi/err.py
ADDED
@@ -0,0 +1,61 @@
class StandardError(Exception):
    """Exception related to operation with chdb."""


class Warning(StandardError):
    """Exception raised for important warnings like data truncations
    while inserting, etc."""


class Error(StandardError):
    """Exception that is the base class of all other error exceptions
    (not Warning)."""


class InterfaceError(Error):
    """Exception raised for errors that are related to the database
    interface rather than the database itself."""


class DatabaseError(Error):
    """Exception raised for errors that are related to the
    database."""


class DataError(DatabaseError):
    """Exception raised for errors that are due to problems with the
    processed data like division by zero, numeric value out of range,
    etc."""


class OperationalError(DatabaseError):
    """Exception raised for errors that are related to the database's
    operation and not necessarily under the control of the programmer,
    e.g. an unexpected disconnect occurs, the data source name is not
    found, a transaction could not be processed, a memory allocation
    error occurred during processing, etc."""


class IntegrityError(DatabaseError):
    """Exception raised when the relational integrity of the database
    is affected, e.g. a foreign key check fails, duplicate key,
    etc."""


class InternalError(DatabaseError):
    """Exception raised when the database encounters an internal
    error, e.g. the cursor is not valid anymore, the transaction is
    out of sync, etc."""


class ProgrammingError(DatabaseError):
    """Exception raised for programming errors, e.g. table not found
    or already exists, syntax error in the SQL statement, wrong number
    of parameters specified, etc."""


class NotSupportedError(DatabaseError):
    """Exception raised in case a method or database API was used
    which is not supported by the database, e.g. requesting a
    .rollback() on a connection that does not support transactions or
    has transactions turned off."""
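
These classes mirror the PEP 249 (DB-API 2.0) exception hierarchy, so callers can catch broadly via `Error` or narrowly via a subclass. A minimal sketch, assuming `chdb.dbapi` re-exports `connect()` and the exception classes as DB-API modules conventionally do:

from chdb import dbapi

conn = dbapi.connect()
cur = conn.cursor()
try:
    cur.execute("SELECT 1 AS x")
    print(cur.fetchone())
except dbapi.ProgrammingError as e:
    # e.g. a syntax error in the SQL statement
    print("bad query:", e)
except dbapi.DatabaseError as e:
    # any other database-side failure (OperationalError, DataError, ...)
    print("database error:", e)
finally:
    cur.close()
    conn.close()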
chdb/dbapi/times.py
ADDED
@@ -0,0 +1,20 @@
from time import localtime
from datetime import date, datetime, time, timedelta


Date = date
Time = time
TimeDelta = timedelta
Timestamp = datetime


def DateFromTicks(ticks):
    return date(*localtime(ticks)[:3])


def TimeFromTicks(ticks):
    return time(*localtime(ticks)[3:6])


def TimestampFromTicks(ticks):
    return datetime(*localtime(ticks)[:6])
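
These are the DB-API 2.0 type objects and constructors; the `*FromTicks` helpers interpret a Unix timestamp in the local timezone via `localtime`. For example:

from chdb.dbapi.times import DateFromTicks, TimeFromTicks, TimestampFromTicks

# 86400 ticks is one day after the epoch, rendered in local time
print(DateFromTicks(86400))       # e.g. 1970-01-02
print(TimeFromTicks(86400))       # e.g. 00:00:00 (shifted by the local UTC offset)
print(TimestampFromTicks(86400))  # e.g. 1970-01-02 00:00:00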
chdb/rwabc.py
ADDED
@@ -0,0 +1,65 @@
from abc import ABC, abstractmethod
from typing import List, Any


class PyReader(ABC):
    def __init__(self, data: Any):
        """
        Initialize the reader with data. The exact type and structure of `data` can vary.

        Args:
            data (Any): The data with which to initialize the reader; format and type are not strictly defined.
        """
        self.data = data

    @abstractmethod
    def read(self, col_names: List[str], count: int) -> List[Any]:
        """
        Read a specified number of rows from the given columns and return a list of objects,
        where each object is a sequence of values for a column.

        Args:
            col_names (List[str]): List of column names to read.
            count (int): Maximum number of rows to read.

        Returns:
            List[Any]: List of sequences, one for each column.
        """
        pass


class PyWriter(ABC):
    def __init__(self, col_names: List[str], types: List[type], data: Any):
        """
        Initialize the writer with column names, their types, and initial data.

        Args:
            col_names (List[str]): List of column names.
            types (List[type]): List of types corresponding to each column.
            data (Any): Initial data to set up the writer; format and type are not strictly defined.
        """
        self.col_names = col_names
        self.types = types
        self.data = data
        self.blocks = []

    @abstractmethod
    def write(self, col_names: List[str], columns: List[List[Any]]) -> None:
        """
        Save columns of data to blocks. Must be implemented by subclasses.

        Args:
            col_names (List[str]): List of column names that are being written.
            columns (List[List[Any]]): List of columns data; each column is represented by a list.
        """
        pass

    @abstractmethod
    def finalize(self) -> bytes:
        """
        Assemble and return the final data from blocks. Must be implemented by subclasses.

        Returns:
            bytes: The final serialized data.
        """
        pass
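
`PyReader` and `PyWriter` define the contract for exchanging columnar data with the engine: `read` returns one value sequence per requested column, while `write` accumulates blocks and `finalize` serializes them. A minimal `PyReader` sketch over an in-memory dict of columns (the `DictReader` name and its position bookkeeping are illustrative, not part of the package):

from typing import Any, List

from chdb.rwabc import PyReader


class DictReader(PyReader):
    """Serves rows from a dict mapping column name -> list of values."""

    def __init__(self, data: dict):
        super().__init__(data)
        self._pos = 0  # number of rows already handed out

    def read(self, col_names: List[str], count: int) -> List[Any]:
        # Return up to `count` rows as one value list per requested column.
        start, end = self._pos, self._pos + count
        block = [self.data[name][start:end] for name in col_names]
        self._pos = end
        return block


reader = DictReader({"id": [1, 2, 3], "name": ["a", "b", "c"]})
print(reader.read(["id", "name"], 2))  # [[1, 2], ['a', 'b']]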
chdb/session/__init__.py
ADDED
chdb/session/state.py
ADDED
@@ -0,0 +1,123 @@
import warnings

import chdb
from ..state import sqlitelike as chdb_stateful
from ..state.sqlitelike import StreamingResult

g_session = None
g_session_path = None


class Session:
    """
    Session keeps the state of queries.
    If path is None, it will create a temporary directory and use it as the database path,
    and the temporary directory will be removed when the session is closed.
    You can also pass in a path to create a database at that path, which will keep your data.

    You can also use a connection string to pass in the path and other parameters.
    Examples:
        - ":memory:" (for in-memory database)
        - "test.db" (for relative path)
        - "file:test.db" (same as above)
        - "/path/to/test.db" (for absolute path)
        - "file:/path/to/test.db" (same as above)
        - "file:test.db?param1=value1&param2=value2" (for relative path with query params)
        - "file::memory:?verbose&log-level=test" (for in-memory database with query params)
        - "///path/to/test.db?param1=value1&param2=value2" (for absolute path)

    Connection string args handling:
        Connection string can contain query params like "file:test.db?param1=value1&param2=value2"
        "param1=value1" will be passed to the ClickHouse engine as start-up args.

        For more details, see `clickhouse local --help --verbose`
        Some special args handling:
        - "mode=ro" would be "--readonly=1" for clickhouse (read-only mode)

    Important:
        - There can be only one session at a time. If you want to create a new session, you need to close the existing one.
        - Creating a new session will close the existing one.
    """

    def __init__(self, path=None):
        global g_session, g_session_path
        if g_session is not None:
            warnings.warn(
                "There is already an active session. Creating a new session will close the existing one. "
                "It is recommended to close the existing session before creating a new one. "
                f"Closing the existing session {g_session_path}"
            )
            g_session.close()
            g_session_path = None
        if path is None:
            self._path = ":memory:"
        else:
            self._path = path
        if chdb.g_udf_path != "":
            self._udf_path = chdb.g_udf_path
            # add udf_path to conn_str here.
            #   - the `user_scripts_path` will be the value of `udf_path`
            #   - the `user_defined_executable_functions_config` will be `user_scripts_path/*.xml`
            # Both of them will be added to the conn_str in the Connection class
            if "?" in self._path:
                self._conn_str = f"{self._path}&udf_path={self._udf_path}"
            else:
                self._conn_str = f"{self._path}?udf_path={self._udf_path}"
        else:
            self._udf_path = ""
            self._conn_str = f"{self._path}"
        self._conn = chdb_stateful.Connection(self._conn_str)
        g_session = self
        g_session_path = self._path

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        if self._conn is not None:
            self._conn.close()
            self._conn = None
        global g_session, g_session_path
        g_session = None
        g_session_path = None

    def cleanup(self):
        try:
            self.close()
        except:  # noqa
            pass

    def query(self, sql, fmt="CSV", udf_path=""):
        """
        Execute a query.
        """
        if fmt == "Debug":
            warnings.warn(
                """Debug format is not supported in Session.query
                Please try using parameters in the connection string instead:
                Eg: conn = connect(f"db_path?verbose&log-level=test")"""
            )
            fmt = "CSV"
        return self._conn.query(sql, fmt)

    # alias sql = query
    sql = query

    def send_query(self, sql, fmt="CSV") -> StreamingResult:
        """
        Execute a streaming query.
        """
        if fmt == "Debug":
            warnings.warn(
                """Debug format is not supported in Session.query
                Please try using parameters in the connection string instead:
                Eg: conn = connect(f"db_path?verbose&log-level=test")"""
            )
            fmt = "CSV"
        return self._conn.send_query(sql, fmt)
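
In practice, the session is what makes state persist between calls: every `query` goes through the same underlying connection, so databases and tables created earlier stay visible. A minimal sketch (database and table names are illustrative):

from chdb import session

# path=None defaults to ":memory:", discarded when the session closes
with session.Session() as sess:
    sess.query("CREATE DATABASE IF NOT EXISTS db")
    sess.query("CREATE TABLE db.t (x UInt32) ENGINE = MergeTree ORDER BY x")
    sess.query("INSERT INTO db.t VALUES (1), (2), (3)")
    print(sess.query("SELECT sum(x) FROM db.t"))  # CSV output by default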
chdb/state/__init__.py
ADDED
chdb/state/sqlitelike.py
ADDED
@@ -0,0 +1,336 @@
from typing import Optional, Any
from chdb import _chdb

# try to import pyarrow; if it fails, raise ImportError with a suggestion
try:
    import pyarrow as pa  # noqa
except ImportError as e:
    print(f"ImportError: {e}")
    print('Please install pyarrow via "pip install pyarrow"')
    raise ImportError("Failed to import pyarrow") from None


_arrow_format = set({"dataframe", "arrowtable"})
_process_result_format_funs = {
    "dataframe": lambda x: to_df(x),
    "arrowtable": lambda x: to_arrowTable(x),
}


# return pyarrow table
def to_arrowTable(res):
    """convert res to arrow table"""
    # try to import pyarrow and pandas; if it fails, raise ImportError with a suggestion
    try:
        import pyarrow as pa  # noqa
        import pandas as pd  # noqa
    except ImportError as e:
        print(f"ImportError: {e}")
        print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
        raise ImportError("Failed to import pyarrow or pandas") from None
    if len(res) == 0:
        return pa.Table.from_batches([], schema=pa.schema([]))
    return pa.RecordBatchFileReader(res.bytes()).read_all()


# return pandas dataframe
def to_df(r):
    """convert arrow table to Dataframe"""
    t = to_arrowTable(r)
    return t.to_pandas(use_threads=True)


class StreamingResult:
    def __init__(self, c_result, conn, result_func):
        self._result = c_result
        self._result_func = result_func
        self._conn = conn
        self._exhausted = False

    def fetch(self):
        """Fetch next chunk of streaming results"""
        if self._exhausted:
            return None

        try:
            result = self._conn.streaming_fetch_result(self._result)
            if result is None or result.rows_read() == 0:
                self._exhausted = True
                return None
            return self._result_func(result)
        except Exception as e:
            self._exhausted = True
            raise RuntimeError(f"Streaming query failed: {str(e)}") from e

    def __iter__(self):
        return self

    def __next__(self):
        if self._exhausted:
            raise StopIteration

        chunk = self.fetch()
        if chunk is None:
            self._exhausted = True
            raise StopIteration

        return chunk

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def cancel(self):
        self._exhausted = True

        try:
            self._conn.streaming_cancel_query(self._result)
        except Exception as e:
            raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e


class Connection:
    def __init__(self, connection_string: str):
        # print("Connection", connection_string)
        self._cursor: Optional[Cursor] = None
        self._conn = _chdb.connect(connection_string)

    def cursor(self) -> "Cursor":
        self._cursor = Cursor(self._conn)
        return self._cursor

    def query(self, query: str, format: str = "CSV") -> Any:
        lower_output_format = format.lower()
        result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
        if lower_output_format in _arrow_format:
            format = "Arrow"

        result = self._conn.query(query, format)
        return result_func(result)

    def send_query(self, query: str, format: str = "CSV") -> StreamingResult:
        lower_output_format = format.lower()
        result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
        if lower_output_format in _arrow_format:
            format = "Arrow"

        c_stream_result = self._conn.send_query(query, format)
        return StreamingResult(c_stream_result, self._conn, result_func)

    def close(self) -> None:
        # print("close")
        if self._cursor:
            self._cursor.close()
        self._conn.close()


class Cursor:
    def __init__(self, connection):
        self._conn = connection
        self._cursor = self._conn.cursor()
        self._current_table: Optional[pa.Table] = None
        self._current_row: int = 0

    def execute(self, query: str) -> None:
        self._cursor.execute(query)
        result_mv = self._cursor.get_memview()
        if self._cursor.has_error():
            raise Exception(self._cursor.error_message())
        if self._cursor.data_size() == 0:
            self._current_table = None
            self._current_row = 0
            self._column_names = []
            self._column_types = []
            return

        # Parse JSON data
        json_data = result_mv.tobytes().decode("utf-8")
        import json

        try:
            # First line contains column names
            # Second line contains column types
            # Following lines contain data
            lines = json_data.strip().split("\n")
            if len(lines) < 2:
                self._current_table = None
                self._current_row = 0
                self._column_names = []
                self._column_types = []
                return

            self._column_names = json.loads(lines[0])
            self._column_types = json.loads(lines[1])

            # Convert data rows
            rows = []
            for line in lines[2:]:
                if not line.strip():
                    continue
                row_data = json.loads(line)
                converted_row = []
                for val, type_info in zip(row_data, self._column_types):
                    # Handle NULL values first
                    if val is None:
                        converted_row.append(None)
                        continue

                    # Basic type conversion
                    try:
                        if type_info.startswith("Int") or type_info.startswith("UInt"):
                            converted_row.append(int(val))
                        elif type_info.startswith("Float"):
                            converted_row.append(float(val))
                        elif type_info == "Bool":
                            converted_row.append(bool(val))
                        elif type_info == "String" or type_info == "FixedString":
                            converted_row.append(str(val))
                        elif type_info.startswith("DateTime"):
                            from datetime import datetime

                            # Check if the value is numeric (timestamp)
                            val_str = str(val)
                            if val_str.replace(".", "").isdigit():
                                converted_row.append(datetime.fromtimestamp(float(val)))
                            else:
                                # Handle datetime string formats
                                if "." in val_str:  # Has microseconds
                                    converted_row.append(
                                        datetime.strptime(
                                            val_str, "%Y-%m-%d %H:%M:%S.%f"
                                        )
                                    )
                                else:  # No microseconds
                                    converted_row.append(
                                        datetime.strptime(val_str, "%Y-%m-%d %H:%M:%S")
                                    )
                        elif type_info.startswith("Date"):
                            from datetime import date, datetime

                            # Check if the value is numeric (days since epoch)
                            val_str = str(val)
                            if val_str.isdigit():
                                converted_row.append(
                                    date.fromtimestamp(float(val) * 86400)
                                )
                            else:
                                # Handle date string format
                                converted_row.append(
                                    datetime.strptime(val_str, "%Y-%m-%d").date()
                                )
                        else:
                            # For unsupported types, keep as string
                            converted_row.append(str(val))
                    except (ValueError, TypeError):
                        # If conversion fails, keep original value as string
                        converted_row.append(str(val))
                rows.append(tuple(converted_row))

            self._current_table = rows
            self._current_row = 0

        except json.JSONDecodeError as e:
            raise Exception(f"Failed to parse JSON data: {e}")

    def commit(self) -> None:
        self._cursor.commit()

    def fetchone(self) -> Optional[tuple]:
        if not self._current_table or self._current_row >= len(self._current_table):
            return None

        # Now self._current_table is a list of row tuples
        row = self._current_table[self._current_row]
        self._current_row += 1
        return row

    def fetchmany(self, size: int = 1) -> tuple:
        if not self._current_table:
            return tuple()

        rows = []
        for _ in range(size):
            if (row := self.fetchone()) is None:
                break
            rows.append(row)
        return tuple(rows)

    def fetchall(self) -> tuple:
        if not self._current_table:
            return tuple()

        remaining_rows = []
        while (row := self.fetchone()) is not None:
            remaining_rows.append(row)
        return tuple(remaining_rows)

    def close(self) -> None:
        self._cursor.close()

    def __iter__(self):
        return self

    def __next__(self) -> tuple:
        row = self.fetchone()
        if row is None:
            raise StopIteration
        return row

    def column_names(self) -> list:
        """Return a list of column names from the last executed query"""
        return self._column_names if hasattr(self, "_column_names") else []

    def column_types(self) -> list:
        """Return a list of column types from the last executed query"""
        return self._column_types if hasattr(self, "_column_types") else []

    @property
    def description(self) -> list:
        """
        Return a description of the columns as per DB-API 2.0
        Returns a list of 7-item tuples, each containing:
        (name, type_code, display_size, internal_size, precision, scale, null_ok)
        where only name and type_code are provided
        """
        if not hasattr(self, "_column_names") or not self._column_names:
            return []

        return [
            (name, type_info, None, None, None, None, None)
            for name, type_info in zip(self._column_names, self._column_types)
        ]


def connect(connection_string: str = ":memory:") -> Connection:
    """
    Create a connection to the chDB background server.
    Only one open connection is allowed per process. Use `close` to close the connection.
    If called with the same connection string, the same connection object will be returned.
    You can use the connection object to create a cursor object; the `cursor` method will return a cursor object.

    Args:
        connection_string (str, optional): Connection string. Defaults to ":memory:".
            Also supports file paths like:
            - ":memory:" (for in-memory database)
            - "test.db" (for relative path)
            - "file:test.db" (same as above)
            - "/path/to/test.db" (for absolute path)
            - "file:/path/to/test.db" (same as above)
            - "file:test.db?param1=value1&param2=value2" (for relative path with query params)
            - "file::memory:?verbose&log-level=test" (for in-memory database with query params)
            - "///path/to/test.db?param1=value1&param2=value2" (for absolute path)

        Connection string args handling:
            Connection string can contain query params like "file:test.db?param1=value1&param2=value2"
            "param1=value1" will be passed to the ClickHouse engine as start-up args.

            For more details, see `clickhouse local --help --verbose`
            Some special args handling:
            - "mode=ro" would be "--readonly=1" for clickhouse (read-only mode)

    Returns:
        Connection: Connection object
    """
    return Connection(connection_string)
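
Taken together: `connect()` returns a `Connection`; `Connection.query` materializes a whole result (and routes "dataframe"/"arrowtable" through the Arrow helpers above), `send_query` returns an iterable `StreamingResult`, and `cursor()` layers the DB-API-style row interface on top of the JSON parsing in `Cursor.execute`. A minimal sketch of all three paths:

from chdb.state.sqlitelike import connect

conn = connect(":memory:")

# One-shot query; "dataframe" switches the engine format to Arrow and converts via to_df()
df = conn.query("SELECT number AS n FROM system.numbers LIMIT 5", "dataframe")
print(df)

# Streaming query: chunks arrive one at a time instead of being materialized at once
for chunk in conn.send_query("SELECT number FROM system.numbers LIMIT 100000", "CSV"):
    pass  # process each chunk as it arrives

# Cursor interface: rows come back as Python tuples with basic type conversion
cur = conn.cursor()
cur.execute("SELECT 1 AS x, 'hello' AS s")
print(cur.column_names(), cur.fetchall())
cur.close()

conn.close()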
chdb/udf/__init__.py
ADDED