chdb-3.6.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl → chdb-3.7.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chdb might be problematic. See the registry's advisory page for more details.

chdb/__init__.py CHANGED
@@ -4,7 +4,38 @@ import threading
4
4
 
5
5
 
6
6
  class ChdbError(Exception):
7
- """Base class for exceptions in this module."""
7
+ """Base exception class for chDB-related errors.
8
+
9
+ This exception is raised when chDB query execution fails or encounters
10
+ an error. It inherits from the standard Python Exception class and
11
+ provides error information from the underlying ClickHouse engine.
12
+
13
+ The exception message typically contains detailed error information
14
+ from ClickHouse, including syntax errors, type mismatches, missing
15
+ tables/columns, and other query execution issues.
16
+
17
+ Attributes:
18
+ args: Tuple containing the error message and any additional arguments
19
+
20
+ Examples:
21
+ >>> try:
22
+ ... result = chdb.query("SELECT * FROM non_existent_table")
23
+ ... except chdb.ChdbError as e:
24
+ ... print(f"Query failed: {e}")
25
+ Query failed: Table 'non_existent_table' doesn't exist
26
+
27
+ >>> try:
28
+ ... result = chdb.query("SELECT invalid_syntax FROM")
29
+ ... except chdb.ChdbError as e:
30
+ ... print(f"Syntax error: {e}")
31
+ Syntax error: Syntax error near 'FROM'
32
+
33
+ Note:
34
+ This exception is automatically raised by chdb.query() and related
35
+ functions when the underlying ClickHouse engine reports an error.
36
+ You should catch this exception when handling potentially failing
37
+ queries to provide appropriate error handling in your application.
38
+ """
8
39
 
9
40
 
10
41
  _arrow_format = set({"dataframe", "arrowtable"})
@@ -19,7 +50,7 @@ _process_result_format_funs = {
19
50
  # UDF script path will be f"{g_udf_path}/{func_name}.py"
20
51
  g_udf_path = ""
21
52
 
22
- chdb_version = ('3', '6', '0')
53
+ __version__ = "3.7.0"
23
54
  if sys.version_info[:2] >= (3, 7):
24
55
  # get the path of the current file
25
56
  current_path = os.path.dirname(os.path.abspath(__file__))
@@ -36,17 +67,32 @@ if sys.version_info[:2] >= (3, 7):
36
67
  else:
37
68
  raise NotImplementedError("Python 3.6 or lower version is not supported")
38
69
 
39
- try:
40
- # Change here if project is renamed and does not equal the package name
41
- dist_name = __name__
42
- __version__ = ".".join(map(str, chdb_version))
43
- except: # noqa
44
- __version__ = "unknown"
70
+ chdb_version = tuple(__version__.split('.'))
45
71
 
46
72
 
47
73
  # return pyarrow table
48
74
  def to_arrowTable(res):
49
- """convert res to arrow table"""
75
+ """Convert query result to PyArrow Table.
76
+
77
+ Converts a chDB query result to a PyArrow Table for efficient columnar data processing.
78
+ Returns an empty table if the result is empty.
79
+
80
+ Args:
81
+ res: chDB query result object containing binary Arrow data
82
+
83
+ Returns:
84
+ pa.Table: PyArrow Table containing the query results
85
+
86
+ Raises:
87
+ ImportError: If pyarrow or pandas are not installed
88
+
89
+ Example:
90
+ >>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow")
91
+ >>> table = chdb.to_arrowTable(result)
92
+ >>> print(table.to_pandas())
93
+ id msg
94
+ 0 1 hello
95
+ """
50
96
  # try import pyarrow and pandas, if failed, raise ImportError with suggestion
51
97
  try:
52
98
  import pyarrow as pa # noqa
@@ -57,12 +103,34 @@ def to_arrowTable(res):
57
103
  raise ImportError("Failed to import pyarrow or pandas") from None
58
104
  if len(res) == 0:
59
105
  return pa.Table.from_batches([], schema=pa.schema([]))
60
- return pa.RecordBatchFileReader(res.bytes()).read_all()
106
+
107
+ memview = res.get_memview()
108
+ return pa.RecordBatchFileReader(memview.view()).read_all()
61
109
 
62
110
 
63
111
  # return pandas dataframe
64
112
  def to_df(r):
65
- """convert arrow table to Dataframe"""
113
+ """Convert query result to pandas DataFrame.
114
+
115
+ Converts a chDB query result to a pandas DataFrame by first converting to
116
+ PyArrow Table and then to pandas using multi-threading for better performance.
117
+
118
+ Args:
119
+ r: chDB query result object containing binary Arrow data
120
+
121
+ Returns:
122
+ pd.DataFrame: pandas DataFrame containing the query results
123
+
124
+ Raises:
125
+ ImportError: If pyarrow or pandas are not installed
126
+
127
+ Example:
128
+ >>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow")
129
+ >>> df = chdb.to_df(result)
130
+ >>> print(df)
131
+ id msg
132
+ 0 1 hello
133
+ """
66
134
  t = to_arrowTable(r)
67
135
  return t.to_pandas(use_threads=True)
68
136
 
@@ -73,6 +141,59 @@ g_conn_lock = threading.Lock()
73
141
 
74
142
  # wrap _chdb functions
75
143
  def query(sql, output_format="CSV", path="", udf_path=""):
144
+ """Execute SQL query using chDB engine.
145
+
146
+ This is the main query function that executes SQL statements using the embedded
147
+ ClickHouse engine. Supports various output formats and can work with in-memory
148
+ or file-based databases.
149
+
150
+ Args:
151
+ sql (str): SQL query string to execute
152
+ output_format (str, optional): Output format for results. Defaults to "CSV".
153
+ Supported formats include:
154
+
155
+ - "CSV" - Comma-separated values
156
+ - "JSON" - JSON format
157
+ - "Arrow" - Apache Arrow format
158
+ - "Parquet" - Parquet format
159
+ - "DataFrame" - Pandas DataFrame
160
+ - "ArrowTable" - PyArrow Table
161
+ - "Debug" - Enable verbose logging
162
+
163
+ path (str, optional): Database file path. Defaults to "" (in-memory database).
164
+ Can be a file path or ":memory:" for in-memory database.
165
+ udf_path (str, optional): Path to User-Defined Functions directory. Defaults to "".
166
+
167
+ Returns:
168
+ Query result in the specified format:
169
+
170
+ - str: For text formats like CSV, JSON
171
+ - pd.DataFrame: When output_format is "DataFrame" or "dataframe"
172
+ - pa.Table: When output_format is "ArrowTable" or "arrowtable"
173
+ - chdb result object: For other formats
174
+
175
+ Raises:
176
+ ChdbError: If the SQL query execution fails
177
+ ImportError: If required dependencies are missing for DataFrame/Arrow formats
178
+
179
+ Examples:
180
+ >>> # Basic CSV query
181
+ >>> result = chdb.query("SELECT 1, 'hello'")
182
+ >>> print(result)
183
+ "1,hello"
184
+
185
+ >>> # Query with DataFrame output
186
+ >>> df = chdb.query("SELECT 1 as id, 'hello' as msg", "DataFrame")
187
+ >>> print(df)
188
+ id msg
189
+ 0 1 hello
190
+
191
+ >>> # Query with file-based database
192
+ >>> result = chdb.query("CREATE TABLE test (id INT)", path="mydb.chdb")
193
+
194
+ >>> # Query with UDF
195
+ >>> result = chdb.query("SELECT my_udf('test')", udf_path="/path/to/udfs")
196
+ """
76
197
  global g_udf_path
77
198
  if udf_path != "":
78
199
  g_udf_path = udf_path
chdb/_chdb.abi3.so CHANGED
Binary file
@@ -8,8 +8,13 @@ except ImportError as e:
8
8
  raise ImportError('Failed to import pyarrow or pandas') from None
9
9
 
10
10
  # check if pandas version >= 2.0.0
11
- if pd.__version__[0] < '2':
12
- print('Please upgrade pandas to version 2.0.0 or higher to have better performance')
11
+ try:
12
+ version_parts = pd.__version__.split('.')
13
+ major_version = int(version_parts[0])
14
+ if major_version < 2:
15
+ print('Please upgrade pandas to version 2.0.0 or higher to have better performance')
16
+ except (ValueError, IndexError, AttributeError):
17
+ pass
13
18
 
14
19
  from .query import Table, pandas_read_parquet # noqa: C0413
15
20
 
chdb/dataframe/query.py CHANGED
@@ -8,11 +8,38 @@ from chdb import query as chdb_query
8
8
 
9
9
 
10
10
  class Table:
11
- """
12
- Table is a wrapper of multiple formats of data buffer, including parquet file path,
13
- parquet bytes, and pandas dataframe.
14
- if use_memfd is True, will try using memfd_create to create a temp file in memory, which is
15
- only available on Linux. If failed, will fallback to use tempfile.mkstemp to create a temp file
11
+ """Wrapper for multiple data formats enabling SQL queries on DataFrames, Parquet files, and Arrow tables.
12
+
13
+ The Table class provides a unified interface for querying different data formats using SQL.
14
+ It supports pandas DataFrames, Parquet files (both on disk and in memory), and PyArrow Tables.
15
+ All data is internally converted to Parquet format for efficient querying with chDB.
16
+
17
+ Args:
18
+ parquet_path (str, optional): Path to an existing Parquet file
19
+ temp_parquet_path (str, optional): Path to a temporary Parquet file
20
+ parquet_memoryview (memoryview, optional): Parquet data in memory as memoryview
21
+ dataframe (pd.DataFrame, optional): pandas DataFrame to wrap
22
+ arrow_table (pa.Table, optional): PyArrow Table to wrap
23
+ use_memfd (bool, optional): Use memfd_create for temporary files (Linux only). Defaults to False.
24
+
25
+ Examples:
26
+ >>> # Create from pandas DataFrame
27
+ >>> import pandas as pd
28
+ >>> df = pd.DataFrame({'id': [1, 2], 'name': ['Alice', 'Bob']})
29
+ >>> table = Table(dataframe=df)
30
+ >>> result = table.query("SELECT * FROM __table__ WHERE id > 1")
31
+
32
+ >>> # Create from Parquet file
33
+ >>> table = Table(parquet_path="data.parquet")
34
+ >>> result = table.query("SELECT COUNT(*) FROM __table__")
35
+
36
+ >>> # Multi-table queries
37
+ >>> table1 = Table(dataframe=df1)
38
+ >>> table2 = Table(dataframe=df2)
39
+ >>> result = Table.queryStatic(
40
+ ... "SELECT * FROM __table1__ JOIN __table2__ ON __table1__.id = __table2__.id",
41
+ ... table1=table1, table2=table2
42
+ ... )
16
43
  """
17
44
 
18
45
  def __init__(
@@ -24,9 +51,18 @@ class Table:
24
51
  arrow_table: pa.Table = None,
25
52
  use_memfd: bool = False,
26
53
  ):
27
- """
28
- Initialize a Table object with one of parquet file path, parquet bytes, pandas dataframe or
29
- parquet table.
54
+ """Initialize a Table object with one of the supported data formats.
55
+
56
+ Only one data source should be provided. The Table will wrap the provided data
57
+ and enable SQL querying capabilities.
58
+
59
+ Args:
60
+ parquet_path (str, optional): Path to existing Parquet file
61
+ temp_parquet_path (str, optional): Path to temporary Parquet file
62
+ parquet_memoryview (memoryview, optional): Parquet data in memory
63
+ dataframe (pd.DataFrame, optional): pandas DataFrame to wrap
64
+ arrow_table (pa.Table, optional): PyArrow Table to wrap
65
+ use_memfd (bool, optional): Use memory-based file descriptors on Linux
30
66
  """
31
67
  self._parquet_path = parquet_path
32
68
  self._temp_parquet_path = temp_parquet_path
@@ -46,15 +82,47 @@ class Table:
46
82
  pass
47
83
 
48
84
  def rows_read(self):
85
+ """Get the number of rows read from the last query operation.
86
+
87
+ Returns:
88
+ int: Number of rows processed in the last query
89
+ """
49
90
  return self._rows_read
50
91
 
51
92
  def bytes_read(self):
93
+ """Get the number of bytes read from the last query operation.
94
+
95
+ Returns:
96
+ int: Number of bytes processed in the last query
97
+ """
52
98
  return self._bytes_read
53
99
 
54
100
  def elapsed(self):
101
+ """Get the elapsed time for the last query operation.
102
+
103
+ Returns:
104
+ float: Query execution time
105
+ """
55
106
  return self._elapsed
56
107
 
57
108
  def to_pandas(self) -> pd.DataFrame:
109
+ """Convert the Table data to a pandas DataFrame.
110
+
111
+ This method handles conversion from various internal formats (Parquet files,
112
+ memory buffers, Arrow tables) to a unified pandas DataFrame representation.
113
+
114
+ Returns:
115
+ pd.DataFrame: The table data as a pandas DataFrame
116
+
117
+ Raises:
118
+ ValueError: If no data source is available in the Table object
119
+
120
+ Example:
121
+ >>> table = Table(dataframe=df)
122
+ >>> result_table = table.query("SELECT * FROM __table__ LIMIT 5")
123
+ >>> df_result = result_table.to_pandas()
124
+ >>> print(df_result)
125
+ """
58
126
  if self._dataframe is None:
59
127
  if self._arrow_table is not None:
60
128
  return self._arrow_table.to_pandas()
@@ -71,8 +139,20 @@ class Table:
71
139
  return self._dataframe
72
140
 
73
141
  def flush_to_disk(self):
74
- """
75
- Flush the data in memory to disk.
142
+ """Flush in-memory data to disk as a temporary Parquet file.
143
+
144
+ This method converts in-memory data (DataFrame, Arrow table, or memory buffer)
145
+ to a temporary Parquet file on disk. This can be useful for memory management
146
+ or when working with large datasets.
147
+
148
+ The method does nothing if data is already stored on disk.
149
+
150
+ Raises:
151
+ ValueError: If the Table object contains no data to flush
152
+
153
+ Example:
154
+ >>> table = Table(dataframe=large_df)
155
+ >>> table.flush_to_disk() # Frees memory, keeps data accessible
76
156
  """
77
157
  if self._parquet_path is not None or self._temp_parquet_path is not None:
78
158
  return
@@ -112,10 +192,33 @@ class Table:
112
192
  return str(self.to_pandas())
113
193
 
114
194
  def query(self, sql: str, **kwargs) -> "Table":
115
- """
116
- Query on current Table object, return a new Table object.
117
- The `FROM` table name in SQL should always be `__table__`. eg:
118
- `SELECT * FROM __table__ WHERE ...`
195
+ """Execute SQL query on the current Table and return a new Table with results.
196
+
197
+ This method allows you to run SQL queries on the table data using chDB.
198
+ The table is referenced as '__table__' in the SQL statement.
199
+
200
+ Args:
201
+ sql (str): SQL query string. Must reference the table as '__table__'
202
+ **kwargs: Additional arguments passed to the chDB query engine
203
+
204
+ Returns:
205
+ Table: New Table object containing the query results
206
+
207
+ Raises:
208
+ ValueError: If SQL doesn't contain '__table__' reference or if Table is not initialized
209
+
210
+ Examples:
211
+ >>> table = Table(dataframe=df)
212
+ >>> # Filter rows
213
+ >>> result = table.query("SELECT * FROM __table__ WHERE age > 25")
214
+ >>>
215
+ >>> # Aggregate data
216
+ >>> summary = table.query("SELECT COUNT(*), AVG(salary) FROM __table__")
217
+ >>>
218
+ >>> # Complex operations
219
+ >>> processed = table.query(
220
+ ... "SELECT name, age * 2 as double_age FROM __table__ ORDER BY age DESC"
221
+ ... )
119
222
  """
120
223
  self._validate_sql(sql)
121
224
 
@@ -138,6 +241,18 @@ class Table:
138
241
  sql = query
139
242
 
140
243
  def show(self):
244
+ """Display the Table data by printing the pandas DataFrame representation.
245
+
246
+ This is a convenience method for quickly viewing the table contents.
247
+ Equivalent to print(table.to_pandas()).
248
+
249
+ Example:
250
+ >>> table = Table(dataframe=df)
251
+ >>> table.show()
252
+ id name
253
+ 0 1 Alice
254
+ 1 2 Bob
255
+ """
141
256
  print(self.to_pandas())
142
257
 
143
258
  def _query_on_path(self, path, sql, **kwargs):
@@ -220,12 +335,51 @@ class Table:
220
335
 
221
336
  @staticmethod
222
337
  def queryStatic(sql: str, **kwargs) -> "Table":
223
- """
224
- Query on multiple Tables, use Table variables as the table name in SQL
225
- eg.
226
- table1 = Table(...)
227
- table2 = Table(...)
228
- query("SELECT * FROM __table1__ JOIN __table2__ ON ...", table1=table1, table2=table2)
338
+ """Execute SQL query across multiple Table objects.
339
+
340
+ This static method enables complex queries involving multiple tables by referencing
341
+ them as '__tablename__' in the SQL and passing them as keyword arguments.
342
+
343
+ Args:
344
+ sql (str): SQL query with table references as '__name__' patterns
345
+ **kwargs: Table objects referenced in the SQL, where key matches the table name
346
+ Can also include pandas DataFrames, which will be auto-converted to Tables
347
+
348
+ Returns:
349
+ Table: New Table object containing the query results
350
+
351
+ Raises:
352
+ ValueError: If referenced table names are missing from kwargs or have invalid types
353
+
354
+ Examples:
355
+ >>> users = Table(dataframe=users_df)
356
+ >>> orders = Table(dataframe=orders_df)
357
+ >>>
358
+ >>> # Join two tables
359
+ >>> result = Table.queryStatic(
360
+ ... "SELECT u.name, COUNT(o.id) as order_count "
361
+ ... "FROM __users__ u LEFT JOIN __orders__ o ON u.id = o.user_id "
362
+ ... "GROUP BY u.name",
363
+ ... users=users, orders=orders
364
+ ... )
365
+ >>>
366
+ >>> # Works with pandas DataFrames directly
367
+ >>> result = Table.queryStatic(
368
+ ... "SELECT * FROM __df1__ UNION ALL SELECT * FROM __df2__",
369
+ ... df1=dataframe1, df2=dataframe2
370
+ ... )
371
+ >>>
372
+ >>> # Complex multi-table operations
373
+ >>> analytics = Table.queryStatic(
374
+ ... "SELECT p.category, AVG(o.amount) as avg_order "
375
+ ... "FROM __products__ p "
376
+ ... "JOIN __order_items__ oi ON p.id = oi.product_id "
377
+ ... "JOIN __orders__ o ON oi.order_id = o.id "
378
+ ... "GROUP BY p.category ORDER BY avg_order DESC",
379
+ ... products=products_table,
380
+ ... order_items=order_items_table,
381
+ ... orders=orders_table
382
+ ... )
229
383
  """
230
384
  ansiTablePattern = re.compile(r"__([a-zA-Z][a-zA-Z0-9_]*)__")
231
385
  temp_paths = []
@@ -322,13 +476,47 @@ class Table:
322
476
 
323
477
 
324
478
  def pandas_read_parquet(path) -> pd.DataFrame:
479
+ """Read a Parquet file into a pandas DataFrame.
480
+
481
+ This is a convenience wrapper around pandas.read_parquet() for consistency
482
+ with the chdb.dataframe module interface.
483
+
484
+ Args:
485
+ path: File path or file-like object to read from
486
+
487
+ Returns:
488
+ pd.DataFrame: The loaded DataFrame
489
+ """
325
490
  return pd.read_parquet(path)
326
491
 
327
492
 
328
493
  def memfd_create(name: str = None) -> int:
329
- """
330
- Try to use memfd_create(2) to create a file descriptor with memory.
331
- Only available on Linux 3.17 or newer with glibc 2.27 or newer.
494
+ """Create an in-memory file descriptor using memfd_create system call.
495
+
496
+ This function attempts to use the Linux-specific memfd_create(2) system call
497
+ to create a file descriptor that refers to an anonymous memory-backed file.
498
+ This provides better performance for temporary data operations.
499
+
500
+ Args:
501
+ name (str, optional): Name for the memory file (for debugging). Defaults to None.
502
+
503
+ Returns:
504
+ int: File descriptor on success, -1 on failure or if not supported
505
+
506
+ Note:
507
+ This function only works on Linux 3.17 or newer with glibc 2.27 or newer.
508
+ On other systems or if the call fails, it returns -1 and callers should
509
+ fall back to regular temporary files.
510
+
511
+ Example:
512
+ >>> fd = memfd_create("temp_data")
513
+ >>> if fd != -1:
514
+ ... # Use memory-based file descriptor
515
+ ... with os.fdopen(fd, 'wb') as f:
516
+ ... f.write(data)
517
+ ... else:
518
+ ... # Fall back to regular temp file
519
+ ... fd, path = tempfile.mkstemp()
332
520
  """
333
521
  if hasattr(os, "memfd_create"):
334
522
  try:
chdb/dbapi/__init__.py CHANGED
@@ -13,20 +13,58 @@ paramstyle = "format"
13
13
 
14
14
 
15
15
  class DBAPISet(frozenset):
16
+ """Extended frozenset for DB-API 2.0 type comparison.
17
+
18
+ This class extends frozenset to support DB-API 2.0 type comparison semantics.
19
+ It allows for flexible type checking where individual items can be compared
20
+ against the set using both equality and inequality operators.
21
+
22
+ This is used for type constants like STRING, BINARY, NUMBER, etc. to enable
23
+ comparisons like "field_type == STRING" where field_type is a single type value.
24
+
25
+ Examples:
26
+ >>> string_types = DBAPISet([FIELD_TYPE.STRING, FIELD_TYPE.VAR_STRING])
27
+ >>> FIELD_TYPE.STRING == string_types # Returns True
28
+ >>> FIELD_TYPE.INT != string_types # Returns True
29
+ >>> FIELD_TYPE.BLOB in string_types # Returns False
30
+ """
16
31
 
17
32
  def __ne__(self, other):
33
+ """Check inequality with flexible type comparison.
34
+
35
+ Args:
36
+ other: Value to compare against this set
37
+
38
+ Returns:
39
+ bool: True if other is not in this set (for non-set types) or
40
+ True if sets are not equal (for set types)
41
+ """
18
42
  if isinstance(other, set):
19
43
  return frozenset.__ne__(self, other)
20
44
  else:
21
45
  return other not in self
22
46
 
23
47
  def __eq__(self, other):
48
+ """Check equality with flexible type comparison.
49
+
50
+ Args:
51
+ other: Value to compare against this set
52
+
53
+ Returns:
54
+ bool: True if other is in this set (for non-set types) or
55
+ True if sets are equal (for set types)
56
+ """
24
57
  if isinstance(other, frozenset):
25
58
  return frozenset.__eq__(self, other)
26
59
  else:
27
60
  return other in self
28
61
 
29
62
  def __hash__(self):
63
+ """Return hash value for the set.
64
+
65
+ Returns:
66
+ int: Hash value of the underlying frozenset
67
+ """
30
68
  return frozenset.__hash__(self)
31
69
 
32
70
 
@@ -47,7 +85,17 @@ ROWID = DBAPISet()
47
85
 
48
86
 
49
87
  def Binary(x):
50
- """Return x as a binary type."""
88
+ """Return x as a binary type.
89
+
90
+ This function converts the input to bytes type for use with binary
91
+ database fields, following the DB-API 2.0 specification.
92
+
93
+ Args:
94
+ x: Input data to convert to binary
95
+
96
+ Returns:
97
+ bytes: The input converted to bytes
98
+ """
51
99
  return bytes(x)
52
100
 
53
101
 
@@ -65,7 +113,14 @@ if _orig_conn.Connection.__init__.__doc__ is not None:
65
113
  del _orig_conn
66
114
 
67
115
 
68
- def get_client_info(): # for MySQLdb compatibility
116
+ def get_client_info():
117
+ """Get client version information.
118
+
119
+ Returns the chDB client version as a string for MySQLdb compatibility.
120
+
121
+ Returns:
122
+ str: Version string in format 'major.minor.patch'
123
+ """
69
124
  version = chdb_version
70
125
  if len(chdb_version) > 3 and chdb_version[3] is None:
71
126
  version = chdb_version[:3]