chdb 3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of chdb might be problematic.

chdb/__init__.py CHANGED
@@ -19,7 +19,7 @@ _process_result_format_funs = {
 # UDF script path will be f"{g_udf_path}/{func_name}.py"
 g_udf_path = ""
 
-chdb_version = ('3', '4', '1')
+chdb_version = ('3', '5', '0')
 if sys.version_info[:2] >= (3, 7):
     # get the path of the current file
     current_path = os.path.dirname(os.path.abspath(__file__))
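
The bumped version tuple is a module-level variable, so the upgrade can be verified at runtime. A minimal sketch, assuming the 3.5.0 wheel is installed:

```python
# Check the module-level version tuple that the diff above bumps.
import chdb

print(chdb.chdb_version)  # ('3', '5', '0')
```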
chdb/__main__.py CHANGED
@@ -4,9 +4,12 @@ from .__init__ import query
 
 def main():
     prog = 'python -m chdb'
+    custom_usage = "%(prog)s [-h] \"SELECT 1\" [format]"
     description = ('''A simple command line interface for chdb
                    to run SQL and output in specified format''')
-    parser = argparse.ArgumentParser(prog=prog, description=description)
+    parser = argparse.ArgumentParser(prog=prog,
+                                     usage=custom_usage,
+                                     description=description)
     parser.add_argument('sql', nargs=1,
                         type=str,
                         help='sql, e.g: select 1112222222,555')
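
The custom usage string documents the CLI shape: one required SQL string and an optional output format. A minimal sketch of an invocation matching that usage line, assuming the wheel is installed in the current interpreter (the `subprocess` wrapper is illustrative, not part of chdb):

```python
# Invoke the CLI the way the new usage string describes:
#   python -m chdb [-h] "SELECT 1" [format]
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "-m", "chdb", "SELECT 1, 'hello'", "CSV"],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout, end="")  # CSV-formatted query result
```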
chdb/_chdb.cpython-38-aarch64-linux-gnu.so CHANGED (binary file, contents not shown)
chdb/state/sqlitelike.py CHANGED
@@ -41,11 +41,12 @@ def to_df(r):
 
 
 class StreamingResult:
-    def __init__(self, c_result, conn, result_func):
+    def __init__(self, c_result, conn, result_func, supports_record_batch):
         self._result = c_result
         self._result_func = result_func
         self._conn = conn
         self._exhausted = False
+        self._supports_record_batch = supports_record_batch
 
     def fetch(self):
         """Fetch next chunk of streaming results"""
@@ -80,15 +81,182 @@ class StreamingResult:
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
+        self.cancel()
+
+    def close(self):
+        self.cancel()
 
     def cancel(self):
-        self._exhausted = True
+        if not self._exhausted:
+            self._exhausted = True
+            try:
+                self._conn.streaming_cancel_query(self._result)
+            except Exception as e:
+                raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e
 
-        try:
-            self._conn.streaming_cancel_query(self._result)
-        except Exception as e:
-            raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e
+    def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader:
+        """
+        Create a PyArrow RecordBatchReader from this StreamingResult.
+
+        This method requires that the StreamingResult was created with arrow format.
+        It wraps the streaming result with ChdbRecordBatchReader to provide efficient
+        batching with configurable batch sizes.
+
+        Args:
+            rows_per_batch (int): Number of rows per batch. Defaults to 1000000.
+
+        Returns:
+            pa.RecordBatchReader: PyArrow RecordBatchReader for efficient streaming
+
+        Raises:
+            ValueError: If the StreamingResult was not created with arrow format
+        """
+        if not self._supports_record_batch:
+            raise ValueError(
+                "record_batch() can only be used with arrow format. "
+                "Please use format='Arrow' when calling send_query."
+            )
+
+        chdb_reader = ChdbRecordBatchReader(self, rows_per_batch)
+        return pa.RecordBatchReader.from_batches(chdb_reader.schema(), chdb_reader)
+
+
+class ChdbRecordBatchReader:
+    """
+    A PyArrow RecordBatchReader wrapper for chdb StreamingResult.
+
+    This class provides an efficient way to read large result sets as PyArrow RecordBatches
+    with configurable batch sizes to optimize memory usage and performance.
+    """
+
+    def __init__(self, chdb_stream_result, batch_size_rows):
+        self._stream_result = chdb_stream_result
+        self._schema = None
+        self._closed = False
+        self._pending_batches = []
+        self._accumulator = []
+        self._batch_size_rows = batch_size_rows
+        self._current_rows = 0
+        self._first_batch = None
+        self._first_batch_consumed = True
+        self._schema = self.schema()
+
+    def schema(self):
+        if self._schema is None:
+            # Get the first chunk to determine schema
+            chunk = self._stream_result.fetch()
+            if chunk is not None:
+                arrow_bytes = chunk.bytes()
+                reader = pa.RecordBatchFileReader(arrow_bytes)
+                self._schema = reader.schema
+
+                table = reader.read_all()
+                if table.num_rows > 0:
+                    batches = table.to_batches()
+                    self._first_batch = batches[0]
+                    if len(batches) > 1:
+                        self._pending_batches = batches[1:]
+                    self._first_batch_consumed = False
+                else:
+                    self._first_batch = None
+                    self._first_batch_consumed = True
+            else:
+                self._schema = pa.schema([])
+                self._first_batch = None
+                self._first_batch_consumed = True
+                self._closed = True
+        return self._schema
+
+    def read_next_batch(self):
+        if self._accumulator:
+            result = self._accumulator.pop(0)
+            return result
+
+        if self._closed:
+            raise StopIteration
+
+        while True:
+            batch = None
+
+            # 1. Return the first batch if not consumed yet
+            if not self._first_batch_consumed:
+                self._first_batch_consumed = True
+                batch = self._first_batch
+
+            # 2. Check pending batches from current chunk
+            elif self._pending_batches:
+                batch = self._pending_batches.pop(0)
+
+            # 3. Fetch new chunk from chdb stream
+            else:
+                chunk = self._stream_result.fetch()
+                if chunk is None:
+                    # No more data - return accumulated batches if any
+                    break
+
+                arrow_bytes = chunk.bytes()
+                if not arrow_bytes:
+                    continue
+
+                reader = pa.RecordBatchFileReader(arrow_bytes)
+                table = reader.read_all()
+
+                if table.num_rows > 0:
+                    batches = table.to_batches()
+                    batch = batches[0]
+                    if len(batches) > 1:
+                        self._pending_batches = batches[1:]
+                else:
+                    continue
+
+            # Process the batch if we got one
+            if batch is not None:
+                self._accumulator.append(batch)
+                self._current_rows += batch.num_rows
+
+                # If accumulated enough rows, return combined batch
+                if self._current_rows >= self._batch_size_rows:
+                    if len(self._accumulator) == 1:
+                        result = self._accumulator.pop(0)
+                    else:
+                        if hasattr(pa, 'concat_batches'):
+                            result = pa.concat_batches(self._accumulator)
+                            self._accumulator = []
+                        else:
+                            result = self._accumulator.pop(0)
+
+                    self._current_rows = 0
+                    return result
+
+        # End of stream - return any accumulated batches
+        if self._accumulator:
+            if len(self._accumulator) == 1:
+                result = self._accumulator.pop(0)
+            else:
+                if hasattr(pa, 'concat_batches'):
+                    result = pa.concat_batches(self._accumulator)
+                    self._accumulator = []
+                else:
+                    result = self._accumulator.pop(0)

+            self._current_rows = 0
+            self._closed = True
+            return result
+
+        # No more data
+        self._closed = True
+        raise StopIteration
+
+    def close(self):
+        if not self._closed:
+            self._stream_result.close()
+            self._closed = True
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return self.read_next_batch()
 
 
 class Connection:
@@ -112,12 +280,13 @@ class Connection:
 
     def send_query(self, query: str, format: str = "CSV") -> StreamingResult:
         lower_output_format = format.lower()
+        supports_record_batch = lower_output_format == "arrow"
         result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
         if lower_output_format in _arrow_format:
             format = "Arrow"
 
         c_stream_result = self._conn.send_query(query, format)
-        return StreamingResult(c_stream_result, self._conn, result_func)
+        return StreamingResult(c_stream_result, self._conn, result_func, supports_record_batch)
 
     def close(self) -> None:
         # print("close")
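
Taken together, these changes let an Arrow-format streaming result be drained through a `pyarrow.RecordBatchReader`, and make the `with` statement (whose `__exit__` now calls `cancel()`) clean up an unfinished stream. A minimal sketch, assuming chdb 3.5.0 and pyarrow are installed; `Session` and `send_query` are as shown in the package README:

```python
# Stream an Arrow-format query as PyArrow record batches (sketch, chdb >= 3.5.0).
from chdb import session as chs

sess = chs.Session()
# Only format="Arrow" marks the StreamingResult as record_batch-capable.
with sess.send_query("SELECT number FROM numbers(300000)", "Arrow") as stream:
    reader = stream.record_batch(rows_per_batch=100000)  # pa.RecordBatchReader
    total = sum(batch.num_rows for batch in reader)
print(total)  # 300000
sess.close()
```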
chdb-3.5.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: chdb
-Version: 3.4.1
+Version: 3.5.0
 Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
 Home-page: https://github.com/chdb-io/chdb
 Author: auxten
@@ -295,16 +295,37 @@ while True:
     if chunk is None:
         break
     if rows_cnt > 0:
-        stream_result.cancel()
+        stream_result.close()
         break
     rows_cnt += chunk.rows_read()
 
 print(rows_cnt) # 65409
 
+# Example 4: Using PyArrow RecordBatchReader for batch export and integration with other libraries
+import pyarrow as pa
+from deltalake import write_deltalake
+
+# Get streaming result in arrow format
+stream_result = sess.send_query("SELECT * FROM numbers(100000)", "Arrow")
+
+# Create RecordBatchReader with custom batch size (default rows_per_batch=1000000)
+batch_reader = stream_result.record_batch(rows_per_batch=10000)
+
+# Use RecordBatchReader with external libraries like Delta Lake
+write_deltalake(
+    table_or_uri="./my_delta_table",
+    data=batch_reader,
+    mode="overwrite"
+)
+
+stream_result.close()
+
 sess.close()
 ```
 
-For more details, see [test_streaming_query.py](tests/test_streaming_query.py).
+**Important Note**: When using streaming queries, if the `StreamingResult` is not fully consumed (due to errors or early termination), you must explicitly call `stream_result.close()` to release resources, or use the `with` statement for automatic cleanup. Failure to do so may block subsequent queries.
+
+For more details, see [test_streaming_query.py](tests/test_streaming_query.py) and [test_arrow_record_reader_deltalake.py](tests/test_arrow_record_reader_deltalake.py).
 </details>
 
 
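The note added above is the behavioral heart of this release: an abandoned `StreamingResult` used to leave the query running and could block the session. A minimal sketch of both cleanup options, assuming chdb 3.5.0 and using only calls shown in the README (`fetch()`, `rows_read()`, `close()`):

```python
# Two ways to release an unfinished stream (sketch, chdb >= 3.5.0).
from chdb import session as chs

sess = chs.Session()

# Option 1: explicit close() after early termination.
stream = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
chunk = stream.fetch()  # consume only the first chunk
print(chunk.rows_read())
stream.close()          # cancels the query so later queries don't block

# Option 2: `with` statement; __exit__ now calls cancel() automatically.
with sess.send_query("SELECT * FROM numbers(200000)", "CSV") as stream:
    chunk = stream.fetch()

sess.close()
```
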
@@ -507,6 +528,10 @@ There are something you can help:
 
 We welcome bindings for other languages, please refer to [bindings](bindings.md) for more details.
 
+## Version Guide
+
+Please refer to [VERSION-GUIDE.md](VERSION-GUIDE.md) for more details.
+
 ## Paper
 
 - [ClickHouse - Lightning Fast Analytics for Everyone](https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf)
chdb-3.5.0.dist-info/RECORD CHANGED
@@ -1,28 +1,28 @@
-chdb/__main__.py,sha256=xNNtDY38d973YM5dlxiIazcqqKhXJSpNb7JflyyrXGE,1185
-chdb/__init__.py,sha256=KjR7cb7QFtjCqvasu81WQvX-2LeHjx-rSB3preiRefI,3762
-chdb/_chdb.cpython-38-aarch64-linux-gnu.so,sha256=9hdZgNc0h05058RlHQCkoGF8u-oECZPvgTggNkBeLkA,568828904
 chdb/rwabc.py,sha256=tbiwCrXirfrfx46wCJxS64yvFe6pVWIPGdSuvrAL5Ys,2102
+chdb/__main__.py,sha256=vl-gorTYCT9Uh_h4jbQ8O-a5_pokCJPFbF_yplIgKYc,1336
+chdb/_chdb.cpython-38-aarch64-linux-gnu.so,sha256=ljNW7N1IUXRFGwIJdiBZCfG_Fi7n25f2VOMWfb4Qsgw,568916304
+chdb/__init__.py,sha256=7GGNYb_0PsxWRaU5QubVCgH1gHXa5ZPAqPuKir1Rx-I,3762
 chdb/session/__init__.py,sha256=fCUROZ5L1-92o2lcASiWJpFu-80-kDoSrNfouLEmLg8,50
 chdb/session/state.py,sha256=m7K9zZtoMQTlh-pfmSyJV38pAe6eHNTPtOvlHYrImhA,4436
-chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
-chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
 chdb/utils/__init__.py,sha256=tXRcwBRGW2YQNBZWV4Mitw5QlCu_qlSRCjllw15XHbs,171
-chdb/utils/types.py,sha256=MGLFIjoDvu7Uc2Wy8EDY60jjue66HmMPxbhrujjrZxQ,7530
 chdb/utils/trace.py,sha256=W-pvDoKlnzq6H_7FiWjr5_teN40UNE4E5--zbUrjOIc,2511
-chdb/udf/__init__.py,sha256=qSMaPEre7w1pYz8uJ-iZtuu8wYOUNRcI_8UNuaOymGE,80
+chdb/utils/types.py,sha256=MGLFIjoDvu7Uc2Wy8EDY60jjue66HmMPxbhrujjrZxQ,7530
+chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
+chdb/state/sqlitelike.py,sha256=PHdIJVfbSUvJWU6doMnrg0jVpovtHUG12I_-acZHOko,18338
 chdb/udf/udf.py,sha256=z0A1RmyZrx55bykpvvS-LpVt1lMrQOexjvU5zxCdCSA,3935
-chdb/dbapi/connections.py,sha256=RW0EcusyKueMGp7VmSaCO-ukyzY7l2ps_ibA9-pXDvo,2754
+chdb/udf/__init__.py,sha256=qSMaPEre7w1pYz8uJ-iZtuu8wYOUNRcI_8UNuaOymGE,80
+chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
+chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
 chdb/dbapi/times.py,sha256=_qXgDaYwsHntvpIKSKXp1rrYIgtq6Z9pLyLnO2XNoL0,360
-chdb/dbapi/__init__.py,sha256=aaNhxXNBC1ZkFr260cbGR8msOinTp0VoNTT_j8AXGUc,2205
 chdb/dbapi/converters.py,sha256=0SDqgixUTCz0LtWke_HHzgF1lFJhpsQrR_-ky3b-JRY,7447
 chdb/dbapi/cursors.py,sha256=3ufVB1zt3x7SzCYowVbwAOsuzkMxYPO74q9XW6ctkKo,8120
+chdb/dbapi/__init__.py,sha256=aaNhxXNBC1ZkFr260cbGR8msOinTp0VoNTT_j8AXGUc,2205
+chdb/dbapi/connections.py,sha256=RW0EcusyKueMGp7VmSaCO-ukyzY7l2ps_ibA9-pXDvo,2754
 chdb/dbapi/err.py,sha256=kUI9-A8LNqBoMoo4jh2NFsLCOLoPEwh9YIuz_qMoLoM,2017
-chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSAb60z8,370
 chdb/dbapi/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
-chdb/state/sqlitelike.py,sha256=v0xh9jWirHzhDVq26C2213LxfaDbRulSAhSHaTiZ24c,12283
-chdb-3.4.1.dist-info/RECORD,,
-chdb-3.4.1.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
-chdb-3.4.1.dist-info/WHEEL,sha256=kTTaziUGh4IgOXqp-kkEc0J_ej7kOw0CdWocYU-1ZR0,149
-chdb-3.4.1.dist-info/METADATA,sha256=5nvTBNyYcoHPpDBLIxbCz5ZMWbFueTWEOOTwwFHq1c0,24622
-chdb-3.4.1.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
+chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSAb60z8,370
+chdb-3.5.0.dist-info/WHEEL,sha256=kTTaziUGh4IgOXqp-kkEc0J_ej7kOw0CdWocYU-1ZR0,149
+chdb-3.5.0.dist-info/METADATA,sha256=BV7QlaT95FZsDo26kyyq8eN9z6hVkIyI58ZogC45s2s,25714
+chdb-3.5.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
+chdb-3.5.0.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
+chdb-3.5.0.dist-info/RECORD,,