chdb-3.1.2-cp38-cp38-macosx_10_15_x86_64.whl → chdb-3.3.0-cp38-cp38-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chdb might be problematic. See the registry's advisory page for more details.

chdb/__init__.py CHANGED
@@ -19,7 +19,7 @@ _process_result_format_funs = {
19
19
  # UDF script path will be f"{g_udf_path}/{func_name}.py"
20
20
  g_udf_path = ""
21
21
 
22
- chdb_version = ('3', '1', '2')
22
+ chdb_version = ('3', '3', '0')
23
23
  if sys.version_info[:2] >= (3, 7):
24
24
  # get the path of the current file
25
25
  current_path = os.path.dirname(os.path.abspath(__file__))
Binary file
chdb/session/state.py CHANGED
@@ -4,7 +4,7 @@ import warnings
4
4
 
5
5
  import chdb
6
6
  from ..state import sqlitelike as chdb_stateful
7
-
7
+ from ..state.sqlitelike import StreamingResult
8
8
 
9
9
  g_session = None
10
10
  g_session_path = None
@@ -120,3 +120,16 @@ Eg: conn = connect(f"db_path?verbose&log-level=test")"""
120
120
 
121
121
  # alias sql = query
122
122
  sql = query
123
+
124
+ def send_query(self, sql, fmt="CSV") -> StreamingResult:
125
+ """
126
+ Execute a streaming query.
127
+ """
128
+ if fmt == "Debug":
129
+ warnings.warn(
130
+ """Debug format is not supported in Session.query
131
+ Please try use parameters in connection string instead:
132
+ Eg: conn = connect(f"db_path?verbose&log-level=test")"""
133
+ )
134
+ fmt = "CSV"
135
+ return self._conn.send_query(sql, fmt)
chdb/state/sqlitelike.py CHANGED
@@ -40,6 +40,57 @@ def to_df(r):
40
40
  return t.to_pandas(use_threads=True)
41
41
 
42
42
 
43
+ class StreamingResult:
44
+ def __init__(self, c_result, conn, result_func):
45
+ self._result = c_result
46
+ self._result_func = result_func
47
+ self._conn = conn
48
+ self._exhausted = False
49
+
50
+ def fetch(self):
51
+ """Fetch next chunk of streaming results"""
52
+ if self._exhausted:
53
+ return None
54
+
55
+ try:
56
+ result = self._conn.streaming_fetch_result(self._result)
57
+ if result is None or result.rows_read() == 0:
58
+ self._exhausted = True
59
+ return None
60
+ return self._result_func(result)
61
+ except Exception as e:
62
+ self._exhausted = True
63
+ raise RuntimeError(f"Streaming query failed: {str(e)}") from e
64
+
65
+ def __iter__(self):
66
+ return self
67
+
68
+ def __next__(self):
69
+ if self._exhausted:
70
+ raise StopIteration
71
+
72
+ chunk = self.fetch()
73
+ if chunk is None:
74
+ self._exhausted = True
75
+ raise StopIteration
76
+
77
+ return chunk
78
+
79
+ def __enter__(self):
80
+ return self
81
+
82
+ def __exit__(self, exc_type, exc_val, exc_tb):
83
+ pass
84
+
85
+ def cancel(self):
86
+ self._exhausted = True
87
+
88
+ try:
89
+ self._conn.streaming_cancel_query(self._result)
90
+ except Exception as e:
91
+ raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e
92
+
93
+
43
94
  class Connection:
44
95
  def __init__(self, connection_string: str):
45
96
  # print("Connection", connection_string)
@@ -59,6 +110,15 @@ class Connection:
59
110
  result = self._conn.query(query, format)
60
111
  return result_func(result)
61
112
 
113
+ def send_query(self, query: str, format: str = "CSV") -> StreamingResult:
114
+ lower_output_format = format.lower()
115
+ result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
116
+ if lower_output_format in _arrow_format:
117
+ format = "Arrow"
118
+
119
+ c_stream_result = self._conn.send_query(query, format)
120
+ return StreamingResult(c_stream_result, self._conn, result_func)
121
+
62
122
  def close(self) -> None:
63
123
  # print("close")
64
124
  if self._cursor:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: chdb
3
- Version: 3.1.2
3
+ Version: 3.3.0
4
4
  Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
5
5
  Home-page: https://github.com/chdb-io/chdb
6
6
  Author: auxten
@@ -51,11 +51,11 @@ Requires-Dist: pandas >=2.0.0
51
51
 
52
52
 
53
53
  > chDB is an in-process SQL OLAP Engine powered by ClickHouse [^1]
54
- > For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
54
+ > For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
55
55
 
56
56
 
57
57
  ## Features
58
-
58
+
59
59
  * In-process SQL OLAP Engine, powered by ClickHouse
60
60
  * No need to install ClickHouse
61
61
  * Minimized data copy from C++ to Python with [python memoryview](https://docs.python.org/3/c-api/memoryview.html)
@@ -93,6 +93,44 @@ python3 -m chdb "SELECT 1,'abc'" Pretty
93
93
  ### Data Input
94
94
  The following methods are available to access on-disk and in-memory data formats:
95
95
 
96
+ <details>
97
+ <summary><h4>🗂️ Connection based API (recommended)</h4></summary>
98
+
99
+ ```python
100
+ import chdb
101
+
102
+ # Create a connection (in-memory by default)
103
+ conn = chdb.connect(":memory:")
104
+ # Or use file-based: conn = chdb.connect("test.db")
105
+
106
+ # Create a cursor
107
+ cur = conn.cursor()
108
+
109
+ # Execute queries
110
+ cur.execute("SELECT number, toString(number) as str FROM system.numbers LIMIT 3")
111
+
112
+ # Fetch data in different ways
113
+ print(cur.fetchone()) # Single row: (0, '0')
114
+ print(cur.fetchmany(2)) # Multiple rows: ((1, '1'), (2, '2'))
115
+
116
+ # Get column information
117
+ print(cur.column_names()) # ['number', 'str']
118
+ print(cur.column_types()) # ['UInt64', 'String']
119
+
120
+ # Use the cursor as an iterator
121
+ cur.execute("SELECT number FROM system.numbers LIMIT 3")
122
+ for row in cur:
123
+ print(row)
124
+
125
+ # Always close resources when done
126
+ cur.close()
127
+ conn.close()
128
+ ```
129
+
130
+ For more details, see [examples/connect.py](examples/connect.py).
131
+ </details>
132
+
133
+
96
134
  <details>
97
135
  <summary><h4>🗂️ Query On File</h4> (Parquet, CSV, JSON, Arrow, ORC and 60+)</summary>
98
136
 
@@ -108,7 +146,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
108
146
  # See more data type format in tests/format_output.py
109
147
  res = chdb.query('select * from file("data.parquet", Parquet)', 'JSON'); print(res)
110
148
  res = chdb.query('select * from file("data.csv", CSV)', 'CSV'); print(res)
111
- print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, elapsed {res.elapsed()} seconds")
149
+ print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, storage read {res.storage_rows_read()} rows, {res.storage_bytes_read()} bytes, elapsed {res.elapsed()} seconds")
112
150
  ```
113
151
 
114
152
  ### Pandas dataframe output
@@ -133,6 +171,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
133
171
  print(ret_tbl)
134
172
  # Query on the DataFrame Table
135
173
  print(ret_tbl.query('select b, sum(a) from __table__ group by b'))
174
+ # Pandas DataFrames are automatically registered as temporary tables in ClickHouse
175
+ chdb.query("SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c").show()
136
176
  ```
137
177
  </details>
138
178
 
@@ -218,6 +258,56 @@ see also: [test_udf.py](tests/test_udf.py).
218
258
  </details>
219
259
 
220
260
 
261
+ <details>
262
+ <summary><h4>🗂️ Streaming Query</h4></summary>
263
+
264
+ Process large datasets with constant memory usage through chunked streaming.
265
+
266
+ ```python
267
+ from chdb import session as chs
268
+
269
+ sess = chs.Session()
270
+
271
+ # Example 1: Basic example of using streaming query
272
+ rows_cnt = 0
273
+ with sess.send_query("SELECT * FROM numbers(200000)", "CSV") as stream_result:
274
+ for chunk in stream_result:
275
+ rows_cnt += chunk.rows_read()
276
+
277
+ print(rows_cnt) # 200000
278
+
279
+ # Example 2: Manual iteration with fetch()
280
+ rows_cnt = 0
281
+ stream_result = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
282
+ while True:
283
+ chunk = stream_result.fetch()
284
+ if chunk is None:
285
+ break
286
+ rows_cnt += chunk.rows_read()
287
+
288
+ print(rows_cnt) # 200000
289
+
290
+ # Example 3: Early cancellation demo
291
+ rows_cnt = 0
292
+ stream_result = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
293
+ while True:
294
+ chunk = stream_result.fetch()
295
+ if chunk is None:
296
+ break
297
+ if rows_cnt > 0:
298
+ stream_result.cancel()
299
+ break
300
+ rows_cnt += chunk.rows_read()
301
+
302
+ print(rows_cnt) # 65409
303
+
304
+ sess.close()
305
+ ```
306
+
307
+ For more details, see [test_streaming_query.py](tests/test_streaming_query.py).
308
+ </details>
309
+
310
+
221
311
  <details>
222
312
  <summary><h4>🗂️ Python Table Engine</h4></summary>
223
313
 
@@ -230,10 +320,19 @@ df = pd.DataFrame(
230
320
  {
231
321
  "a": [1, 2, 3, 4, 5, 6],
232
322
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
323
+ "dict_col": [
324
+ {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
325
+ {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
326
+ {'id': 3, 'name': 'tom'},
327
+ {'id': 4, 'value': '100'},
328
+ {'id': 5, 'value': 101},
329
+ {'id': 6, 'value': 102},
330
+ ],
233
331
  }
234
332
  )
235
333
 
236
334
  chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b").show()
335
+ chdb.query("SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'").show()
237
336
  ```
238
337
 
239
338
  ### Query on Arrow Table
@@ -245,12 +344,19 @@ arrow_table = pa.table(
245
344
  {
246
345
  "a": [1, 2, 3, 4, 5, 6],
247
346
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
347
+ "dict_col": [
348
+ {'id': 1, 'value': 'tom'},
349
+ {'id': 2, 'value': 'jerry'},
350
+ {'id': 3, 'value': 'auxten'},
351
+ {'id': 4, 'value': 'tom'},
352
+ {'id': 5, 'value': 'jerry'},
353
+ {'id': 6, 'value': 'auxten'},
354
+ ],
248
355
  }
249
356
  )
250
357
 
251
- chdb.query(
252
- "SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b", "debug"
253
- ).show()
358
+ chdb.query("SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b").show()
359
+ chdb.query("SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'").show()
254
360
  ```
255
361
 
256
362
  ### Query on chdb.PyReader class instance
@@ -274,24 +380,79 @@ class myReader(chdb.PyReader):
274
380
  def read(self, col_names, count):
275
381
  print("Python func read", col_names, count, self.cursor)
276
382
  if self.cursor >= len(self.data["a"]):
383
+ self.cursor = 0
277
384
  return []
278
385
  block = [self.data[col] for col in col_names]
279
386
  self.cursor += len(block[0])
280
387
  return block
281
388
 
389
+ def get_schema(self):
390
+ return [
391
+ ("a", "int"),
392
+ ("b", "str"),
393
+ ("dict_col", "json")
394
+ ]
395
+
282
396
  reader = myReader(
283
397
  {
284
398
  "a": [1, 2, 3, 4, 5, 6],
285
399
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
400
+ "dict_col": [
401
+ {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
402
+ {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
403
+ {'id': 3, 'name': 'tom'},
404
+ {'id': 4, 'value': '100'},
405
+ {'id': 5, 'value': 101},
406
+ {'id': 6, 'value': 102}
407
+ ],
286
408
  }
287
409
  )
288
410
 
289
- chdb.query(
290
- "SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b"
291
- ).show()
411
+ chdb.query("SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b").show()
412
+ chdb.query("SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'").show()
292
413
  ```
293
414
 
294
- see also: [test_query_py.py](tests/test_query_py.py).
415
+ see also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
416
+
417
+ ### JSON Type Inference
418
+
419
+ chDB automatically converts Python dictionary objects to ClickHouse JSON types from these sources:
420
+
421
+ 1. **Pandas DataFrame**
422
+ - Columns with `object` dtype are sampled (default 10,000 rows) to detect JSON structures.
423
+ - Control sampling via SQL settings:
424
+ ```sql
425
+ SET pandas_analyze_sample = 10000 -- Default sampling
426
+ SET pandas_analyze_sample = 0 -- Force String type
427
+ SET pandas_analyze_sample = -1 -- Force JSON type
428
+ ```
429
+ - Columns are converted to `String` if sampling finds non-dictionary values.
430
+
431
+ 2. **Arrow Table**
432
+ - `struct` type columns are automatically mapped to JSON columns.
433
+ - Nested structures preserve type information.
434
+
435
+ 3. **chdb.PyReader**
436
+ - Implement custom schema mapping in `get_schema()`:
437
+ ```python
438
+ def get_schema(self):
439
+ return [
440
+ ("c1", "JSON"), # Explicit JSON mapping
441
+ ("c2", "String")
442
+ ]
443
+ ```
444
+ - Column types declared as "JSON" will bypass auto-detection.
445
+
446
+ When converting Python dictionary objects to JSON columns:
447
+
448
+ 1. **Nested Structures**
449
+ - Recursively process nested dictionaries, lists, tuples and NumPy arrays.
450
+
451
+ 2. **Primitive Types**
452
+ - Automatic type recognition for basic types such as integers, floats, strings, and booleans, and more.
453
+
454
+ 3. **Complex Objects**
455
+ - Non-primitive types will be converted to strings.
295
456
 
296
457
  ### Limitations
297
458
 
@@ -1,6 +1,6 @@
1
- chdb/__init__.py,sha256=Wb4a4CPgJ0j44kDuehkwITZV9Q6QOqyUmxA5PM6BbYk,3762
1
+ chdb/__init__.py,sha256=KsqKKRN2T2Rspn94XwwtR45fT5viF5h6KUJ7JIETo1w,3762
2
2
  chdb/__main__.py,sha256=xNNtDY38d973YM5dlxiIazcqqKhXJSpNb7JflyyrXGE,1185
3
- chdb/_chdb.cpython-38-darwin.so,sha256=oaIxhT5WTYorisJ5eRe7YgR1z761jimyMYqgWLUWaMk,422084800
3
+ chdb/_chdb.cpython-38-darwin.so,sha256=kZkCo6Y0s9qWOY9D-JAFaKOTXb3T-Q14YVxetRhfRvg,422308456
4
4
  chdb/rwabc.py,sha256=tbiwCrXirfrfx46wCJxS64yvFe6pVWIPGdSuvrAL5Ys,2102
5
5
  chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
6
6
  chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
@@ -13,16 +13,16 @@ chdb/dbapi/times.py,sha256=_qXgDaYwsHntvpIKSKXp1rrYIgtq6Z9pLyLnO2XNoL0,360
13
13
  chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSAb60z8,370
14
14
  chdb/dbapi/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  chdb/session/__init__.py,sha256=fCUROZ5L1-92o2lcASiWJpFu-80-kDoSrNfouLEmLg8,50
16
- chdb/session/state.py,sha256=nx9KlqZyPTHAflToXCJVRBUSMjJFvyh6x2akP7Gc7h0,4360
16
+ chdb/session/state.py,sha256=UtObxVuyNgeqFkTXVHtmOknR90Pe1dEzbOpKFDBYOkg,4845
17
17
  chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
18
- chdb/state/sqlitelike.py,sha256=6Y57vnf7LnA0KnpByKQq7PkEkEEOKK-ExaHQLb1bedQ,10498
18
+ chdb/state/sqlitelike.py,sha256=v0xh9jWirHzhDVq26C2213LxfaDbRulSAhSHaTiZ24c,12283
19
19
  chdb/udf/__init__.py,sha256=qSMaPEre7w1pYz8uJ-iZtuu8wYOUNRcI_8UNuaOymGE,80
20
20
  chdb/udf/udf.py,sha256=z0A1RmyZrx55bykpvvS-LpVt1lMrQOexjvU5zxCdCSA,3935
21
21
  chdb/utils/__init__.py,sha256=tXRcwBRGW2YQNBZWV4Mitw5QlCu_qlSRCjllw15XHbs,171
22
22
  chdb/utils/trace.py,sha256=W-pvDoKlnzq6H_7FiWjr5_teN40UNE4E5--zbUrjOIc,2511
23
23
  chdb/utils/types.py,sha256=MGLFIjoDvu7Uc2Wy8EDY60jjue66HmMPxbhrujjrZxQ,7530
24
- chdb-3.1.2.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
25
- chdb-3.1.2.dist-info/METADATA,sha256=EzsMy4v_dK-KxFytLdwErIH-JjEg_TrkW0PkmeN5GBI,19444
26
- chdb-3.1.2.dist-info/WHEEL,sha256=WyxCboCiRNvHww1lxL6mr82B-yTKwQTJtmqg4JQiVzc,109
27
- chdb-3.1.2.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
28
- chdb-3.1.2.dist-info/RECORD,,
24
+ chdb-3.3.0.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
25
+ chdb-3.3.0.dist-info/METADATA,sha256=ILO7J8Bgj69p_fcUbm37--PinnHF3ZJA-1TewaHIoqo,24622
26
+ chdb-3.3.0.dist-info/WHEEL,sha256=WyxCboCiRNvHww1lxL6mr82B-yTKwQTJtmqg4JQiVzc,109
27
+ chdb-3.3.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
28
+ chdb-3.3.0.dist-info/RECORD,,
File without changes