PyPI - chdb - Versions diffs - 3.2.0__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 3.4.0__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - Mend

chdb 3.2.0__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 3.4.0__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of chdb might be problematic. Click here for more details.

Files changed (8)

chdb/__init__.py +1 -1
chdb/_chdb.cpython-38-aarch64-linux-gnu.so +0 -0
chdb/session/state.py +3 -15
{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/METADATA +84 -11
{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/RECORD +8 -8
{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/LICENSE.txt +0 -0
{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/WHEEL +0 -0
{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/top_level.txt +0 -0

chdb/__init__.py CHANGED Viewed

@@ -19,7 +19,7 @@ _process_result_format_funs = {
 # UDF script path will be f"{g_udf_path}/{func_name}.py"
 g_udf_path = ""
-chdb_version = ('3', '2', '0')
+chdb_version = ('3', '4', '0')
 if sys.version_info[:2] >= (3, 7):
     # get the path of the current file
     current_path = os.path.dirname(os.path.abspath(__file__))

chdb/_chdb.cpython-38-aarch64-linux-gnu.so CHANGED Viewed

Binary file

chdb/session/state.py CHANGED Viewed

@@ -1,5 +1,3 @@
-import tempfile
-import shutil
 import warnings
 import chdb
@@ -51,11 +49,9 @@ class Session:
             )
             g_session.close()
             g_session_path = None
-        if path is None or ":memory:" in path:
-            self._cleanup = True
-            self._path = tempfile.mkdtemp()
+        if path is None:
+            self._path = ":memory:"
         else:
-            self._cleanup = False
             self._path = path
         if chdb.g_udf_path != "":
             self._udf_path = chdb.g_udf_path
@@ -84,8 +80,6 @@ class Session:
         self.close()
     def close(self):
-        if self._cleanup:
-            self.cleanup()
         if self._conn is not None:
             self._conn.close()
             self._conn = None
@@ -95,13 +89,7 @@ class Session:
     def cleanup(self):
         try:
-            if self._conn is not None:
-                self._conn.close()
-                self._conn = None
-            shutil.rmtree(self._path)
-            global g_session, g_session_path
-            g_session = None
-            g_session_path = None
+            self.close()
         except:  # noqa
             pass

{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: chdb
-Version: 3.2.0
+Version: 3.4.0
 Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
 Home-page: https://github.com/chdb-io/chdb
 Author: auxten
@@ -51,11 +51,11 @@ Requires-Dist: pandas >=2.0.0
 > chDB is an in-process SQL OLAP Engine powered by ClickHouse  [^1]
-> For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
+> For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
 ## Features
 * In-process SQL OLAP Engine, powered by ClickHouse
 * No need to install ClickHouse
 * Minimized data copy from C++ to Python with [python memoryview](https://docs.python.org/3/c-api/memoryview.html)
@@ -146,7 +146,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
 # See more data type format in tests/format_output.py
 res = chdb.query('select * from file("data.parquet", Parquet)', 'JSON'); print(res)
 res = chdb.query('select * from file("data.csv", CSV)', 'CSV');  print(res)
-print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, elapsed {res.elapsed()} seconds")
+print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, storage read {res.storage_rows_read()} rows, {res.storage_bytes_read()} bytes, elapsed {res.elapsed()} seconds")
 ```
 ### Pandas dataframe output
@@ -171,6 +171,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
 print(ret_tbl)
 # Query on the DataFrame Table
 print(ret_tbl.query('select b, sum(a) from __table__ group by b'))
+# Pandas DataFrames are automatically registered as temporary tables in ClickHouse
+chdb.query("SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c").show()
 ```
 </details>
@@ -318,10 +320,19 @@ df = pd.DataFrame(
     {
         "a": [1, 2, 3, 4, 5, 6],
         "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
+        "dict_col": [
+            {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
+            {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
+            {'id': 3, 'name': 'tom'},
+            {'id': 4, 'value': '100'},
+            {'id': 5, 'value': 101},
+            {'id': 6, 'value': 102},
+        ],
     }
 )
 chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b").show()
+chdb.query("SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'").show()
 ```
 ### Query on Arrow Table
@@ -333,12 +344,19 @@ arrow_table = pa.table(
     {
         "a": [1, 2, 3, 4, 5, 6],
         "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
+        "dict_col": [
+            {'id': 1, 'value': 'tom'},
+            {'id': 2, 'value': 'jerry'},
+            {'id': 3, 'value': 'auxten'},
+            {'id': 4, 'value': 'tom'},
+            {'id': 5, 'value': 'jerry'},
+            {'id': 6, 'value': 'auxten'},
+        ],
     }
 )
-chdb.query(
-    "SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b", "debug"
-).show()
+chdb.query("SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b").show()
+chdb.query("SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'").show()
 ```
 ### Query on chdb.PyReader class instance
@@ -362,24 +380,79 @@ class myReader(chdb.PyReader):
     def read(self, col_names, count):
         print("Python func read", col_names, count, self.cursor)
         if self.cursor >= len(self.data["a"]):
+            self.cursor = 0
             return []
         block = [self.data[col] for col in col_names]
         self.cursor += len(block[0])
         return block
+    def get_schema(self):
+        return [
+            ("a", "int"),
+            ("b", "str"),
+            ("dict_col", "json")
+        ]
 reader = myReader(
     {
         "a": [1, 2, 3, 4, 5, 6],
         "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
+        "dict_col": [
+            {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
+            {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
+            {'id': 3, 'name': 'tom'},
+            {'id': 4, 'value': '100'},
+            {'id': 5, 'value': 101},
+            {'id': 6, 'value': 102}
+        ],
     }
 )
-chdb.query(
-    "SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b"
-).show()
+chdb.query("SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b").show()
+chdb.query("SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'").show()
 ```
-see also: [test_query_py.py](tests/test_query_py.py).
+see also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
+### JSON Type Inference
+chDB automatically converts Python dictionary objects to ClickHouse JSON types from these sources:
+1. **Pandas DataFrame**
+    - Columns with `object` dtype are sampled (default 10,000 rows) to detect JSON structures.
+    - Control sampling via SQL settings:
+      ```sql
+      SET pandas_analyze_sample = 10000  -- Default sampling
+      SET pandas_analyze_sample = 0      -- Force String type
+      SET pandas_analyze_sample = -1     -- Force JSON type
+      ```
+    - Columns are converted to `String` if sampling finds non-dictionary values.
+2. **Arrow Table**
+    - `struct` type columns are automatically mapped to JSON columns.
+    - Nested structures preserve type information.
+3. **chdb.PyReader**
+    - Implement custom schema mapping in `get_schema()`:
+      ```python
+      def get_schema(self):
+          return [
+              ("c1", "JSON"),  # Explicit JSON mapping
+              ("c2", "String")
+          ]
+      ```
+    - Column types declared as "JSON" will bypass auto-detection.
+When converting Python dictionary objects to JSON columns:
+1. **Nested Structures**
+    - Recursively process nested dictionaries, lists, tuples and NumPy arrays.
+2. **Primitive Types**
+    - Automatic type recognition for basic types such as integers, floats, strings, and booleans, and more.
+3. **Complex Objects**
+    - Non-primitive types will be converted to strings.
 ### Limitations

{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
 chdb/__main__.py,sha256=xNNtDY38d973YM5dlxiIazcqqKhXJSpNb7JflyyrXGE,1185
-chdb/__init__.py,sha256=yuWj0i3_5-uBRZCyZMBKIiBR1MmjEyAjcuxKTm076jI,3762
-chdb/_chdb.cpython-38-aarch64-linux-gnu.so,sha256=p6q6ZujWMVJL1w2bFECrDzmf69T2tDXH7BYbjmtGEQQ,533414320
+chdb/__init__.py,sha256=pNYyLRqm2s5hG1rPhVLECZXvJQ60EGPt_OC88W90j0w,3762
+chdb/_chdb.cpython-38-aarch64-linux-gnu.so,sha256=jWd7xJeE74xK21ep7dDeYMtYrEW-VVzxYN4HgQf8PiY,568828616
 chdb/rwabc.py,sha256=tbiwCrXirfrfx46wCJxS64yvFe6pVWIPGdSuvrAL5Ys,2102
 chdb/session/__init__.py,sha256=fCUROZ5L1-92o2lcASiWJpFu-80-kDoSrNfouLEmLg8,50
-chdb/session/state.py,sha256=UtObxVuyNgeqFkTXVHtmOknR90Pe1dEzbOpKFDBYOkg,4845
+chdb/session/state.py,sha256=m7K9zZtoMQTlh-pfmSyJV38pAe6eHNTPtOvlHYrImhA,4436
 chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
 chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
 chdb/utils/__init__.py,sha256=tXRcwBRGW2YQNBZWV4Mitw5QlCu_qlSRCjllw15XHbs,171
@@ -21,8 +21,8 @@ chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSA
 chdb/dbapi/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
 chdb/state/sqlitelike.py,sha256=v0xh9jWirHzhDVq26C2213LxfaDbRulSAhSHaTiZ24c,12283
-chdb-3.2.0.dist-info/RECORD,,
-chdb-3.2.0.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
-chdb-3.2.0.dist-info/WHEEL,sha256=kTTaziUGh4IgOXqp-kkEc0J_ej7kOw0CdWocYU-1ZR0,149
-chdb-3.2.0.dist-info/METADATA,sha256=SPZ4Gn_IrMBxh3-zVDXQpQejpUV33SdEAVdqGarSLJ8,21517
-chdb-3.2.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
+chdb-3.4.0.dist-info/RECORD,,
+chdb-3.4.0.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
+chdb-3.4.0.dist-info/WHEEL,sha256=kTTaziUGh4IgOXqp-kkEc0J_ej7kOw0CdWocYU-1ZR0,149
+chdb-3.4.0.dist-info/METADATA,sha256=niEMQj5RD1T34yBzfkJzW9K4gziEvAnITAuzNN10AIs,24622
+chdb-3.4.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11

{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes