chdb-3.1.2-cp39-cp39-macosx_10_15_x86_64.whl → chdb-3.3.0-cp39-cp39-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chdb might be problematic. Click here for more details.

chdb/__init__.py CHANGED
@@ -19,7 +19,7 @@ _process_result_format_funs = {
19
19
  # UDF script path will be f"{g_udf_path}/{func_name}.py"
20
20
  g_udf_path = ""
21
21
 
22
- chdb_version = ('3', '1', '2')
22
+ chdb_version = ('3', '3', '0')
23
23
  if sys.version_info[:2] >= (3, 7):
24
24
  # get the path of the current file
25
25
  current_path = os.path.dirname(os.path.abspath(__file__))
Binary file
chdb/session/state.py CHANGED
@@ -4,7 +4,7 @@ import warnings
4
4
 
5
5
  import chdb
6
6
  from ..state import sqlitelike as chdb_stateful
7
-
7
+ from ..state.sqlitelike import StreamingResult
8
8
 
9
9
  g_session = None
10
10
  g_session_path = None
@@ -120,3 +120,16 @@ Eg: conn = connect(f"db_path?verbose&log-level=test")"""
120
120
 
121
121
  # alias sql = query
122
122
  sql = query
123
+
124
+ def send_query(self, sql, fmt="CSV") -> StreamingResult:
125
+ """
126
+ Execute a streaming query.
127
+ """
128
+ if fmt == "Debug":
129
+ warnings.warn(
130
+ """Debug format is not supported in Session.query
131
+ Please try use parameters in connection string instead:
132
+ Eg: conn = connect(f"db_path?verbose&log-level=test")"""
133
+ )
134
+ fmt = "CSV"
135
+ return self._conn.send_query(sql, fmt)
chdb/state/sqlitelike.py CHANGED
@@ -40,6 +40,57 @@ def to_df(r):
40
40
  return t.to_pandas(use_threads=True)
41
41
 
42
42
 
43
+ class StreamingResult:
44
+ def __init__(self, c_result, conn, result_func):
45
+ self._result = c_result
46
+ self._result_func = result_func
47
+ self._conn = conn
48
+ self._exhausted = False
49
+
50
+ def fetch(self):
51
+ """Fetch next chunk of streaming results"""
52
+ if self._exhausted:
53
+ return None
54
+
55
+ try:
56
+ result = self._conn.streaming_fetch_result(self._result)
57
+ if result is None or result.rows_read() == 0:
58
+ self._exhausted = True
59
+ return None
60
+ return self._result_func(result)
61
+ except Exception as e:
62
+ self._exhausted = True
63
+ raise RuntimeError(f"Streaming query failed: {str(e)}") from e
64
+
65
+ def __iter__(self):
66
+ return self
67
+
68
+ def __next__(self):
69
+ if self._exhausted:
70
+ raise StopIteration
71
+
72
+ chunk = self.fetch()
73
+ if chunk is None:
74
+ self._exhausted = True
75
+ raise StopIteration
76
+
77
+ return chunk
78
+
79
+ def __enter__(self):
80
+ return self
81
+
82
+ def __exit__(self, exc_type, exc_val, exc_tb):
83
+ pass
84
+
85
+ def cancel(self):
86
+ self._exhausted = True
87
+
88
+ try:
89
+ self._conn.streaming_cancel_query(self._result)
90
+ except Exception as e:
91
+ raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e
92
+
93
+
43
94
  class Connection:
44
95
  def __init__(self, connection_string: str):
45
96
  # print("Connection", connection_string)
@@ -59,6 +110,15 @@ class Connection:
59
110
  result = self._conn.query(query, format)
60
111
  return result_func(result)
61
112
 
113
+ def send_query(self, query: str, format: str = "CSV") -> StreamingResult:
114
+ lower_output_format = format.lower()
115
+ result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
116
+ if lower_output_format in _arrow_format:
117
+ format = "Arrow"
118
+
119
+ c_stream_result = self._conn.send_query(query, format)
120
+ return StreamingResult(c_stream_result, self._conn, result_func)
121
+
62
122
  def close(self) -> None:
63
123
  # print("close")
64
124
  if self._cursor:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chdb
3
- Version: 3.1.2
3
+ Version: 3.3.0
4
4
  Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
5
5
  Home-page: https://github.com/chdb-io/chdb
6
6
  Author: auxten
@@ -54,11 +54,11 @@ Dynamic: requires-python
54
54
 
55
55
 
56
56
  > chDB is an in-process SQL OLAP Engine powered by ClickHouse [^1]
57
- > For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
57
+ > For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
58
58
 
59
59
 
60
60
  ## Features
61
-
61
+
62
62
  * In-process SQL OLAP Engine, powered by ClickHouse
63
63
  * No need to install ClickHouse
64
64
  * Minimized data copy from C++ to Python with [python memoryview](https://docs.python.org/3/c-api/memoryview.html)
@@ -96,6 +96,44 @@ python3 -m chdb "SELECT 1,'abc'" Pretty
96
96
  ### Data Input
97
97
  The following methods are available to access on-disk and in-memory data formats:
98
98
 
99
+ <details>
100
+ <summary><h4>🗂️ Connection based API (recommended)</h4></summary>
101
+
102
+ ```python
103
+ import chdb
104
+
105
+ # Create a connection (in-memory by default)
106
+ conn = chdb.connect(":memory:")
107
+ # Or use file-based: conn = chdb.connect("test.db")
108
+
109
+ # Create a cursor
110
+ cur = conn.cursor()
111
+
112
+ # Execute queries
113
+ cur.execute("SELECT number, toString(number) as str FROM system.numbers LIMIT 3")
114
+
115
+ # Fetch data in different ways
116
+ print(cur.fetchone()) # Single row: (0, '0')
117
+ print(cur.fetchmany(2)) # Multiple rows: ((1, '1'), (2, '2'))
118
+
119
+ # Get column information
120
+ print(cur.column_names()) # ['number', 'str']
121
+ print(cur.column_types()) # ['UInt64', 'String']
122
+
123
+ # Use the cursor as an iterator
124
+ cur.execute("SELECT number FROM system.numbers LIMIT 3")
125
+ for row in cur:
126
+ print(row)
127
+
128
+ # Always close resources when done
129
+ cur.close()
130
+ conn.close()
131
+ ```
132
+
133
+ For more details, see [examples/connect.py](examples/connect.py).
134
+ </details>
135
+
136
+
99
137
  <details>
100
138
  <summary><h4>🗂️ Query On File</h4> (Parquet, CSV, JSON, Arrow, ORC and 60+)</summary>
101
139
 
@@ -111,7 +149,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
111
149
  # See more data type format in tests/format_output.py
112
150
  res = chdb.query('select * from file("data.parquet", Parquet)', 'JSON'); print(res)
113
151
  res = chdb.query('select * from file("data.csv", CSV)', 'CSV'); print(res)
114
- print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, elapsed {res.elapsed()} seconds")
152
+ print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, storage read {res.storage_rows_read()} rows, {res.storage_bytes_read()} bytes, elapsed {res.elapsed()} seconds")
115
153
  ```
116
154
 
117
155
  ### Pandas dataframe output
@@ -136,6 +174,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
136
174
  print(ret_tbl)
137
175
  # Query on the DataFrame Table
138
176
  print(ret_tbl.query('select b, sum(a) from __table__ group by b'))
177
+ # Pandas DataFrames are automatically registered as temporary tables in ClickHouse
178
+ chdb.query("SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c").show()
139
179
  ```
140
180
  </details>
141
181
 
@@ -221,6 +261,56 @@ see also: [test_udf.py](tests/test_udf.py).
221
261
  </details>
222
262
 
223
263
 
264
+ <details>
265
+ <summary><h4>🗂️ Streaming Query</h4></summary>
266
+
267
+ Process large datasets with constant memory usage through chunked streaming.
268
+
269
+ ```python
270
+ from chdb import session as chs
271
+
272
+ sess = chs.Session()
273
+
274
+ # Example 1: Basic example of using streaming query
275
+ rows_cnt = 0
276
+ with sess.send_query("SELECT * FROM numbers(200000)", "CSV") as stream_result:
277
+ for chunk in stream_result:
278
+ rows_cnt += chunk.rows_read()
279
+
280
+ print(rows_cnt) # 200000
281
+
282
+ # Example 2: Manual iteration with fetch()
283
+ rows_cnt = 0
284
+ stream_result = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
285
+ while True:
286
+ chunk = stream_result.fetch()
287
+ if chunk is None:
288
+ break
289
+ rows_cnt += chunk.rows_read()
290
+
291
+ print(rows_cnt) # 200000
292
+
293
+ # Example 3: Early cancellation demo
294
+ rows_cnt = 0
295
+ stream_result = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
296
+ while True:
297
+ chunk = stream_result.fetch()
298
+ if chunk is None:
299
+ break
300
+ if rows_cnt > 0:
301
+ stream_result.cancel()
302
+ break
303
+ rows_cnt += chunk.rows_read()
304
+
305
+ print(rows_cnt) # 65409
306
+
307
+ sess.close()
308
+ ```
309
+
310
+ For more details, see [test_streaming_query.py](tests/test_streaming_query.py).
311
+ </details>
312
+
313
+
224
314
  <details>
225
315
  <summary><h4>🗂️ Python Table Engine</h4></summary>
226
316
 
@@ -233,10 +323,19 @@ df = pd.DataFrame(
233
323
  {
234
324
  "a": [1, 2, 3, 4, 5, 6],
235
325
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
326
+ "dict_col": [
327
+ {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
328
+ {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
329
+ {'id': 3, 'name': 'tom'},
330
+ {'id': 4, 'value': '100'},
331
+ {'id': 5, 'value': 101},
332
+ {'id': 6, 'value': 102},
333
+ ],
236
334
  }
237
335
  )
238
336
 
239
337
  chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b").show()
338
+ chdb.query("SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'").show()
240
339
  ```
241
340
 
242
341
  ### Query on Arrow Table
@@ -248,12 +347,19 @@ arrow_table = pa.table(
248
347
  {
249
348
  "a": [1, 2, 3, 4, 5, 6],
250
349
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
350
+ "dict_col": [
351
+ {'id': 1, 'value': 'tom'},
352
+ {'id': 2, 'value': 'jerry'},
353
+ {'id': 3, 'value': 'auxten'},
354
+ {'id': 4, 'value': 'tom'},
355
+ {'id': 5, 'value': 'jerry'},
356
+ {'id': 6, 'value': 'auxten'},
357
+ ],
251
358
  }
252
359
  )
253
360
 
254
- chdb.query(
255
- "SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b", "debug"
256
- ).show()
361
+ chdb.query("SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b").show()
362
+ chdb.query("SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'").show()
257
363
  ```
258
364
 
259
365
  ### Query on chdb.PyReader class instance
@@ -277,24 +383,79 @@ class myReader(chdb.PyReader):
277
383
  def read(self, col_names, count):
278
384
  print("Python func read", col_names, count, self.cursor)
279
385
  if self.cursor >= len(self.data["a"]):
386
+ self.cursor = 0
280
387
  return []
281
388
  block = [self.data[col] for col in col_names]
282
389
  self.cursor += len(block[0])
283
390
  return block
284
391
 
392
+ def get_schema(self):
393
+ return [
394
+ ("a", "int"),
395
+ ("b", "str"),
396
+ ("dict_col", "json")
397
+ ]
398
+
285
399
  reader = myReader(
286
400
  {
287
401
  "a": [1, 2, 3, 4, 5, 6],
288
402
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
403
+ "dict_col": [
404
+ {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
405
+ {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
406
+ {'id': 3, 'name': 'tom'},
407
+ {'id': 4, 'value': '100'},
408
+ {'id': 5, 'value': 101},
409
+ {'id': 6, 'value': 102}
410
+ ],
289
411
  }
290
412
  )
291
413
 
292
- chdb.query(
293
- "SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b"
294
- ).show()
414
+ chdb.query("SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b").show()
415
+ chdb.query("SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'").show()
295
416
  ```
296
417
 
297
- see also: [test_query_py.py](tests/test_query_py.py).
418
+ see also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
419
+
420
+ ### JSON Type Inference
421
+
422
+ chDB automatically converts Python dictionary objects to ClickHouse JSON types from these sources:
423
+
424
+ 1. **Pandas DataFrame**
425
+ - Columns with `object` dtype are sampled (default 10,000 rows) to detect JSON structures.
426
+ - Control sampling via SQL settings:
427
+ ```sql
428
+ SET pandas_analyze_sample = 10000 -- Default sampling
429
+ SET pandas_analyze_sample = 0 -- Force String type
430
+ SET pandas_analyze_sample = -1 -- Force JSON type
431
+ ```
432
+ - Columns are converted to `String` if sampling finds non-dictionary values.
433
+
434
+ 2. **Arrow Table**
435
+ - `struct` type columns are automatically mapped to JSON columns.
436
+ - Nested structures preserve type information.
437
+
438
+ 3. **chdb.PyReader**
439
+ - Implement custom schema mapping in `get_schema()`:
440
+ ```python
441
+ def get_schema(self):
442
+ return [
443
+ ("c1", "JSON"), # Explicit JSON mapping
444
+ ("c2", "String")
445
+ ]
446
+ ```
447
+ - Column types declared as "JSON" will bypass auto-detection.
448
+
449
+ When converting Python dictionary objects to JSON columns:
450
+
451
+ 1. **Nested Structures**
452
+ - Recursively process nested dictionaries, lists, tuples and NumPy arrays.
453
+
454
+ 2. **Primitive Types**
455
+ - Automatic type recognition for basic types such as integers, floats, strings, and booleans, and more.
456
+
457
+ 3. **Complex Objects**
458
+ - Non-primitive types will be converted to strings.
298
459
 
299
460
  ### Limitations
300
461
 
@@ -1,6 +1,6 @@
1
- chdb/__init__.py,sha256=Wb4a4CPgJ0j44kDuehkwITZV9Q6QOqyUmxA5PM6BbYk,3762
1
+ chdb/__init__.py,sha256=KsqKKRN2T2Rspn94XwwtR45fT5viF5h6KUJ7JIETo1w,3762
2
2
  chdb/__main__.py,sha256=xNNtDY38d973YM5dlxiIazcqqKhXJSpNb7JflyyrXGE,1185
3
- chdb/_chdb.cpython-39-darwin.so,sha256=HtRrZzYGC5GBZqY0SSkvKnd3PNz7GQjzMV8QUeiZMiA,422085256
3
+ chdb/_chdb.cpython-39-darwin.so,sha256=-IrhrMdI1OnciveQnl9jK7QzxTptb4biBLFIVa9brtc,422309072
4
4
  chdb/rwabc.py,sha256=tbiwCrXirfrfx46wCJxS64yvFe6pVWIPGdSuvrAL5Ys,2102
5
5
  chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
6
6
  chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
@@ -13,16 +13,16 @@ chdb/dbapi/times.py,sha256=_qXgDaYwsHntvpIKSKXp1rrYIgtq6Z9pLyLnO2XNoL0,360
13
13
  chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSAb60z8,370
14
14
  chdb/dbapi/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  chdb/session/__init__.py,sha256=fCUROZ5L1-92o2lcASiWJpFu-80-kDoSrNfouLEmLg8,50
16
- chdb/session/state.py,sha256=nx9KlqZyPTHAflToXCJVRBUSMjJFvyh6x2akP7Gc7h0,4360
16
+ chdb/session/state.py,sha256=UtObxVuyNgeqFkTXVHtmOknR90Pe1dEzbOpKFDBYOkg,4845
17
17
  chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
18
- chdb/state/sqlitelike.py,sha256=6Y57vnf7LnA0KnpByKQq7PkEkEEOKK-ExaHQLb1bedQ,10498
18
+ chdb/state/sqlitelike.py,sha256=v0xh9jWirHzhDVq26C2213LxfaDbRulSAhSHaTiZ24c,12283
19
19
  chdb/udf/__init__.py,sha256=qSMaPEre7w1pYz8uJ-iZtuu8wYOUNRcI_8UNuaOymGE,80
20
20
  chdb/udf/udf.py,sha256=z0A1RmyZrx55bykpvvS-LpVt1lMrQOexjvU5zxCdCSA,3935
21
21
  chdb/utils/__init__.py,sha256=tXRcwBRGW2YQNBZWV4Mitw5QlCu_qlSRCjllw15XHbs,171
22
22
  chdb/utils/trace.py,sha256=W-pvDoKlnzq6H_7FiWjr5_teN40UNE4E5--zbUrjOIc,2511
23
23
  chdb/utils/types.py,sha256=MGLFIjoDvu7Uc2Wy8EDY60jjue66HmMPxbhrujjrZxQ,7530
24
- chdb-3.1.2.dist-info/licenses/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
25
- chdb-3.1.2.dist-info/METADATA,sha256=cr0z9tTvWd_9_cckipHt8UBVZ0FBVOarA_6CTYwDxjA,19512
26
- chdb-3.1.2.dist-info/WHEEL,sha256=frmk8uyyRBGmQ11xC4icmh-xoy3HnCVGUtquUWdhsEY,109
27
- chdb-3.1.2.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
28
- chdb-3.1.2.dist-info/RECORD,,
24
+ chdb-3.3.0.dist-info/licenses/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
25
+ chdb-3.3.0.dist-info/METADATA,sha256=fRAJ-SRLZW1U7SpJeiGr2S7rKJyesMCGoQPpcPc6U1s,24690
26
+ chdb-3.3.0.dist-info/WHEEL,sha256=rhBzRBb5NVgz08EmkbnoZQLYki4NZ37t5Zn8NZLHrUo,109
27
+ chdb-3.3.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
28
+ chdb-3.3.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: false
4
4
  Tag: cp39-cp39-macosx_10_15_x86_64
5
5