chdb-3.1.2-cp38-cp38-macosx_10_15_x86_64.whl → chdb-3.3.0-cp38-cp38-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chdb might be problematic. See the registry's advisory page for more details.

chdb/__init__.py CHANGED
@@ -19,7 +19,7 @@ _process_result_format_funs = {
19
19
  # UDF script path will be f"{g_udf_path}/{func_name}.py"
20
20
  g_udf_path = ""
21
21
 
22
- chdb_version = ('3', '1', '2')
22
+ chdb_version = ('3', '3', '0')
23
23
  if sys.version_info[:2] >= (3, 7):
24
24
  # get the path of the current file
25
25
  current_path = os.path.dirname(os.path.abspath(__file__))
Binary file
chdb/session/state.py CHANGED
@@ -4,7 +4,7 @@ import warnings
4
4
 
5
5
  import chdb
6
6
  from ..state import sqlitelike as chdb_stateful
7
-
7
+ from ..state.sqlitelike import StreamingResult
8
8
 
9
9
  g_session = None
10
10
  g_session_path = None
@@ -120,3 +120,16 @@ Eg: conn = connect(f"db_path?verbose&log-level=test")"""
120
120
 
121
121
  # alias sql = query
122
122
  sql = query
123
+
124
+ def send_query(self, sql, fmt="CSV") -> StreamingResult:
125
+ """
126
+ Execute a streaming query.
127
+ """
128
+ if fmt == "Debug":
129
+ warnings.warn(
130
+ """Debug format is not supported in Session.query
131
+ Please try use parameters in connection string instead:
132
+ Eg: conn = connect(f"db_path?verbose&log-level=test")"""
133
+ )
134
+ fmt = "CSV"
135
+ return self._conn.send_query(sql, fmt)
chdb/state/sqlitelike.py CHANGED
@@ -40,6 +40,57 @@ def to_df(r):
40
40
  return t.to_pandas(use_threads=True)
41
41
 
42
42
 
43
+ class StreamingResult:
44
+ def __init__(self, c_result, conn, result_func):
45
+ self._result = c_result
46
+ self._result_func = result_func
47
+ self._conn = conn
48
+ self._exhausted = False
49
+
50
+ def fetch(self):
51
+ """Fetch next chunk of streaming results"""
52
+ if self._exhausted:
53
+ return None
54
+
55
+ try:
56
+ result = self._conn.streaming_fetch_result(self._result)
57
+ if result is None or result.rows_read() == 0:
58
+ self._exhausted = True
59
+ return None
60
+ return self._result_func(result)
61
+ except Exception as e:
62
+ self._exhausted = True
63
+ raise RuntimeError(f"Streaming query failed: {str(e)}") from e
64
+
65
+ def __iter__(self):
66
+ return self
67
+
68
+ def __next__(self):
69
+ if self._exhausted:
70
+ raise StopIteration
71
+
72
+ chunk = self.fetch()
73
+ if chunk is None:
74
+ self._exhausted = True
75
+ raise StopIteration
76
+
77
+ return chunk
78
+
79
+ def __enter__(self):
80
+ return self
81
+
82
+ def __exit__(self, exc_type, exc_val, exc_tb):
83
+ pass
84
+
85
+ def cancel(self):
86
+ self._exhausted = True
87
+
88
+ try:
89
+ self._conn.streaming_cancel_query(self._result)
90
+ except Exception as e:
91
+ raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e
92
+
93
+
43
94
  class Connection:
44
95
  def __init__(self, connection_string: str):
45
96
  # print("Connection", connection_string)
@@ -59,6 +110,15 @@ class Connection:
59
110
  result = self._conn.query(query, format)
60
111
  return result_func(result)
61
112
 
113
+ def send_query(self, query: str, format: str = "CSV") -> StreamingResult:
114
+ lower_output_format = format.lower()
115
+ result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
116
+ if lower_output_format in _arrow_format:
117
+ format = "Arrow"
118
+
119
+ c_stream_result = self._conn.send_query(query, format)
120
+ return StreamingResult(c_stream_result, self._conn, result_func)
121
+
62
122
  def close(self) -> None:
63
123
  # print("close")
64
124
  if self._cursor:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: chdb
3
- Version: 3.1.2
3
+ Version: 3.3.0
4
4
  Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
5
5
  Home-page: https://github.com/chdb-io/chdb
6
6
  Author: auxten
@@ -51,11 +51,11 @@ Requires-Dist: pandas >=2.0.0
51
51
 
52
52
 
53
53
  > chDB is an in-process SQL OLAP Engine powered by ClickHouse [^1]
54
- > For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
54
+ > For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
55
55
 
56
56
 
57
57
  ## Features
58
-
58
+
59
59
  * In-process SQL OLAP Engine, powered by ClickHouse
60
60
  * No need to install ClickHouse
61
61
  * Minimized data copy from C++ to Python with [python memoryview](https://docs.python.org/3/c-api/memoryview.html)
@@ -93,6 +93,44 @@ python3 -m chdb "SELECT 1,'abc'" Pretty
93
93
  ### Data Input
94
94
  The following methods are available to access on-disk and in-memory data formats:
95
95
 
96
+ <details>
97
+ <summary><h4>🗂️ Connection based API (recommended)</h4></summary>
98
+
99
+ ```python
100
+ import chdb
101
+
102
+ # Create a connection (in-memory by default)
103
+ conn = chdb.connect(":memory:")
104
+ # Or use file-based: conn = chdb.connect("test.db")
105
+
106
+ # Create a cursor
107
+ cur = conn.cursor()
108
+
109
+ # Execute queries
110
+ cur.execute("SELECT number, toString(number) as str FROM system.numbers LIMIT 3")
111
+
112
+ # Fetch data in different ways
113
+ print(cur.fetchone()) # Single row: (0, '0')
114
+ print(cur.fetchmany(2)) # Multiple rows: ((1, '1'), (2, '2'))
115
+
116
+ # Get column information
117
+ print(cur.column_names()) # ['number', 'str']
118
+ print(cur.column_types()) # ['UInt64', 'String']
119
+
120
+ # Use the cursor as an iterator
121
+ cur.execute("SELECT number FROM system.numbers LIMIT 3")
122
+ for row in cur:
123
+ print(row)
124
+
125
+ # Always close resources when done
126
+ cur.close()
127
+ conn.close()
128
+ ```
129
+
130
+ For more details, see [examples/connect.py](examples/connect.py).
131
+ </details>
132
+
133
+
96
134
  <details>
97
135
  <summary><h4>🗂️ Query On File</h4> (Parquet, CSV, JSON, Arrow, ORC and 60+)</summary>
98
136
 
@@ -108,7 +146,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
108
146
  # See more data type format in tests/format_output.py
109
147
  res = chdb.query('select * from file("data.parquet", Parquet)', 'JSON'); print(res)
110
148
  res = chdb.query('select * from file("data.csv", CSV)', 'CSV'); print(res)
111
- print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, elapsed {res.elapsed()} seconds")
149
+ print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, storage read {res.storage_rows_read()} rows, {res.storage_bytes_read()} bytes, elapsed {res.elapsed()} seconds")
112
150
  ```
113
151
 
114
152
  ### Pandas dataframe output
@@ -133,6 +171,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
133
171
  print(ret_tbl)
134
172
  # Query on the DataFrame Table
135
173
  print(ret_tbl.query('select b, sum(a) from __table__ group by b'))
174
+ # Pandas DataFrames are automatically registered as temporary tables in ClickHouse
175
+ chdb.query("SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c").show()
136
176
  ```
137
177
  </details>
138
178
 
@@ -218,6 +258,56 @@ see also: [test_udf.py](tests/test_udf.py).
218
258
  </details>
219
259
 
220
260
 
261
+ <details>
262
+ <summary><h4>🗂️ Streaming Query</h4></summary>
263
+
264
+ Process large datasets with constant memory usage through chunked streaming.
265
+
266
+ ```python
267
+ from chdb import session as chs
268
+
269
+ sess = chs.Session()
270
+
271
+ # Example 1: Basic example of using streaming query
272
+ rows_cnt = 0
273
+ with sess.send_query("SELECT * FROM numbers(200000)", "CSV") as stream_result:
274
+ for chunk in stream_result:
275
+ rows_cnt += chunk.rows_read()
276
+
277
+ print(rows_cnt) # 200000
278
+
279
+ # Example 2: Manual iteration with fetch()
280
+ rows_cnt = 0
281
+ stream_result = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
282
+ while True:
283
+ chunk = stream_result.fetch()
284
+ if chunk is None:
285
+ break
286
+ rows_cnt += chunk.rows_read()
287
+
288
+ print(rows_cnt) # 200000
289
+
290
+ # Example 3: Early cancellation demo
291
+ rows_cnt = 0
292
+ stream_result = sess.send_query("SELECT * FROM numbers(200000)", "CSV")
293
+ while True:
294
+ chunk = stream_result.fetch()
295
+ if chunk is None:
296
+ break
297
+ if rows_cnt > 0:
298
+ stream_result.cancel()
299
+ break
300
+ rows_cnt += chunk.rows_read()
301
+
302
+ print(rows_cnt) # 65409
303
+
304
+ sess.close()
305
+ ```
306
+
307
+ For more details, see [test_streaming_query.py](tests/test_streaming_query.py).
308
+ </details>
309
+
310
+
221
311
  <details>
222
312
  <summary><h4>🗂️ Python Table Engine</h4></summary>
223
313
 
@@ -230,10 +320,19 @@ df = pd.DataFrame(
230
320
  {
231
321
  "a": [1, 2, 3, 4, 5, 6],
232
322
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
323
+ "dict_col": [
324
+ {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
325
+ {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
326
+ {'id': 3, 'name': 'tom'},
327
+ {'id': 4, 'value': '100'},
328
+ {'id': 5, 'value': 101},
329
+ {'id': 6, 'value': 102},
330
+ ],
233
331
  }
234
332
  )
235
333
 
236
334
  chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b").show()
335
+ chdb.query("SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'").show()
237
336
  ```
238
337
 
239
338
  ### Query on Arrow Table
@@ -245,12 +344,19 @@ arrow_table = pa.table(
245
344
  {
246
345
  "a": [1, 2, 3, 4, 5, 6],
247
346
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
347
+ "dict_col": [
348
+ {'id': 1, 'value': 'tom'},
349
+ {'id': 2, 'value': 'jerry'},
350
+ {'id': 3, 'value': 'auxten'},
351
+ {'id': 4, 'value': 'tom'},
352
+ {'id': 5, 'value': 'jerry'},
353
+ {'id': 6, 'value': 'auxten'},
354
+ ],
248
355
  }
249
356
  )
250
357
 
251
- chdb.query(
252
- "SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b", "debug"
253
- ).show()
358
+ chdb.query("SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b").show()
359
+ chdb.query("SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'").show()
254
360
  ```
255
361
 
256
362
  ### Query on chdb.PyReader class instance
@@ -274,24 +380,79 @@ class myReader(chdb.PyReader):
274
380
  def read(self, col_names, count):
275
381
  print("Python func read", col_names, count, self.cursor)
276
382
  if self.cursor >= len(self.data["a"]):
383
+ self.cursor = 0
277
384
  return []
278
385
  block = [self.data[col] for col in col_names]
279
386
  self.cursor += len(block[0])
280
387
  return block
281
388
 
389
+ def get_schema(self):
390
+ return [
391
+ ("a", "int"),
392
+ ("b", "str"),
393
+ ("dict_col", "json")
394
+ ]
395
+
282
396
  reader = myReader(
283
397
  {
284
398
  "a": [1, 2, 3, 4, 5, 6],
285
399
  "b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
400
+ "dict_col": [
401
+ {'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
402
+ {'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
403
+ {'id': 3, 'name': 'tom'},
404
+ {'id': 4, 'value': '100'},
405
+ {'id': 5, 'value': 101},
406
+ {'id': 6, 'value': 102}
407
+ ],
286
408
  }
287
409
  )
288
410
 
289
- chdb.query(
290
- "SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b"
291
- ).show()
411
+ chdb.query("SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b").show()
412
+ chdb.query("SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'").show()
292
413
  ```
293
414
 
294
- see also: [test_query_py.py](tests/test_query_py.py).
415
+ see also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
416
+
417
+ ### JSON Type Inference
418
+
419
+ chDB automatically converts Python dictionary objects to ClickHouse JSON types from these sources:
420
+
421
+ 1. **Pandas DataFrame**
422
+ - Columns with `object` dtype are sampled (default 10,000 rows) to detect JSON structures.
423
+ - Control sampling via SQL settings:
424
+ ```sql
425
+ SET pandas_analyze_sample = 10000 -- Default sampling
426
+ SET pandas_analyze_sample = 0 -- Force String type
427
+ SET pandas_analyze_sample = -1 -- Force JSON type
428
+ ```
429
+ - Columns are converted to `String` if sampling finds non-dictionary values.
430
+
431
+ 2. **Arrow Table**
432
+ - `struct` type columns are automatically mapped to JSON columns.
433
+ - Nested structures preserve type information.
434
+
435
+ 3. **chdb.PyReader**
436
+ - Implement custom schema mapping in `get_schema()`:
437
+ ```python
438
+ def get_schema(self):
439
+ return [
440
+ ("c1", "JSON"), # Explicit JSON mapping
441
+ ("c2", "String")
442
+ ]
443
+ ```
444
+ - Column types declared as "JSON" will bypass auto-detection.
445
+
446
+ When converting Python dictionary objects to JSON columns:
447
+
448
+ 1. **Nested Structures**
449
+ - Recursively process nested dictionaries, lists, tuples and NumPy arrays.
450
+
451
+ 2. **Primitive Types**
452
+ - Automatic type recognition for basic types such as integers, floats, strings, and booleans, and more.
453
+
454
+ 3. **Complex Objects**
455
+ - Non-primitive types will be converted to strings.
295
456
 
296
457
  ### Limitations
297
458
 
@@ -1,6 +1,6 @@
1
- chdb/__init__.py,sha256=Wb4a4CPgJ0j44kDuehkwITZV9Q6QOqyUmxA5PM6BbYk,3762
1
+ chdb/__init__.py,sha256=KsqKKRN2T2Rspn94XwwtR45fT5viF5h6KUJ7JIETo1w,3762
2
2
  chdb/__main__.py,sha256=xNNtDY38d973YM5dlxiIazcqqKhXJSpNb7JflyyrXGE,1185
3
- chdb/_chdb.cpython-38-darwin.so,sha256=oaIxhT5WTYorisJ5eRe7YgR1z761jimyMYqgWLUWaMk,422084800
3
+ chdb/_chdb.cpython-38-darwin.so,sha256=kZkCo6Y0s9qWOY9D-JAFaKOTXb3T-Q14YVxetRhfRvg,422308456
4
4
  chdb/rwabc.py,sha256=tbiwCrXirfrfx46wCJxS64yvFe6pVWIPGdSuvrAL5Ys,2102
5
5
  chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
6
6
  chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
@@ -13,16 +13,16 @@ chdb/dbapi/times.py,sha256=_qXgDaYwsHntvpIKSKXp1rrYIgtq6Z9pLyLnO2XNoL0,360
13
13
  chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSAb60z8,370
14
14
  chdb/dbapi/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  chdb/session/__init__.py,sha256=fCUROZ5L1-92o2lcASiWJpFu-80-kDoSrNfouLEmLg8,50
16
- chdb/session/state.py,sha256=nx9KlqZyPTHAflToXCJVRBUSMjJFvyh6x2akP7Gc7h0,4360
16
+ chdb/session/state.py,sha256=UtObxVuyNgeqFkTXVHtmOknR90Pe1dEzbOpKFDBYOkg,4845
17
17
  chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
18
- chdb/state/sqlitelike.py,sha256=6Y57vnf7LnA0KnpByKQq7PkEkEEOKK-ExaHQLb1bedQ,10498
18
+ chdb/state/sqlitelike.py,sha256=v0xh9jWirHzhDVq26C2213LxfaDbRulSAhSHaTiZ24c,12283
19
19
  chdb/udf/__init__.py,sha256=qSMaPEre7w1pYz8uJ-iZtuu8wYOUNRcI_8UNuaOymGE,80
20
20
  chdb/udf/udf.py,sha256=z0A1RmyZrx55bykpvvS-LpVt1lMrQOexjvU5zxCdCSA,3935
21
21
  chdb/utils/__init__.py,sha256=tXRcwBRGW2YQNBZWV4Mitw5QlCu_qlSRCjllw15XHbs,171
22
22
  chdb/utils/trace.py,sha256=W-pvDoKlnzq6H_7FiWjr5_teN40UNE4E5--zbUrjOIc,2511
23
23
  chdb/utils/types.py,sha256=MGLFIjoDvu7Uc2Wy8EDY60jjue66HmMPxbhrujjrZxQ,7530
24
- chdb-3.1.2.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
25
- chdb-3.1.2.dist-info/METADATA,sha256=EzsMy4v_dK-KxFytLdwErIH-JjEg_TrkW0PkmeN5GBI,19444
26
- chdb-3.1.2.dist-info/WHEEL,sha256=WyxCboCiRNvHww1lxL6mr82B-yTKwQTJtmqg4JQiVzc,109
27
- chdb-3.1.2.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
28
- chdb-3.1.2.dist-info/RECORD,,
24
+ chdb-3.3.0.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
25
+ chdb-3.3.0.dist-info/METADATA,sha256=ILO7J8Bgj69p_fcUbm37--PinnHF3ZJA-1TewaHIoqo,24622
26
+ chdb-3.3.0.dist-info/WHEEL,sha256=WyxCboCiRNvHww1lxL6mr82B-yTKwQTJtmqg4JQiVzc,109
27
+ chdb-3.3.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
28
+ chdb-3.3.0.dist-info/RECORD,,
File without changes