chdb 3.7.1__cp38-abi3-musllinux_1_2_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,1101 @@
+ from typing import Optional, Any
+ from chdb import _chdb
+
+ # Try to import pyarrow; if it fails, raise ImportError with an install suggestion
+ try:
+     import pyarrow as pa  # noqa
+ except ImportError as e:
+     print(f"ImportError: {e}")
+     print('Please install pyarrow via "pip install pyarrow"')
+     raise ImportError("Failed to import pyarrow") from None
+
+
+ _arrow_format = {"dataframe", "arrowtable"}
+ _process_result_format_funs = {
+     "dataframe": lambda x: to_df(x),
+     "arrowtable": lambda x: to_arrowTable(x),
+ }
+
+
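A minimal sketch of how the dispatch map above is consulted (the lambdas defer name lookup, since to_df and to_arrowTable are defined further down; assumes pandas and pyarrow are installed):

    conn = connect(":memory:")
    df = conn.query("SELECT 1 AS x", "DataFrame")  # lowercased to "dataframe", post-processed by to_df
    raw = conn.query("SELECT 1 AS x", "CSV")       # no map entry, result returned as-is
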
+ # Return a PyArrow Table
+ def to_arrowTable(res):
+     """Convert query result to PyArrow Table.
+
+     This function converts chdb query results to a PyArrow Table format,
+     which provides efficient columnar data access and interoperability
+     with other data processing libraries.
+
+     Args:
+         res: Query result object from chdb containing Arrow format data
+
+     Returns:
+         pyarrow.Table: PyArrow Table containing the query results
+
+     Raises:
+         ImportError: If pyarrow or pandas packages are not installed
+
+     .. note::
+         This function requires both pyarrow and pandas to be installed.
+         Install them with: ``pip install pyarrow pandas``
+
+     .. warning::
+         Empty results return an empty PyArrow Table with no schema.
+
+     Examples:
+         >>> import chdb
+         >>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow")
+         >>> table = to_arrowTable(result)
+         >>> print(table.schema)
+         num: int64
+         text: string
+         >>> print(table.to_pandas())
+            num   text
+         0    1  hello
+     """
+     # Try to import pyarrow and pandas; if either fails, raise ImportError with an install suggestion
+     try:
+         import pyarrow as pa  # noqa
+         import pandas as pd  # noqa
+     except ImportError as e:
+         print(f"ImportError: {e}")
+         print('Please install pyarrow and pandas via "pip install pyarrow pandas"')
+         raise ImportError("Failed to import pyarrow or pandas") from None
+     if len(res) == 0:
+         return pa.Table.from_batches([], schema=pa.schema([]))
+
+     memview = res.get_memview()
+     return pa.RecordBatchFileReader(memview.view()).read_all()
+
+
+ # Return a Pandas DataFrame
+ def to_df(r):
+     """Convert query result to Pandas DataFrame.
+
+     This function converts chdb query results to a Pandas DataFrame format
+     by first converting to PyArrow Table and then to DataFrame. This provides
+     convenient data analysis capabilities with the Pandas API.
+
+     Args:
+         r: Query result object from chdb containing Arrow format data
+
+     Returns:
+         pandas.DataFrame: DataFrame containing the query results with
+             appropriate column names and data types
+
+     Raises:
+         ImportError: If pyarrow or pandas packages are not installed
+
+     .. note::
+         This function uses multi-threading for the Arrow to Pandas conversion
+         to improve performance on large datasets.
+
+     .. seealso::
+         :func:`to_arrowTable` - For PyArrow Table format conversion
+
+     Examples:
+         >>> import chdb
+         >>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow")
+         >>> df = to_df(result)
+         >>> print(df)
+            num   text
+         0    1  hello
+         >>> print(df.dtypes)
+         num      int64
+         text    object
+         dtype: object
+     """
+     t = to_arrowTable(r)
+     return t.to_pandas(use_threads=True)
+
+
+ class StreamingResult:
+     def __init__(self, c_result, conn, result_func, supports_record_batch):
+         self._result = c_result
+         self._result_func = result_func
+         self._conn = conn
+         self._exhausted = False
+         self._supports_record_batch = supports_record_batch
+
+     def fetch(self):
+         """Fetch the next chunk of streaming results.
+
+         This method retrieves the next available chunk of data from the streaming
+         query result. It automatically handles exhaustion detection and applies
+         the configured result transformation function.
+
+         Returns:
+             The next chunk of results in the format specified during query execution,
+             or None if no more data is available
+
+         Raises:
+             RuntimeError: If the streaming query encounters an error
+
+         .. note::
+             Once the stream is exhausted (returns None), subsequent calls will
+             continue to return None.
+
+         .. warning::
+             This method should be called sequentially. Concurrent calls may
+             result in undefined behavior.
+
+         Examples:
+             >>> conn = Connection(":memory:")
+             >>> stream = conn.send_query("SELECT number FROM numbers(100)")
+             >>> chunk = stream.fetch()
+             >>> while chunk is not None:
+             ...     print(f"Got chunk with {len(chunk)} bytes")
+             ...     chunk = stream.fetch()
+         """
+         if self._exhausted:
+             return None
+
+         try:
+             result = self._conn.streaming_fetch_result(self._result)
+             if result is None or result.rows_read() == 0:
+                 self._exhausted = True
+                 return None
+             return self._result_func(result)
+         except Exception as e:
+             self._exhausted = True
+             raise RuntimeError(f"Streaming query failed: {str(e)}") from e
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         if self._exhausted:
+             raise StopIteration
+
+         chunk = self.fetch()
+         if chunk is None:
+             self._exhausted = True
+             raise StopIteration
+
+         return chunk
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.cancel()
+
+     def close(self):
+         """Close the streaming result and cleanup resources.
+
+         This method is an alias for :meth:`cancel` and provides a more
+         intuitive interface for resource cleanup. It cancels the streaming
+         query and marks the result as exhausted.
+
+         .. seealso::
+             :meth:`cancel` - The underlying cancellation method
+
+         Examples:
+             >>> stream = conn.send_query("SELECT * FROM large_table")
+             >>> # Process some data
+             >>> chunk = stream.fetch()
+             >>> # Close when done
+             >>> stream.close()
+         """
+         self.cancel()
+
+     def cancel(self):
+         """Cancel the streaming query and cleanup resources.
+
+         This method cancels the streaming query on the server side and marks
+         the StreamingResult as exhausted. After calling this method, no more
+         data can be fetched from this result.
+
+         Raises:
+             RuntimeError: If cancellation fails on the server side
+
+         .. note::
+             This method is idempotent - calling it multiple times is safe
+             and will not cause errors.
+
+         .. warning::
+             Once cancelled, the streaming result cannot be resumed or reset.
+             You must create a new query to get fresh results.
+
+         Examples:
+             >>> stream = conn.send_query("SELECT * FROM huge_table")
+             >>> # Process the first few chunks
+             >>> for i, chunk in enumerate(stream):
+             ...     if i >= 5:  # Stop after 5 chunks
+             ...         stream.cancel()
+             ...         break
+             ...     process_chunk(chunk)
+         """
+         if not self._exhausted:
+             self._exhausted = True
+             try:
+                 self._conn.streaming_cancel_query(self._result)
+             except Exception as e:
+                 raise RuntimeError(f"Failed to cancel streaming query: {str(e)}") from e
+
+     def record_batch(self, rows_per_batch: int = 1000000) -> pa.RecordBatchReader:
+         """
+         Create a PyArrow RecordBatchReader from this StreamingResult.
+
+         This method requires that the StreamingResult was created with the Arrow
+         format. It wraps the streaming result with ChdbRecordBatchReader to provide
+         efficient batching with configurable batch sizes.
+
+         Args:
+             rows_per_batch (int): Number of rows per batch. Defaults to 1000000.
+
+         Returns:
+             pa.RecordBatchReader: PyArrow RecordBatchReader for efficient streaming
+
+         Raises:
+             ValueError: If the StreamingResult was not created with the Arrow format
+         """
+         if not self._supports_record_batch:
+             raise ValueError(
+                 "record_batch() can only be used with arrow format. "
+                 "Please use format='Arrow' when calling send_query."
+             )
+
+         chdb_reader = ChdbRecordBatchReader(self, rows_per_batch)
+         return pa.RecordBatchReader.from_batches(chdb_reader.schema(), chdb_reader)
+
+
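A minimal sketch of the record_batch() path described above, assuming pyarrow is installed; the query must be sent with format="Arrow":

    conn = connect(":memory:")
    stream = conn.send_query("SELECT number FROM numbers(1000000)", "Arrow")
    reader = stream.record_batch(rows_per_batch=100000)
    total = sum(batch.num_rows for batch in reader)  # each item is a pyarrow.RecordBatch
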
+ class ChdbRecordBatchReader:
+     """
+     A PyArrow RecordBatchReader wrapper for chdb StreamingResult.
+
+     This class provides an efficient way to read large result sets as PyArrow RecordBatches
+     with configurable batch sizes to optimize memory usage and performance.
+     """
+
+     def __init__(self, chdb_stream_result, batch_size_rows):
+         self._stream_result = chdb_stream_result
+         self._schema = None
+         self._closed = False
+         self._pending_batches = []
+         self._accumulator = []
+         self._batch_size_rows = batch_size_rows
+         self._current_rows = 0
+         self._first_batch = None
+         self._first_batch_consumed = True
+         self._schema = self.schema()
+
+     def schema(self):
+         if self._schema is None:
+             # Get the first chunk to determine the schema
+             chunk = self._stream_result.fetch()
+             if chunk is not None:
+                 arrow_bytes = chunk.bytes()
+                 reader = pa.RecordBatchFileReader(arrow_bytes)
+                 self._schema = reader.schema
+
+                 table = reader.read_all()
+                 if table.num_rows > 0:
+                     batches = table.to_batches()
+                     self._first_batch = batches[0]
+                     if len(batches) > 1:
+                         self._pending_batches = batches[1:]
+                     self._first_batch_consumed = False
+                 else:
+                     self._first_batch = None
+                     self._first_batch_consumed = True
+             else:
+                 self._schema = pa.schema([])
+                 self._first_batch = None
+                 self._first_batch_consumed = True
+                 self._closed = True
+         return self._schema
+
+     def read_next_batch(self):
+         if self._accumulator:
+             result = self._accumulator.pop(0)
+             return result
+
+         if self._closed:
+             raise StopIteration
+
+         while True:
+             batch = None
+
+             # 1. Return the first batch if not consumed yet
+             if not self._first_batch_consumed:
+                 self._first_batch_consumed = True
+                 batch = self._first_batch
+
+             # 2. Check pending batches from the current chunk
+             elif self._pending_batches:
+                 batch = self._pending_batches.pop(0)
+
+             # 3. Fetch a new chunk from the chdb stream
+             else:
+                 chunk = self._stream_result.fetch()
+                 if chunk is None:
+                     # No more data - return accumulated batches if any
+                     break
+
+                 arrow_bytes = chunk.bytes()
+                 if not arrow_bytes:
+                     continue
+
+                 reader = pa.RecordBatchFileReader(arrow_bytes)
+                 table = reader.read_all()
+
+                 if table.num_rows > 0:
+                     batches = table.to_batches()
+                     batch = batches[0]
+                     if len(batches) > 1:
+                         self._pending_batches = batches[1:]
+                 else:
+                     continue
+
+             # Process the batch if we got one
+             if batch is not None:
+                 self._accumulator.append(batch)
+                 self._current_rows += batch.num_rows
+
+                 # If we have accumulated enough rows, return a combined batch
+                 if self._current_rows >= self._batch_size_rows:
+                     if len(self._accumulator) == 1:
+                         result = self._accumulator.pop(0)
+                     else:
+                         if hasattr(pa, 'concat_batches'):
+                             result = pa.concat_batches(self._accumulator)
+                             self._accumulator = []
+                         else:
+                             # Older pyarrow without concat_batches: return batches one at a time
+                             result = self._accumulator.pop(0)
+
+                     self._current_rows = 0
+                     return result
+
+         # End of stream - return any accumulated batches
+         if self._accumulator:
+             if len(self._accumulator) == 1:
+                 result = self._accumulator.pop(0)
+             else:
+                 if hasattr(pa, 'concat_batches'):
+                     result = pa.concat_batches(self._accumulator)
+                     self._accumulator = []
+                 else:
+                     result = self._accumulator.pop(0)
+
+             self._current_rows = 0
+             self._closed = True
+             return result
+
+         # No more data
+         self._closed = True
+         raise StopIteration
+
+     def close(self):
+         if not self._closed:
+             self._stream_result.close()
+             self._closed = True
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         return self.read_next_batch()
+
+
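The coalescing step above uses pa.concat_batches when the installed pyarrow provides it (hence the hasattr guard); otherwise batches are returned one at a time. A standalone sketch of the merge the accumulator performs:

    import pyarrow as pa

    small = [pa.record_batch({"n": [1, 2]}), pa.record_batch({"n": [3, 4, 5]})]
    merged = pa.concat_batches(small)  # one RecordBatch with num_rows == 5
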
+ class Connection:
+     def __init__(self, connection_string: str):
+         self._cursor: Optional[Cursor] = None
+         self._conn = _chdb.connect(connection_string)
+
+     def cursor(self) -> "Cursor":
+         """Create a cursor object for executing queries.
+
+         This method creates a database cursor that provides the standard
+         DB-API 2.0 interface for executing queries and fetching results.
+         The cursor allows for fine-grained control over query execution
+         and result retrieval.
+
+         Returns:
+             Cursor: A cursor object for database operations
+
+         .. note::
+             Creating a new cursor will replace any existing cursor associated
+             with this connection. Only one cursor per connection is supported.
+
+         Examples:
+             >>> conn = connect(":memory:")
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("CREATE TABLE test (id INT, name String)")
+             >>> cursor.execute("INSERT INTO test VALUES (1, 'Alice')")
+             >>> cursor.execute("SELECT * FROM test")
+             >>> rows = cursor.fetchall()
+             >>> print(rows)
+             ((1, 'Alice'),)
+
+         .. seealso::
+             :class:`Cursor` - Database cursor implementation
+         """
+         self._cursor = Cursor(self._conn)
+         return self._cursor
+
+     def query(self, query: str, format: str = "CSV") -> Any:
+         """Execute a SQL query and return the complete results.
+
+         This method executes a SQL query synchronously and returns the complete
+         result set. It supports various output formats and automatically applies
+         format-specific post-processing.
+
+         Args:
+             query (str): SQL query string to execute
+             format (str, optional): Output format for results. Defaults to "CSV".
+                 Supported formats:
+
+                 - "CSV" - Comma-separated values (string)
+                 - "JSON" - JSON format (string)
+                 - "Arrow" - Apache Arrow format (bytes)
+                 - "dataframe" - Pandas DataFrame (requires pandas)
+                 - "arrowtable" - PyArrow Table (requires pyarrow)
+
+         Returns:
+             Query results in the specified format. Type depends on format:
+
+             - String formats return str
+             - Arrow format returns bytes
+             - dataframe format returns pandas.DataFrame
+             - arrowtable format returns pyarrow.Table
+
+         Raises:
+             RuntimeError: If query execution fails
+             ImportError: If required packages for the format are not installed
+
+         .. warning::
+             This method loads the entire result set into memory. For large
+             results, consider using :meth:`send_query` for streaming.
+
+         Examples:
+             >>> conn = connect(":memory:")
+             >>>
+             >>> # Basic CSV query
+             >>> result = conn.query("SELECT 1 as num, 'hello' as text")
+             >>> print(result)
+             num,text
+             1,hello
+
+             >>> # DataFrame format
+             >>> df = conn.query("SELECT number FROM numbers(5)", "dataframe")
+             >>> print(df)
+                number
+             0       0
+             1       1
+             2       2
+             3       3
+             4       4
+
+         .. seealso::
+             :meth:`send_query` - For streaming query execution
+         """
+         lower_output_format = format.lower()
+         result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
+         if lower_output_format in _arrow_format:
+             format = "Arrow"
+
+         result = self._conn.query(query, format)
+         return result_func(result)
+
+     def send_query(self, query: str, format: str = "CSV") -> StreamingResult:
+         """Execute a SQL query and return a streaming result iterator.
+
+         This method executes a SQL query and returns a StreamingResult object
+         that allows you to iterate over the results without loading everything
+         into memory at once. This is ideal for processing large result sets.
+
+         Args:
+             query (str): SQL query string to execute
+             format (str, optional): Output format for results. Defaults to "CSV".
+                 Supported formats:
+
+                 - "CSV" - Comma-separated values
+                 - "JSON" - JSON format
+                 - "Arrow" - Apache Arrow format (enables the record_batch() method)
+                 - "dataframe" - Pandas DataFrame chunks
+                 - "arrowtable" - PyArrow Table chunks
+
+         Returns:
+             StreamingResult: A streaming iterator for query results that supports:
+
+             - Iterator protocol (for loops)
+             - Context manager protocol (with statements)
+             - Manual fetching with the fetch() method
+             - PyArrow RecordBatch streaming (Arrow format only)
+
+         Raises:
+             RuntimeError: If query execution fails
+             ImportError: If required packages for the format are not installed
+
+         .. note::
+             Only the "Arrow" format supports the record_batch() method on the
+             returned StreamingResult.
+
+         Examples:
+             >>> conn = connect(":memory:")
+             >>>
+             >>> # Basic streaming
+             >>> stream = conn.send_query("SELECT number FROM numbers(1000)")
+             >>> for chunk in stream:
+             ...     print(f"Processing chunk: {len(chunk)} bytes")
+
+             >>> # Using context manager for cleanup
+             >>> with conn.send_query("SELECT * FROM large_table") as stream:
+             ...     chunk = stream.fetch()
+             ...     while chunk:
+             ...         process_data(chunk)
+             ...         chunk = stream.fetch()
+
+             >>> # Arrow format with RecordBatch streaming
+             >>> stream = conn.send_query("SELECT * FROM data", "Arrow")
+             >>> reader = stream.record_batch(rows_per_batch=10000)
+             >>> for batch in reader:
+             ...     print(f"Batch shape: {batch.num_rows} x {batch.num_columns}")
+
+         .. seealso::
+             :meth:`query` - For non-streaming query execution
+             :class:`StreamingResult` - Streaming result iterator
+         """
+         lower_output_format = format.lower()
+         supports_record_batch = lower_output_format == "arrow"
+         result_func = _process_result_format_funs.get(lower_output_format, lambda x: x)
+         if lower_output_format in _arrow_format:
+             format = "Arrow"
+
+         c_stream_result = self._conn.send_query(query, format)
+         return StreamingResult(c_stream_result, self._conn, result_func, supports_record_batch)
+
+     def __enter__(self):
+         """Enter the context manager and return the connection.
+
+         Returns:
+             Connection: The connection object itself
+         """
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Exit the context manager and close the connection.
+
+         Args:
+             exc_type: Exception type if an exception was raised
+             exc_val: Exception value if an exception was raised
+             exc_tb: Exception traceback if an exception was raised
+
+         Returns:
+             False to propagate any exception that occurred
+         """
+         self.close()
+         return False
+
+     def close(self) -> None:
+         """Close the connection and cleanup resources.
+
+         This method closes the database connection and cleans up any associated
+         resources, including active cursors. After calling this method, the
+         connection becomes invalid and cannot be used for further operations.
+
+         .. note::
+             This method is idempotent - calling it multiple times is safe.
+
+         .. warning::
+             Any ongoing streaming queries will be cancelled when the connection
+             is closed. Ensure all important data is processed before closing.
+
+         Examples:
+             >>> conn = connect("test.db")
+             >>> # Use connection for queries
+             >>> conn.query("CREATE TABLE test (id INT)")
+             >>> # Close when done
+             >>> conn.close()
+
+             >>> # Using with context manager (automatic cleanup)
+             >>> with connect("test.db") as conn:
+             ...     conn.query("SELECT 1")
+             ...     # Connection automatically closed
+         """
+         if self._cursor:
+             self._cursor.close()
+         self._conn.close()
+
+
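A short sketch of the cleanup semantics noted above: cancelling a stream early is explicit, while leaving the with-block closes the connection (and with it any stream still open):

    with connect(":memory:") as conn:
        stream = conn.send_query("SELECT number FROM numbers(10000000)", "CSV")
        for i, chunk in enumerate(stream):
            if i >= 3:
                stream.cancel()  # idempotent; stream.close() is an alias
                break
    # the connection is closed here
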
+ class Cursor:
+     def __init__(self, connection):
+         self._conn = connection
+         self._cursor = self._conn.cursor()
+         self._current_table: Optional[list] = None
+         self._current_row: int = 0
+
+     def execute(self, query: str) -> None:
+         """Execute a SQL query and prepare results for fetching.
+
+         This method executes a SQL query and prepares the results for retrieval
+         using the fetch methods. It handles the parsing of result data and
+         automatic type conversion for ClickHouse data types.
+
+         Args:
+             query (str): SQL query string to execute
+
+         Raises:
+             Exception: If query execution fails or result parsing fails
+
+         .. note::
+             This method follows DB-API 2.0 specifications for cursor.execute().
+             After execution, use fetchone(), fetchmany(), or fetchall() to
+             retrieve results.
+
+         .. note::
+             The method automatically converts ClickHouse data types to appropriate
+             Python types:
+
+             - Int/UInt types → int
+             - Float types → float
+             - String/FixedString → str
+             - DateTime → datetime.datetime
+             - Date → datetime.date
+             - Bool → bool
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>>
+             >>> # Execute DDL
+             >>> cursor.execute("CREATE TABLE test (id INT, name String)")
+             >>>
+             >>> # Execute DML
+             >>> cursor.execute("INSERT INTO test VALUES (1, 'Alice')")
+             >>>
+             >>> # Execute SELECT and fetch results
+             >>> cursor.execute("SELECT * FROM test")
+             >>> rows = cursor.fetchall()
+             >>> print(rows)
+             ((1, 'Alice'),)
+
+         .. seealso::
+             :meth:`fetchone` - Fetch single row
+             :meth:`fetchmany` - Fetch multiple rows
+             :meth:`fetchall` - Fetch all remaining rows
+         """
+         self._cursor.execute(query)
+         result_mv = self._cursor.get_memview()
+         if self._cursor.has_error():
+             raise Exception(self._cursor.error_message())
+         if self._cursor.data_size() == 0:
+             self._current_table = None
+             self._current_row = 0
+             self._column_names = []
+             self._column_types = []
+             return
+
+         # Parse the JSON payload
+         json_data = result_mv.tobytes().decode("utf-8")
+         import json
+
+         try:
+             # First line contains column names
+             # Second line contains column types
+             # Following lines contain data
+             lines = json_data.strip().split("\n")
+             if len(lines) < 2:
+                 self._current_table = None
+                 self._current_row = 0
+                 self._column_names = []
+                 self._column_types = []
+                 return
+
+             self._column_names = json.loads(lines[0])
+             self._column_types = json.loads(lines[1])
+
+             # Convert data rows
+             rows = []
+             for line in lines[2:]:
+                 if not line.strip():
+                     continue
+                 row_data = json.loads(line)
+                 converted_row = []
+                 for val, type_info in zip(row_data, self._column_types):
+                     # Handle NULL values first
+                     if val is None:
+                         converted_row.append(None)
+                         continue
+
+                     # Basic type conversion
+                     try:
+                         if type_info.startswith("Int") or type_info.startswith("UInt"):
+                             converted_row.append(int(val))
+                         elif type_info.startswith("Float"):
+                             converted_row.append(float(val))
+                         elif type_info == "Bool":
+                             converted_row.append(bool(val))
+                         elif type_info == "String" or type_info == "FixedString":
+                             converted_row.append(str(val))
+                         elif type_info.startswith("DateTime"):
+                             from datetime import datetime
+
+                             # Check if the value is numeric (timestamp)
+                             val_str = str(val)
+                             if val_str.replace(".", "").isdigit():
+                                 converted_row.append(datetime.fromtimestamp(float(val)))
+                             else:
+                                 # Handle datetime string formats
+                                 if "." in val_str:  # Has microseconds
+                                     converted_row.append(
+                                         datetime.strptime(val_str, "%Y-%m-%d %H:%M:%S.%f")
+                                     )
+                                 else:  # No microseconds
+                                     converted_row.append(
+                                         datetime.strptime(val_str, "%Y-%m-%d %H:%M:%S")
+                                     )
+                         # Checked after DateTime, which also starts with "Date"
+                         elif type_info.startswith("Date"):
+                             from datetime import date, datetime
+
+                             # Check if the value is numeric (days since epoch)
+                             val_str = str(val)
+                             if val_str.isdigit():
+                                 converted_row.append(
+                                     date.fromtimestamp(float(val) * 86400)
+                                 )
+                             else:
+                                 # Handle date string format
+                                 converted_row.append(
+                                     datetime.strptime(val_str, "%Y-%m-%d").date()
+                                 )
+                         else:
+                             # For unsupported types, keep as string
+                             converted_row.append(str(val))
+                     except (ValueError, TypeError):
+                         # If conversion fails, keep the original value as a string
+                         converted_row.append(str(val))
+                 rows.append(tuple(converted_row))
+
+             self._current_table = rows
+             self._current_row = 0
+
+         except json.JSONDecodeError as e:
+             raise Exception(f"Failed to parse JSON data: {e}") from e
+
+     def commit(self) -> None:
+         """Commit any pending transaction.
+
+         This method commits any pending database transaction. In ClickHouse,
+         most operations are auto-committed, but this method is provided for
+         DB-API 2.0 compatibility.
+
+         .. note::
+             ClickHouse typically auto-commits operations, so explicit commits
+             are usually not necessary. This method is provided for compatibility
+             with the standard DB-API 2.0 workflow.
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("INSERT INTO test VALUES (1, 'data')")
+             >>> cursor.commit()
+         """
+         self._cursor.commit()
+
+     def fetchone(self) -> Optional[tuple]:
+         """Fetch the next row from the query result.
+
+         This method retrieves the next available row from the current query
+         result set. It returns a tuple containing the column values with
+         appropriate Python type conversion applied.
+
+         Returns:
+             Optional[tuple]: Next row as a tuple of column values, or None
+             if no more rows are available
+
+         .. note::
+             This method follows DB-API 2.0 specifications. Column values are
+             automatically converted to appropriate Python types based on
+             ClickHouse column types.
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT id, name FROM users")
+             >>> row = cursor.fetchone()
+             >>> while row is not None:
+             ...     user_id, user_name = row
+             ...     print(f"User {user_id}: {user_name}")
+             ...     row = cursor.fetchone()
+
+         .. seealso::
+             :meth:`fetchmany` - Fetch multiple rows
+             :meth:`fetchall` - Fetch all remaining rows
+         """
+         if not self._current_table or self._current_row >= len(self._current_table):
+             return None
+
+         # self._current_table is a list of row tuples
+         row = self._current_table[self._current_row]
+         self._current_row += 1
+         return row
+
+     def fetchmany(self, size: int = 1) -> tuple:
+         """Fetch multiple rows from the query result.
+
+         This method retrieves up to 'size' rows from the current query result
+         set. It returns a tuple of row tuples, with each row containing column
+         values with appropriate Python type conversion.
+
+         Args:
+             size (int, optional): Maximum number of rows to fetch. Defaults to 1.
+
+         Returns:
+             tuple: Tuple containing up to 'size' row tuples. May contain fewer
+             rows if the result set is exhausted.
+
+         .. note::
+             This method follows DB-API 2.0 specifications. It will return fewer
+             than 'size' rows if the result set is exhausted.
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT * FROM large_table")
+             >>>
+             >>> # Process results in batches
+             >>> while True:
+             ...     batch = cursor.fetchmany(100)  # Fetch 100 rows at a time
+             ...     if not batch:
+             ...         break
+             ...     process_batch(batch)
+
+         .. seealso::
+             :meth:`fetchone` - Fetch single row
+             :meth:`fetchall` - Fetch all remaining rows
+         """
+         if not self._current_table:
+             return tuple()
+
+         rows = []
+         for _ in range(size):
+             if (row := self.fetchone()) is None:
+                 break
+             rows.append(row)
+         return tuple(rows)
+
+     def fetchall(self) -> tuple:
+         """Fetch all remaining rows from the query result.
+
+         This method retrieves all remaining rows from the current query result
+         set, starting from the current cursor position. It returns a tuple of
+         row tuples with appropriate Python type conversion applied.
+
+         Returns:
+             tuple: Tuple containing all remaining row tuples from the result set.
+             Returns an empty tuple if no rows are available.
+
+         .. warning::
+             This method loads all remaining rows into memory at once. For large
+             result sets, consider using :meth:`fetchmany` to process results
+             in batches.
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT id, name FROM users")
+             >>> all_users = cursor.fetchall()
+             >>> for user_id, user_name in all_users:
+             ...     print(f"User {user_id}: {user_name}")
+
+         .. seealso::
+             :meth:`fetchone` - Fetch single row
+             :meth:`fetchmany` - Fetch multiple rows in batches
+         """
+         if not self._current_table:
+             return tuple()
+
+         remaining_rows = []
+         while (row := self.fetchone()) is not None:
+             remaining_rows.append(row)
+         return tuple(remaining_rows)
+
+     def close(self) -> None:
+         """Close the cursor and cleanup resources.
+
+         This method closes the cursor and cleans up any associated resources.
+         After calling this method, the cursor becomes invalid and cannot be
+         used for further operations.
+
+         .. note::
+             This method is idempotent - calling it multiple times is safe.
+             The cursor is also automatically closed when the connection is closed.
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT 1")
+             >>> result = cursor.fetchone()
+             >>> cursor.close()  # Cleanup cursor resources
+         """
+         self._cursor.close()
+
+     def __iter__(self):
+         return self
+
+     def __next__(self) -> tuple:
+         row = self.fetchone()
+         if row is None:
+             raise StopIteration
+         return row
+
+     def column_names(self) -> list:
+         """Return a list of column names from the last executed query.
+
+         This method returns the column names from the most recently executed
+         SELECT query. The names are returned in the same order as they appear
+         in the result set.
+
+         Returns:
+             list: List of column name strings, or an empty list if no query
+             has been executed or the query returned no columns
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT id, name, email FROM users LIMIT 1")
+             >>> print(cursor.column_names())
+             ['id', 'name', 'email']
+
+         .. seealso::
+             :meth:`column_types` - Get column type information
+             :attr:`description` - DB-API 2.0 column description
+         """
+         return self._column_names if hasattr(self, "_column_names") else []
+
+     def column_types(self) -> list:
+         """Return a list of column types from the last executed query.
+
+         This method returns the ClickHouse column type names from the most
+         recently executed SELECT query. The types are returned in the same
+         order as they appear in the result set.
+
+         Returns:
+             list: List of ClickHouse type name strings, or an empty list if no
+             query has been executed or the query returned no columns
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT toInt32(1), toString('hello')")
+             >>> print(cursor.column_types())
+             ['Int32', 'String']
+
+         .. seealso::
+             :meth:`column_names` - Get column name information
+             :attr:`description` - DB-API 2.0 column description
+         """
+         return self._column_types if hasattr(self, "_column_types") else []
+
+     @property
+     def description(self) -> list:
+         """Return column description as per the DB-API 2.0 specification.
+
+         This property returns a list of 7-item tuples describing each column
+         in the result set of the last executed SELECT query. Each tuple contains:
+         (name, type_code, display_size, internal_size, precision, scale, null_ok)
+
+         Currently, only name and type_code are provided; the other fields are
+         set to None.
+
+         Returns:
+             list: List of 7-tuples describing each column, or an empty list if no
+             SELECT query has been executed
+
+         .. note::
+             This follows the DB-API 2.0 specification for cursor.description.
+             Only the first two elements (name and type_code) contain meaningful
+             data in this implementation.
+
+         Examples:
+             >>> cursor = conn.cursor()
+             >>> cursor.execute("SELECT id, name FROM users LIMIT 1")
+             >>> for desc in cursor.description:
+             ...     print(f"Column: {desc[0]}, Type: {desc[1]}")
+             Column: id, Type: Int32
+             Column: name, Type: String
+
+         .. seealso::
+             :meth:`column_names` - Get just column names
+             :meth:`column_types` - Get just column types
+         """
+         if not hasattr(self, "_column_names") or not self._column_names:
+             return []
+
+         return [
+             (name, type_info, None, None, None, None, None)
+             for name, type_info in zip(self._column_names, self._column_types)
+         ]
+
+
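For reference, a sketch of the line-oriented payload that Cursor.execute() parses, matching the comments in the code above (first line column names, second line column types, then one JSON array per row; the exact bytes are produced by the engine):

    ["id", "name"]
    ["Int32", "String"]
    [1, "Alice"]
    [2, "Bob"]
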
+ def connect(connection_string: str = ":memory:") -> Connection:
+     """Create a connection to the chDB background server.
+
+     This function establishes a connection to the chDB (ClickHouse) database engine.
+     Only one open connection is allowed per process. Multiple calls with the same
+     connection string will return the same connection object.
+
+     Args:
+         connection_string (str, optional): Database connection string. Defaults to ":memory:".
+             Supported connection string formats:
+
+             **Basic formats:**
+
+             - ":memory:" - In-memory database (default)
+             - "test.db" - Relative path database file
+             - "file:test.db" - Same as relative path
+             - "/path/to/test.db" - Absolute path database file
+             - "file:/path/to/test.db" - Same as absolute path
+
+             **With query parameters:**
+
+             - "file:test.db?param1=value1&param2=value2" - Relative path with params
+             - "file::memory:?verbose&log-level=test" - In-memory with params
+             - "///path/to/test.db?param1=value1&param2=value2" - Absolute path with params
+
+             **Query parameter handling:**
+
+             Query parameters are passed to the ClickHouse engine as startup arguments.
+             Special parameter handling:
+
+             - "mode=ro" becomes "--readonly=1" (read-only mode)
+             - "verbose" enables verbose logging
+             - "log-level=test" sets the logging level
+
+             For the complete parameter list, see ``clickhouse local --help --verbose``
+
+     Returns:
+         Connection: Database connection object that supports:
+
+         - Creating cursors with :meth:`Connection.cursor`
+         - Direct queries with :meth:`Connection.query`
+         - Streaming queries with :meth:`Connection.send_query`
+         - Context manager protocol for automatic cleanup
+
+     Raises:
+         RuntimeError: If the connection to the database fails
+
+     .. warning::
+         Only one connection per process is supported. Creating a new connection
+         will close any existing connection.
+
+     Examples:
+         >>> # In-memory database
+         >>> conn = connect()
+         >>> conn = connect(":memory:")
+         >>>
+         >>> # File-based database
+         >>> conn = connect("my_data.db")
+         >>> conn = connect("/path/to/data.db")
+         >>>
+         >>> # With parameters
+         >>> conn = connect("data.db?mode=ro")  # Read-only mode
+         >>> conn = connect(":memory:?verbose&log-level=debug")  # Debug logging
+         >>>
+         >>> # Using context manager for automatic cleanup
+         >>> with connect("data.db") as conn:
+         ...     result = conn.query("SELECT 1")
+         ...     print(result)
+         >>> # Connection automatically closed
+
+     .. seealso::
+         :class:`Connection` - Database connection class
+         :class:`Cursor` - Database cursor for DB-API 2.0 operations
+     """
+     return Connection(connection_string)