chdb 3.2.0__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 3.4.0__cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chdb might be problematic. Click here for more details.
- chdb/__init__.py +1 -1
- chdb/_chdb.cpython-38-aarch64-linux-gnu.so +0 -0
- chdb/session/state.py +3 -15
- {chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/METADATA +84 -11
- {chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/RECORD +8 -8
- {chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/LICENSE.txt +0 -0
- {chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/WHEEL +0 -0
- {chdb-3.2.0.dist-info → chdb-3.4.0.dist-info}/top_level.txt +0 -0
chdb/__init__.py
CHANGED
|
@@ -19,7 +19,7 @@ _process_result_format_funs = {
|
|
|
19
19
|
# UDF script path will be f"{g_udf_path}/{func_name}.py"
|
|
20
20
|
g_udf_path = ""
|
|
21
21
|
|
|
22
|
-
chdb_version = ('3', '2', '0')
|
|
22
|
+
chdb_version = ('3', '4', '0')
|
|
23
23
|
if sys.version_info[:2] >= (3, 7):
|
|
24
24
|
# get the path of the current file
|
|
25
25
|
current_path = os.path.dirname(os.path.abspath(__file__))
|
|
Binary file
|
chdb/session/state.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import tempfile
|
|
2
|
-
import shutil
|
|
3
1
|
import warnings
|
|
4
2
|
|
|
5
3
|
import chdb
|
|
@@ -51,11 +49,9 @@ class Session:
|
|
|
51
49
|
)
|
|
52
50
|
g_session.close()
|
|
53
51
|
g_session_path = None
|
|
54
|
-
if path is None
|
|
55
|
-
self._cleanup = True
|
|
56
|
-
self._path = tempfile.mkdtemp()
|
|
52
|
+
if path is None:
|
|
53
|
+
self._path = ":memory:"
|
|
57
54
|
else:
|
|
58
|
-
self._cleanup = False
|
|
59
55
|
self._path = path
|
|
60
56
|
if chdb.g_udf_path != "":
|
|
61
57
|
self._udf_path = chdb.g_udf_path
|
|
@@ -84,8 +80,6 @@ class Session:
|
|
|
84
80
|
self.close()
|
|
85
81
|
|
|
86
82
|
def close(self):
|
|
87
|
-
if self._cleanup:
|
|
88
|
-
self.cleanup()
|
|
89
83
|
if self._conn is not None:
|
|
90
84
|
self._conn.close()
|
|
91
85
|
self._conn = None
|
|
@@ -95,13 +89,7 @@ class Session:
|
|
|
95
89
|
|
|
96
90
|
def cleanup(self):
|
|
97
91
|
try:
|
|
98
|
-
|
|
99
|
-
self._conn.close()
|
|
100
|
-
self._conn = None
|
|
101
|
-
shutil.rmtree(self._path)
|
|
102
|
-
global g_session, g_session_path
|
|
103
|
-
g_session = None
|
|
104
|
-
g_session_path = None
|
|
92
|
+
self.close()
|
|
105
93
|
except: # noqa
|
|
106
94
|
pass
|
|
107
95
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: chdb
|
|
3
|
-
Version: 3.2.0
|
|
3
|
+
Version: 3.4.0
|
|
4
4
|
Summary: chDB is an in-process SQL OLAP Engine powered by ClickHouse
|
|
5
5
|
Home-page: https://github.com/chdb-io/chdb
|
|
6
6
|
Author: auxten
|
|
@@ -51,11 +51,11 @@ Requires-Dist: pandas >=2.0.0
|
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
> chDB is an in-process SQL OLAP Engine powered by ClickHouse [^1]
|
|
54
|
-
> For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
|
|
54
|
+
> For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
|
|
55
55
|
|
|
56
56
|
|
|
57
57
|
## Features
|
|
58
|
-
|
|
58
|
+
|
|
59
59
|
* In-process SQL OLAP Engine, powered by ClickHouse
|
|
60
60
|
* No need to install ClickHouse
|
|
61
61
|
* Minimized data copy from C++ to Python with [python memoryview](https://docs.python.org/3/c-api/memoryview.html)
|
|
@@ -146,7 +146,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
|
|
|
146
146
|
# See more data type format in tests/format_output.py
|
|
147
147
|
res = chdb.query('select * from file("data.parquet", Parquet)', 'JSON'); print(res)
|
|
148
148
|
res = chdb.query('select * from file("data.csv", CSV)', 'CSV'); print(res)
|
|
149
|
-
print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, elapsed {res.elapsed()} seconds")
|
|
149
|
+
print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, storage read {res.storage_rows_read()} rows, {res.storage_bytes_read()} bytes, elapsed {res.elapsed()} seconds")
|
|
150
150
|
```
|
|
151
151
|
|
|
152
152
|
### Pandas dataframe output
|
|
@@ -171,6 +171,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
|
|
|
171
171
|
print(ret_tbl)
|
|
172
172
|
# Query on the DataFrame Table
|
|
173
173
|
print(ret_tbl.query('select b, sum(a) from __table__ group by b'))
|
|
174
|
+
# Pandas DataFrames are automatically registered as temporary tables in ClickHouse
|
|
175
|
+
chdb.query("SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c").show()
|
|
174
176
|
```
|
|
175
177
|
</details>
|
|
176
178
|
|
|
@@ -318,10 +320,19 @@ df = pd.DataFrame(
|
|
|
318
320
|
{
|
|
319
321
|
"a": [1, 2, 3, 4, 5, 6],
|
|
320
322
|
"b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
|
|
323
|
+
"dict_col": [
|
|
324
|
+
{'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
|
|
325
|
+
{'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
|
|
326
|
+
{'id': 3, 'name': 'tom'},
|
|
327
|
+
{'id': 4, 'value': '100'},
|
|
328
|
+
{'id': 5, 'value': 101},
|
|
329
|
+
{'id': 6, 'value': 102},
|
|
330
|
+
],
|
|
321
331
|
}
|
|
322
332
|
)
|
|
323
333
|
|
|
324
334
|
chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b").show()
|
|
335
|
+
chdb.query("SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'").show()
|
|
325
336
|
```
|
|
326
337
|
|
|
327
338
|
### Query on Arrow Table
|
|
@@ -333,12 +344,19 @@ arrow_table = pa.table(
|
|
|
333
344
|
{
|
|
334
345
|
"a": [1, 2, 3, 4, 5, 6],
|
|
335
346
|
"b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
|
|
347
|
+
"dict_col": [
|
|
348
|
+
{'id': 1, 'value': 'tom'},
|
|
349
|
+
{'id': 2, 'value': 'jerry'},
|
|
350
|
+
{'id': 3, 'value': 'auxten'},
|
|
351
|
+
{'id': 4, 'value': 'tom'},
|
|
352
|
+
{'id': 5, 'value': 'jerry'},
|
|
353
|
+
{'id': 6, 'value': 'auxten'},
|
|
354
|
+
],
|
|
336
355
|
}
|
|
337
356
|
)
|
|
338
357
|
|
|
339
|
-
chdb.query(
|
|
340
|
-
|
|
341
|
-
).show()
|
|
358
|
+
chdb.query("SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b").show()
|
|
359
|
+
chdb.query("SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'").show()
|
|
342
360
|
```
|
|
343
361
|
|
|
344
362
|
### Query on chdb.PyReader class instance
|
|
@@ -362,24 +380,79 @@ class myReader(chdb.PyReader):
|
|
|
362
380
|
def read(self, col_names, count):
|
|
363
381
|
print("Python func read", col_names, count, self.cursor)
|
|
364
382
|
if self.cursor >= len(self.data["a"]):
|
|
383
|
+
self.cursor = 0
|
|
365
384
|
return []
|
|
366
385
|
block = [self.data[col] for col in col_names]
|
|
367
386
|
self.cursor += len(block[0])
|
|
368
387
|
return block
|
|
369
388
|
|
|
389
|
+
def get_schema(self):
|
|
390
|
+
return [
|
|
391
|
+
("a", "int"),
|
|
392
|
+
("b", "str"),
|
|
393
|
+
("dict_col", "json")
|
|
394
|
+
]
|
|
395
|
+
|
|
370
396
|
reader = myReader(
|
|
371
397
|
{
|
|
372
398
|
"a": [1, 2, 3, 4, 5, 6],
|
|
373
399
|
"b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
|
|
400
|
+
"dict_col": [
|
|
401
|
+
{'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
|
|
402
|
+
{'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
|
|
403
|
+
{'id': 3, 'name': 'tom'},
|
|
404
|
+
{'id': 4, 'value': '100'},
|
|
405
|
+
{'id': 5, 'value': 101},
|
|
406
|
+
{'id': 6, 'value': 102}
|
|
407
|
+
],
|
|
374
408
|
}
|
|
375
409
|
)
|
|
376
410
|
|
|
377
|
-
chdb.query(
|
|
378
|
-
|
|
379
|
-
).show()
|
|
411
|
+
chdb.query("SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b").show()
|
|
412
|
+
chdb.query("SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'").show()
|
|
380
413
|
```
|
|
381
414
|
|
|
382
|
-
see also: [test_query_py.py](tests/test_query_py.py).
|
|
415
|
+
see also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
|
|
416
|
+
|
|
417
|
+
### JSON Type Inference
|
|
418
|
+
|
|
419
|
+
chDB automatically converts Python dictionary objects to ClickHouse JSON types from these sources:
|
|
420
|
+
|
|
421
|
+
1. **Pandas DataFrame**
|
|
422
|
+
- Columns with `object` dtype are sampled (default 10,000 rows) to detect JSON structures.
|
|
423
|
+
- Control sampling via SQL settings:
|
|
424
|
+
```sql
|
|
425
|
+
SET pandas_analyze_sample = 10000 -- Default sampling
|
|
426
|
+
SET pandas_analyze_sample = 0 -- Force String type
|
|
427
|
+
SET pandas_analyze_sample = -1 -- Force JSON type
|
|
428
|
+
```
|
|
429
|
+
- Columns are converted to `String` if sampling finds non-dictionary values.
|
|
430
|
+
|
|
431
|
+
2. **Arrow Table**
|
|
432
|
+
- `struct` type columns are automatically mapped to JSON columns.
|
|
433
|
+
- Nested structures preserve type information.
|
|
434
|
+
|
|
435
|
+
3. **chdb.PyReader**
|
|
436
|
+
- Implement custom schema mapping in `get_schema()`:
|
|
437
|
+
```python
|
|
438
|
+
def get_schema(self):
|
|
439
|
+
return [
|
|
440
|
+
("c1", "JSON"), # Explicit JSON mapping
|
|
441
|
+
("c2", "String")
|
|
442
|
+
]
|
|
443
|
+
```
|
|
444
|
+
- Column types declared as "JSON" will bypass auto-detection.
|
|
445
|
+
|
|
446
|
+
When converting Python dictionary objects to JSON columns:
|
|
447
|
+
|
|
448
|
+
1. **Nested Structures**
|
|
449
|
+
- Recursively process nested dictionaries, lists, tuples and NumPy arrays.
|
|
450
|
+
|
|
451
|
+
2. **Primitive Types**
|
|
452
|
+
- Automatic type recognition for basic types such as integers, floats, strings, and booleans, and more.
|
|
453
|
+
|
|
454
|
+
3. **Complex Objects**
|
|
455
|
+
- Non-primitive types will be converted to strings.
|
|
383
456
|
|
|
384
457
|
### Limitations
|
|
385
458
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
chdb/__main__.py,sha256=xNNtDY38d973YM5dlxiIazcqqKhXJSpNb7JflyyrXGE,1185
|
|
2
|
-
chdb/__init__.py,sha256=
|
|
3
|
-
chdb/_chdb.cpython-38-aarch64-linux-gnu.so,sha256=
|
|
2
|
+
chdb/__init__.py,sha256=pNYyLRqm2s5hG1rPhVLECZXvJQ60EGPt_OC88W90j0w,3762
|
|
3
|
+
chdb/_chdb.cpython-38-aarch64-linux-gnu.so,sha256=jWd7xJeE74xK21ep7dDeYMtYrEW-VVzxYN4HgQf8PiY,568828616
|
|
4
4
|
chdb/rwabc.py,sha256=tbiwCrXirfrfx46wCJxS64yvFe6pVWIPGdSuvrAL5Ys,2102
|
|
5
5
|
chdb/session/__init__.py,sha256=fCUROZ5L1-92o2lcASiWJpFu-80-kDoSrNfouLEmLg8,50
|
|
6
|
-
chdb/session/state.py,sha256=
|
|
6
|
+
chdb/session/state.py,sha256=m7K9zZtoMQTlh-pfmSyJV38pAe6eHNTPtOvlHYrImhA,4436
|
|
7
7
|
chdb/dataframe/__init__.py,sha256=1_mrZZiJwqBTnH_P8_FCbbYXIWWY5sxnaFpe3-tDLF4,680
|
|
8
8
|
chdb/dataframe/query.py,sha256=ggvE8A5vtabFg9gSTp99S7LCrnIEwbWtb-PtJVT8Ct0,12759
|
|
9
9
|
chdb/utils/__init__.py,sha256=tXRcwBRGW2YQNBZWV4Mitw5QlCu_qlSRCjllw15XHbs,171
|
|
@@ -21,8 +21,8 @@ chdb/dbapi/constants/FIELD_TYPE.py,sha256=ytFzgAnGmb9hvdsBlnK68qdZv_a6jYFIXT6VSA
|
|
|
21
21
|
chdb/dbapi/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
chdb/state/__init__.py,sha256=RVUIWDqDi7gte4Os7Mz1wPXFyFpdHT_p1klJC7QtluI,55
|
|
23
23
|
chdb/state/sqlitelike.py,sha256=v0xh9jWirHzhDVq26C2213LxfaDbRulSAhSHaTiZ24c,12283
|
|
24
|
-
chdb-3.
|
|
25
|
-
chdb-3.
|
|
26
|
-
chdb-3.
|
|
27
|
-
chdb-3.
|
|
28
|
-
chdb-3.
|
|
24
|
+
chdb-3.4.0.dist-info/RECORD,,
|
|
25
|
+
chdb-3.4.0.dist-info/LICENSE.txt,sha256=isYVtNCO5910aj6e9bJJ6kQceivkLqsMlFSNYwzGGKI,11366
|
|
26
|
+
chdb-3.4.0.dist-info/WHEEL,sha256=kTTaziUGh4IgOXqp-kkEc0J_ej7kOw0CdWocYU-1ZR0,149
|
|
27
|
+
chdb-3.4.0.dist-info/METADATA,sha256=niEMQj5RD1T34yBzfkJzW9K4gziEvAnITAuzNN10AIs,24622
|
|
28
|
+
chdb-3.4.0.dist-info/top_level.txt,sha256=se0Jj0A2-ijfMW51hIjiuNyDJPqy5xJU1G8a_IEdllI,11
|
|
File without changes
|
|
File without changes
|
|
File without changes
|