chdb-3.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chdb might be problematic. Click here for more details.
- chdb/__init__.py +134 -0
- chdb/__main__.py +35 -0
- chdb/_chdb.cpython-311-aarch64-linux-gnu.so +0 -0
- chdb/dataframe/__init__.py +19 -0
- chdb/dataframe/query.py +356 -0
- chdb/dbapi/__init__.py +79 -0
- chdb/dbapi/connections.py +100 -0
- chdb/dbapi/constants/FIELD_TYPE.py +31 -0
- chdb/dbapi/constants/__init__.py +0 -0
- chdb/dbapi/converters.py +293 -0
- chdb/dbapi/cursors.py +247 -0
- chdb/dbapi/err.py +61 -0
- chdb/dbapi/times.py +20 -0
- chdb/rwabc.py +65 -0
- chdb/session/__init__.py +3 -0
- chdb/session/state.py +135 -0
- chdb/state/__init__.py +3 -0
- chdb/state/sqlitelike.py +336 -0
- chdb/udf/__init__.py +3 -0
- chdb/udf/udf.py +106 -0
- chdb/utils/__init__.py +9 -0
- chdb/utils/trace.py +74 -0
- chdb/utils/types.py +234 -0
- chdb-3.3.0.dist-info/METADATA +532 -0
- chdb-3.3.0.dist-info/RECORD +28 -0
- chdb-3.3.0.dist-info/WHEEL +6 -0
- chdb-3.3.0.dist-info/licenses/LICENSE.txt +203 -0
- chdb-3.3.0.dist-info/top_level.txt +2 -0
chdb/udf/udf.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import inspect
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import tempfile
|
|
6
|
+
import atexit
|
|
7
|
+
import shutil
|
|
8
|
+
import textwrap
|
|
9
|
+
from xml.etree import ElementTree as ET
|
|
10
|
+
import chdb
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_udf(func_name, args, return_type, udf_body):
    """Materialize a UDF on disk for ClickHouse's executable-UDF mechanism.

    Two artifacts are written under ``chdb.g_udf_path``:

    - ``<func_name>.py``: an executable script (shebang = current
      ``sys.executable``) that reads TabSeparated rows from stdin, calls the
      user function on each row's fields, and prints the result.
    - ``udf_config.xml``: the functions config; an existing file is parsed
      and a new ``<function>`` entry is appended to it.

    Parameters:
    - func_name: name of the UDF (also used as the script file name).
    - args: ordered list of the function's parameter names.
    - return_type: ClickHouse return type written into the XML config.
    - udf_body: dedented source of the function, decorator line removed.
    """
    script_path = f"{chdb.g_udf_path}/{func_name}.py"

    # Assemble the generated script as a list of lines, then write it once.
    script_lines = [f"#!{sys.executable}\n", "import sys\n", "\n"]
    script_lines.extend(f"{body_line}\n" for body_line in udf_body.split("\n"))
    script_lines.append("\n")
    script_lines.append("if __name__ == '__main__':\n")
    script_lines.append("    for line in sys.stdin:\n")
    script_lines.append("        args = line.strip().split('\t')\n")
    script_lines.extend(
        f"        {name} = args[{pos}]\n" for pos, name in enumerate(args)
    )
    script_lines.append(f"        print({func_name}({', '.join(args)}))\n")
    script_lines.append("        sys.stdout.flush()\n")

    with open(script_path, "w") as script:
        script.writelines(script_lines)
    # Mark the generated script executable so ClickHouse can invoke it.
    os.chmod(script_path, 0o755)

    # Append this function's entry to the (possibly pre-existing) XML config.
    config_path = f"{chdb.g_udf_path}/udf_config.xml"
    if os.path.exists(config_path):
        root = ET.parse(config_path).getroot()
    else:
        root = ET.Element("functions")

    entry = ET.SubElement(root, "function")
    for tag, text in (
        ("type", "executable"),
        ("name", func_name),
        ("return_type", return_type),
        ("format", "TabSeparated"),
        ("command", f"{func_name}.py"),
    ):
        ET.SubElement(entry, tag).text = text

    for name in args:
        argument = ET.SubElement(entry, "argument")
        # TabSeparated input means every argument arrives as a string.
        ET.SubElement(argument, "type").text = "String"
        ET.SubElement(argument, "name").text = name

    ET.ElementTree(root).write(config_path)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def chdb_udf(return_type="String"):
    """
    Decorator for chDB Python UDFs (User Defined Functions).

    1. The function must be stateless; only UDFs are supported, not UDAFs
       (User Defined Aggregate Functions).
    2. The default return type is String. Pass a different type name as the
       decorator argument to change it; it should be one of:
       https://clickhouse.com/docs/en/sql-reference/data-types
    3. All arguments arrive as strings (the input is TabSeparated), so
       convert them inside the function as needed.
    4. The function is called once per input line, roughly like:
    ```
    def sum_udf(lhs, rhs):
        return int(lhs) + int(rhs)

    for line in sys.stdin:
        args = line.strip().split('\t')
        lhs = args[0]
        rhs = args[1]
        print(sum_udf(lhs, rhs))
        sys.stdout.flush()
    ```
    5. The function must be self-contained pure Python: import every module
       you use INSIDE the function body, e.g.:
    ```
    def func_use_json(arg):
        import json
        ...
    ```
    6. The generated script runs under the same Python interpreter as the
       caller (taken from ``sys.executable``).
    """

    def decorator(func):
        signature = inspect.signature(func)
        param_names = list(signature.parameters.keys())
        source = textwrap.dedent(inspect.getsource(func))
        # Drop the first source line, i.e. the "@chdb_udf(...)" decorator.
        body = source.split("\n", 1)[1]

        # Lazily create the shared directory that holds generated scripts
        # and the XML config.
        if chdb.g_udf_path == "":
            chdb.g_udf_path = tempfile.mkdtemp()

        # Best-effort removal of the temp directory on interpreter exit.
        @atexit.register
        def _cleanup():
            try:
                shutil.rmtree(chdb.g_udf_path)
            except:  # noqa
                pass

        generate_udf(func.__name__, param_names, return_type, body)

        # The decorated function stays directly callable from Python.
        @functools.wraps(func)
        def wrapper(*call_args, **call_kwargs):
            return func(*call_args, **call_kwargs)

        return wrapper

    return decorator
|
chdb/utils/__init__.py
ADDED
chdb/utils/trace.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import inspect
|
|
3
|
+
import sys
|
|
4
|
+
import linecache
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
# Global switch for print_lines: the decorator only instruments functions
# when this is True at decoration time (flip it before applying the
# decorator); when False, functions are returned unchanged.
enable_print = False
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def print_lines(func):
    """Decorator that traces *func*, printing each executed source line.

    When the module-level ``enable_print`` flag is False (the default),
    *func* is returned untouched, so there is zero overhead. When it is
    True, every call installs a line tracer that prints, per executed line:
    a ``HH:MM:SS.mmm`` timestamp, the ``Class.method`` or function name,
    the line number with its source text, and the current local variables
    (``self`` is omitted for methods).

    Fixes over the original version:
    - the previously installed trace function (debugger, coverage tool, ...)
      is saved with ``sys.gettrace()`` and restored afterwards, instead of
      being unconditionally cleared with ``sys.settrace(None)``;
    - tracing is torn down in a ``finally`` block, so an exception raised
      by *func* no longer leaves the tracer installed.
    """
    if not enable_print:
        return func

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Heuristic method detection: a bound method, or a first positional
        # argument whose class has an attribute named like this function.
        is_method = inspect.ismethod(func) or (
            len(args) > 0 and hasattr(args[0].__class__, func.__name__)
        )
        class_name = args[0].__class__.__name__ if is_method else None  # type: ignore

        # Probe that source is available; without it the tracer would print
        # nothing useful, so fall back to a plain, untraced call.
        try:
            inspect.getsourcelines(func)
        except OSError:
            print(f"Warning: Could not get source for {func.__name__}")
            return func(*args, **kwargs)

        def trace(frame, event, arg):
            if event == "line":
                # Current line number and its source text
                line_no = frame.f_lineno
                line = linecache.getline(frame.f_code.co_filename, line_no).strip()

                # Skip decorator lines and empty lines
                if line and not line.startswith("@"):
                    local_vars = frame.f_locals.copy()
                    if is_method:
                        # Remove 'self' from local variables for clarity
                        local_vars.pop("self", None)

                    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]

                    # Context string: class.method for methods, else the name
                    context = (
                        f"{class_name}.{func.__name__}" if class_name else func.__name__
                    )

                    print(f"[{timestamp}] {context} line {line_no}: {line}")

                    if local_vars:
                        vars_str = ", ".join(
                            f"{k}={repr(v)}" for k, v in local_vars.items()
                        )
                        print(f"    Variables: {vars_str}")
            return trace

        # Remember whatever tracer was active so it can be restored.
        previous_trace = sys.gettrace()
        sys.settrace(trace)
        try:
            return func(*args, **kwargs)
        finally:
            # Always restore, even if func raised.
            sys.settrace(previous_trace)

    return wrapper
|
chdb/utils/types.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
import json
|
|
4
|
+
import decimal
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_to_columnar(items: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
    """
    Converts a list of dictionaries into a columnar format.

    Each input dictionary is flattened first (see ``flatten_dict``), then the
    result maps every key to a list of per-row values, with ``None`` filling
    in rows where the key is missing. Columns appear in first-seen order,
    which makes the output deterministic across runs (the original collected
    keys in a ``set``, whose iteration order depends on string hashing).

    Parameters:
    - items (List[Dict[str, Any]]): A list of dictionaries to convert.

    Returns:
    - Dict[str, List[Any]]: A dictionary with keys as column names and values
      as lists of column values.

    Example:
    >>> items = [
    ...     {"name": "Alice", "age": 30, "city": "New York"},
    ...     {"name": "Bob", "age": 25},
    ...     {"name": "Charlie", "city": "San Francisco"}
    ... ]
    >>> convert_to_columnar(items)
    {
        'name': ['Alice', 'Bob', 'Charlie'],
        'age': [30, 25, None],
        'city': ['New York', None, 'San Francisco']
    }
    """
    if not items:
        return {}

    flattened_items = [flatten_dict(item) for item in items]

    # Collect keys in first-seen order; a dict used as an ordered set keeps
    # the column order stable and independent of PYTHONHASHSEED.
    seen_keys: Dict[str, None] = {}
    for flattened_item in flattened_items:
        for key in flattened_item:
            seen_keys[key] = None

    # Fill each column, padding missing values with None.
    columns: Dict[str, List[Any]] = {key: [] for key in seen_keys}
    for flattened_item in flattened_items:
        for key in columns:
            columns[key].append(flattened_item.get(key, None))

    return columns
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def flatten_dict(
    d: Dict[str, Any], parent_key: str = "", sep: str = "_"
) -> Dict[str, Any]:
    """
    Flattens a nested dictionary.

    Nested keys are joined with *sep*. A list whose elements are ALL dicts is
    serialized to a JSON string under its own key; any other list is exploded
    into one key per element (``key_0``, ``key_1``, ...), with dict elements
    flattened recursively.

    Parameters:
    - d (Dict[str, Any]): The dictionary to flatten.
    - parent_key (str, optional): The base key to prepend to each key. Defaults to "".
    - sep (str, optional): The separator to use between concatenated keys. Defaults to "_".

    Returns:
    - Dict[str, Any]: A flattened dictionary.

    Example:
    >>> nested_dict = {
    ...     "a": 1,
    ...     "b": {"c": 2, "d": {"e": 3}},
    ...     "f": [4, 5, {"g": 6}],
    ...     "h": [{"i": 7}, {"j": 8}]
    ... }
    >>> flatten_dict(nested_dict)
    {
        'a': 1,
        'b_c': 2,
        'b_d_e': 3,
        'f_0': 4,
        'f_1': 5,
        'f_2_g': 6,
        'h': '[{"i": 7}, {"j": 8}]'
    }
    """
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dicts, prefixing with the current key.
            flat.update(flatten_dict(value, full_key, sep=sep))
        elif isinstance(value, list):
            if all(isinstance(element, dict) for element in value):
                # Homogeneous list of dicts: keep as one JSON-encoded value.
                flat[full_key] = json.dumps(value)
            else:
                # Mixed/scalar list: one entry per element, indexed by position.
                for index, element in enumerate(value):
                    indexed_key = f"{full_key}{sep}{index}"
                    if isinstance(element, dict):
                        flat.update(flatten_dict(element, indexed_key, sep=sep))
                    else:
                        flat[indexed_key] = element
        else:
            flat[full_key] = value
    return flat
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def infer_data_types(
    column_data: Dict[str, List[Any]], n_rows: int = 10000
) -> List[tuple]:
    """
    Infers data types for each column in a columnar data structure.

    Only the first *n_rows* values of each column are examined; the actual
    per-column inference is delegated to ``infer_data_type``.

    Parameters:
    - column_data (Dict[str, List[Any]]): A dictionary where keys are column names
      and values are lists of column values.
    - n_rows (int, optional): The number of rows to sample for type inference. Defaults to 10000.

    Returns:
    - List[tuple]: A list of tuples, each containing a column name and its inferred data type.
    """
    return [
        (name, infer_data_type(values[:n_rows]))
        for name, values in column_data.items()
    ]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def infer_data_type(values: List[Any]) -> str:
    """
    Infers the most suitable data type for a list of values.

    Examines the values and returns the narrowest type that can represent all
    of them, considering signed integers, unsigned integers, decimals and
    floats, defaulting to "string".

    Parameters:
    - values (List[Any]): A list of values to analyze. The values can be of any type.

    Returns:
    - str: One of "int8", "int16", "int32", "int64", "int128", "int256",
      "uint8", "uint16", "uint32", "uint64", "uint128", "uint256",
      "decimal128", "decimal256", "float32", "float64", or "string".

    Notes:
    - If all values in the list are None (or the list is empty), returns "string".
    - If any value in the list is a string, returns "string" immediately.
    - Fixes over the original version:
      * ``int(2.5)`` truncates rather than raising, so non-integral
        floats/Decimals were misclassified as integer types; values are now
        accepted as integers only when exactly integral.
      * decimal/float width was chosen from ``abs(max_val)`` alone, ignoring
        a large-magnitude negative minimum; the largest magnitude on either
        side is now used.
    """

    int_range = {
        "int8": (-(2**7), 2**7 - 1),
        "int16": (-(2**15), 2**15 - 1),
        "int32": (-(2**31), 2**31 - 1),
        "int64": (-(2**63), 2**63 - 1),
        "int128": (-(2**127), 2**127 - 1),
        "int256": (-(2**255), 2**255 - 1),
    }
    uint_range = {
        "uint8": (0, 2**8 - 1),
        "uint16": (0, 2**16 - 1),
        "uint32": (0, 2**32 - 1),
        "uint64": (0, 2**64 - 1),
        "uint128": (0, 2**128 - 1),
        "uint256": (0, 2**256 - 1),
    }

    max_val = float("-inf")
    min_val = float("inf")
    is_int = True
    is_decimal = True
    is_float = True

    all_none = True

    for val in values:
        if val is None:
            continue
        all_none = False
        if isinstance(val, str):
            return "string"

        try:
            num = int(val)
            # int() truncates (int(2.5) == 2), so only values that are
            # exactly integral count as integers; otherwise fall through
            # to the decimal/float checks.
            if num != val:
                raise ValueError
            max_val = max(max_val, num)
            min_val = min(min_val, num)
        except (ValueError, TypeError):
            is_int = False
            try:
                num = decimal.Decimal(val)
                max_val = max(max_val, float(num))
                min_val = min(min_val, float(num))
            except (decimal.InvalidOperation, TypeError):
                is_decimal = False
                try:
                    num = float(val)
                    max_val = max(max_val, num)
                    min_val = min(min_val, num)
                except (ValueError, TypeError):
                    is_float = False
                    return "string"

    if all_none:
        return "string"

    if is_int:
        # Signed ranges are preferred; unsigned only when no signed type fits.
        for dtype, (min_val_dtype, max_val_dtype) in int_range.items():
            if min_val_dtype <= min_val and max_val <= max_val_dtype:
                return dtype
        for dtype, (_, max_val_dtype) in uint_range.items():
            if max_val <= max_val_dtype:
                return dtype

    # Width must cover the largest magnitude on either side of zero.
    magnitude = max(abs(max_val), abs(min_val))

    if is_decimal:
        return "decimal128" if magnitude < 10**38 else "decimal256"

    if is_float:
        return "float32" if magnitude < 3.4e38 else "float64"

    return "string"
|