chdb 3.4.1__cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chdb might be problematic. Click here for more details.

chdb/udf/udf.py ADDED
@@ -0,0 +1,106 @@
1
+ import functools
2
+ import inspect
3
+ import os
4
+ import sys
5
+ import tempfile
6
+ import atexit
7
+ import shutil
8
+ import textwrap
9
+ from xml.etree import ElementTree as ET
10
+ import chdb
11
+
12
+
13
def generate_udf(func_name, args, return_type, udf_body):
    """
    Materialize one chDB UDF: an executable Python script plus its XML entry.

    Parameters:
    - func_name (str): name of the UDF; also the script's file name.
    - return_type (str): ClickHouse return type of the UDF.
    - args (list[str]): argument names of the UDF, in call order.
    - udf_body (str): source of the UDF function (decorator lines removed).

    Side effects: writes/overwrites ``<g_udf_path>/<func_name>.py`` (made
    executable) and updates ``<g_udf_path>/udf_config.xml``, replacing any
    stale entry with the same function name.
    """
    # generate the executable python script
    script_path = os.path.join(chdb.g_udf_path, f"{func_name}.py")
    with open(script_path, "w") as f:
        f.write(f"#!{sys.executable}\n")
        f.write("import sys\n")
        f.write("\n")
        for line in udf_body.split("\n"):
            f.write(f"{line}\n")
        f.write("\n")
        f.write("if __name__ == '__main__':\n")
        f.write("    for line in sys.stdin:\n")
        # Use rstrip('\n'), not strip(): strip() would also remove leading
        # and trailing tabs, silently dropping empty TabSeparated fields.
        f.write("        args = line.rstrip('\\n').split('\\t')\n")
        for i, arg in enumerate(args):
            f.write(f"        {arg} = args[{i}]\n")
        f.write(f"        print({func_name}({', '.join(args)}))\n")
        f.write("        sys.stdout.flush()\n")
    os.chmod(script_path, 0o755)

    # generate/update the xml config; drop any existing entry with the same
    # name so re-registering a UDF does not accumulate duplicates
    xml_file = os.path.join(chdb.g_udf_path, "udf_config.xml")
    root = ET.Element("functions")
    if os.path.exists(xml_file):
        root = ET.parse(xml_file).getroot()
        for existing in list(root.findall("function")):
            name_node = existing.find("name")
            if name_node is not None and name_node.text == func_name:
                root.remove(existing)
    function = ET.SubElement(root, "function")
    ET.SubElement(function, "type").text = "executable"
    ET.SubElement(function, "name").text = func_name
    ET.SubElement(function, "return_type").text = return_type
    ET.SubElement(function, "format").text = "TabSeparated"
    ET.SubElement(function, "command").text = f"{func_name}.py"
    for arg in args:
        argument = ET.SubElement(function, "argument")
        # We use TabSeparated format, so assume all arguments are strings
        ET.SubElement(argument, "type").text = "String"
        ET.SubElement(argument, "name").text = arg
    ET.ElementTree(root).write(xml_file)
49
+
50
+
51
def chdb_udf(return_type="String"):
    """
    Decorator for chDB Python UDF(User Defined Function).
    1. The function should be stateless. So, only UDFs are supported, not UDAFs(User Defined Aggregation Function).
    2. Default return type is String. If you want to change the return type, you can pass in the return type as an argument.
       The return type should be one of the following: https://clickhouse.com/docs/en/sql-reference/data-types
    3. The function should take in arguments of type String. As the input is TabSeparated, all arguments are strings.
    4. The function will be called for each line of input. Something like this:
    ```
    def sum_udf(lhs, rhs):
        return int(lhs) + int(rhs)

    for line in sys.stdin:
        args = line.strip().split('\t')
        lhs = args[0]
        rhs = args[1]
        print(sum_udf(lhs, rhs))
        sys.stdout.flush()
    ```
    5. The function should be pure python function. You SHOULD import all python modules used IN THE FUNCTION.
    ```
    def func_use_json(arg):
        import json
        ...
    ```
    6. The Python interpreter used is the same one running this script, taken from `sys.executable`.
    """

    def decorator(func):
        func_name = func.__name__
        sig = inspect.signature(func)
        args = list(sig.parameters.keys())
        src = textwrap.dedent(inspect.getsource(func))
        # Strip the decorator from the captured source. A decorator call may
        # span several lines, so skip everything up to the `def` line rather
        # than assuming the decorator is exactly one line.
        lines = src.split("\n")
        def_index = 0
        while def_index < len(lines) and not lines[def_index].lstrip().startswith("def "):
            def_index += 1
        udf_body = "\n".join(lines[def_index:])

        # Create the shared tmp dir once; register the cleanup only when we
        # created the directory ourselves. Registering per decorated function
        # would stack redundant atexit handlers, and a user-supplied
        # g_udf_path is not ours to delete.
        if chdb.g_udf_path == "":
            chdb.g_udf_path = tempfile.mkdtemp()

            @atexit.register
            def _cleanup():
                try:
                    shutil.rmtree(chdb.g_udf_path)
                except:  # noqa
                    pass

        generate_udf(func_name, args, return_type, udf_body)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # The UDF remains directly callable as a normal Python function.
            return func(*args, **kwargs)

        return wrapper

    return decorator
chdb/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
1
# Re-export the public helpers from chdb.utils.types at package level.
from .types import *  # noqa: F403

# Public API of chdb.utils.
# NOTE(review): "trace" is not defined in .types (trace.py defines
# print_lines/enable_print, and the module itself is not imported here), so
# `from chdb.utils import *` may raise AttributeError on it — confirm the
# intended export.
__all__ = [  # noqa: F405
    "flatten_dict",
    "convert_to_columnar",
    "infer_data_type",
    "infer_data_types",
    "trace",
]
chdb/utils/trace.py ADDED
@@ -0,0 +1,74 @@
1
+ import functools
2
+ import inspect
3
+ import sys
4
+ import linecache
5
+ from datetime import datetime
6
+
7
# Module-level switch, checked at decoration time: when False, print_lines
# returns the function unchanged (zero overhead).
enable_print = False


def print_lines(func):
    """
    Decorator that prints every executed source line of *func* while it runs.

    Each traced line is printed with a timestamp, a context label
    (``Class.method`` when the first argument looks like ``self``, otherwise
    the function name), the line number, the source text, and the current
    local variables. Controlled by the module-level ``enable_print`` flag.
    """
    if not enable_print:
        return func

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Heuristic method detection: first positional arg whose class has an
        # attribute named like the function is treated as `self`.
        is_method = inspect.ismethod(func) or (
            len(args) > 0 and hasattr(args[0].__class__, func.__name__)
        )
        class_name = args[0].__class__.__name__ if is_method else None  # type: ignore

        # If source is unavailable (REPL definitions, C extensions), fall
        # back to calling the function untraced.
        try:
            source_lines, start_line = inspect.getsourcelines(func)
        except OSError:
            print(f"Warning: Could not get source for {func.__name__}")
            return func(*args, **kwargs)

        def trace(frame, event, arg):
            if event == "line":
                line_no = frame.f_lineno
                line = linecache.getline(frame.f_code.co_filename, line_no).strip()

                # Skip decorator lines and blank lines.
                if line and not line.startswith("@"):
                    local_vars = frame.f_locals.copy()
                    if is_method:
                        # Drop 'self' from the variable dump for clarity.
                        local_vars.pop("self", None)

                    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
                    context = (
                        f"{class_name}.{func.__name__}" if class_name else func.__name__
                    )
                    print(f"[{timestamp}] {context} line {line_no}: {line}")

                    if local_vars:
                        vars_str = ", ".join(
                            f"{k}={repr(v)}" for k, v in local_vars.items()
                        )
                        print(f"    Variables: {vars_str}")
            return trace

        # Fix: restore the previous trace function in a finally block so an
        # exception in func no longer leaves tracing enabled process-wide,
        # and an already-installed tracer (e.g. a debugger) is put back.
        previous = sys.gettrace()
        sys.settrace(trace)
        try:
            return func(*args, **kwargs)
        finally:
            sys.settrace(previous)

    return wrapper
chdb/utils/types.py ADDED
@@ -0,0 +1,234 @@
1
+ from collections import defaultdict
2
+ from typing import List, Dict, Any
3
+ import json
4
+ import decimal
5
+
6
+
7
def convert_to_columnar(items: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
    """
    Converts a list of dictionaries into a columnar format.

    Each dictionary is flattened with `flatten_dict`, then the rows are
    transposed into a dictionary mapping column name to the list of column
    values, with None filling in where a row lacks a key. Columns are ordered
    by first appearance, so the layout is deterministic across runs (the
    previous set-based key collection made column order arbitrary).

    Parameters:
    - items (List[Dict[str, Any]]): A list of dictionaries to convert.

    Returns:
    - Dict[str, List[Any]]: A dictionary with keys as column names and values
      as lists of column values.

    Example:
    >>> items = [
    ...     {"name": "Alice", "age": 30, "city": "New York"},
    ...     {"name": "Bob", "age": 25},
    ...     {"name": "Charlie", "city": "San Francisco"}
    ... ]
    >>> convert_to_columnar(items)
    {
        'name': ['Alice', 'Bob', 'Charlie'],
        'age': [30, 25, None],
        'city': ['New York', None, 'San Francisco']
    }
    """
    if not items:
        return {}

    flattened_items = [flatten_dict(item) for item in items]

    # Collect all keys in first-seen order (dicts preserve insertion order).
    keys: Dict[str, None] = {}
    for flattened_item in flattened_items:
        for key in flattened_item:
            keys.setdefault(key, None)

    # Transpose rows into columns, padding missing cells with None.
    return {
        key: [flattened_item.get(key) for flattened_item in flattened_items]
        for key in keys
    }
52
+
53
+
54
def flatten_dict(
    d: Dict[str, Any], parent_key: str = "", sep: str = "_"
) -> Dict[str, Any]:
    """
    Flattens a nested dictionary.

    Nested keys are joined with *sep*. A list whose elements are all
    dictionaries is serialized to a JSON string; any other list is expanded
    element by element with the index appended to the key (recursing into
    dict elements).

    Parameters:
    - d (Dict[str, Any]): The dictionary to flatten.
    - parent_key (str, optional): The base key to prepend to each key. Defaults to "".
    - sep (str, optional): The separator to use between concatenated keys. Defaults to "_".

    Returns:
    - Dict[str, Any]: A flattened dictionary.

    Example:
    >>> nested_dict = {
    ...     "a": 1,
    ...     "b": {"c": 2, "d": {"e": 3}},
    ...     "f": [4, 5, {"g": 6}],
    ...     "h": [{"i": 7}, {"j": 8}]
    ... }
    >>> flatten_dict(nested_dict)
    {
        'a': 1,
        'b_c': 2,
        'b_d_e': 3,
        'f_0': 4,
        'f_1': 5,
        'f_2_g': 6,
        'h': '[{"i": 7}, {"j": 8}]'
    }
    """
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        compound = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dictionaries.
            flat.update(flatten_dict(value, compound, sep=sep))
        elif isinstance(value, list):
            if all(isinstance(elem, dict) for elem in value):
                # Homogeneous list of dicts: keep it as one JSON column.
                flat[compound] = json.dumps(value)
            else:
                # Mixed/scalar list: expand per element, index in the key.
                for idx, elem in enumerate(value):
                    indexed = f"{compound}{sep}{idx}"
                    if isinstance(elem, dict):
                        flat.update(flatten_dict(elem, indexed, sep=sep))
                    else:
                        flat[indexed] = elem
        else:
            flat[compound] = value
    return flat
113
+
114
+
115
def infer_data_types(
    column_data: Dict[str, List[Any]], n_rows: int = 10000
) -> List[tuple]:
    """
    Infers data types for each column in a columnar data structure.

    For every column, at most the first *n_rows* values are examined with
    `infer_data_type` to pick the most suitable type.

    Parameters:
    - column_data (Dict[str, List[Any]]): A dictionary where keys are column
      names and values are lists of column values.
    - n_rows (int, optional): The number of rows to sample for type
      inference. Defaults to 10000.

    Returns:
    - List[tuple]: A list of (column name, inferred data type) tuples.
    """
    return [
        (name, infer_data_type(values[:n_rows]))
        for name, values in column_data.items()
    ]
138
+
139
+
140
def infer_data_type(values: List[Any]) -> str:
    """
    Infers the most suitable data type for a list of values.

    Considers signed/unsigned integer, decimal, and float types, defaulting
    to "string" when the values cannot be represented numerically or when all
    values are None.

    Parameters:
    - values (List[Any]): A list of values to analyze. The values can be of any type.

    Returns:
    - str: One of "int8".."int256", "uint8".."uint256", "decimal128",
      "decimal256", "float32", "float64", or "string".

    Notes:
    - If all values in the list are None, the function returns "string".
    - If any value in the list is a string, the function immediately returns "string".
    - A value is treated as an integer only when it is exactly integral:
      the previous `int(val)` probe silently truncated 3.5 to 3 and
      classified plain floats as integers.
    """

    int_range = {
        "int8": (-(2**7), 2**7 - 1),
        "int16": (-(2**15), 2**15 - 1),
        "int32": (-(2**31), 2**31 - 1),
        "int64": (-(2**63), 2**63 - 1),
        "int128": (-(2**127), 2**127 - 1),
        "int256": (-(2**255), 2**255 - 1),
    }
    uint_range = {
        "uint8": (0, 2**8 - 1),
        "uint16": (0, 2**16 - 1),
        "uint32": (0, 2**32 - 1),
        "uint64": (0, 2**64 - 1),
        "uint128": (0, 2**128 - 1),
        "uint256": (0, 2**256 - 1),
    }

    max_val = float("-inf")
    min_val = float("inf")
    is_int = True
    is_decimal = True
    all_none = True

    for val in values:
        if val is None:
            continue
        all_none = False
        if isinstance(val, str):
            return "string"

        # Integer candidate only when the value is exactly integral; int()
        # truncates floats/Decimals, so compare the result back against val.
        # OverflowError covers int(float("inf")), which previously crashed.
        try:
            as_int = int(val)
            integral = as_int == val
        except (ValueError, TypeError, OverflowError):
            integral = False

        if integral:
            max_val = max(max_val, as_int)
            min_val = min(min_val, as_int)
            continue

        is_int = False
        # Only genuine Decimal values keep the decimal candidate alive;
        # plain non-integral floats should infer as float, not decimal.
        if not isinstance(val, decimal.Decimal):
            is_decimal = False
        try:
            num = float(val)
        except (ValueError, TypeError):
            return "string"
        max_val = max(max_val, num)
        min_val = min(min_val, num)

    if all_none:
        return "string"

    if is_int:
        for dtype, (lo, hi) in int_range.items():
            if lo <= min_val and max_val <= hi:
                return dtype
        for dtype, (_, hi) in uint_range.items():
            if max_val <= hi:
                return dtype

    # Width checks must account for the largest magnitude on either side of
    # zero; looking at max_val alone would ignore large negative values.
    magnitude = max(abs(max_val), abs(min_val))
    if is_decimal:
        return "decimal128" if magnitude < 10**38 else "decimal256"
    return "float32" if magnitude < 3.4e38 else "float64"