clickhouse-driver 0.2.1-cp39-cp39-win_amd64.whl → 0.2.8-cp39-cp39-win_amd64.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- clickhouse_driver/__init__.py +9 -9
- clickhouse_driver/block.py +227 -195
- clickhouse_driver/blockstreamprofileinfo.py +22 -22
- clickhouse_driver/bufferedreader.cp39-win_amd64.pyd +0 -0
- clickhouse_driver/bufferedwriter.cp39-win_amd64.pyd +0 -0
- clickhouse_driver/client.py +896 -666
- clickhouse_driver/clientinfo.py +119 -80
- clickhouse_driver/columns/arraycolumn.py +161 -150
- clickhouse_driver/columns/base.py +221 -147
- clickhouse_driver/columns/boolcolumn.py +7 -0
- clickhouse_driver/columns/datecolumn.py +108 -49
- clickhouse_driver/columns/datetimecolumn.py +202 -207
- clickhouse_driver/columns/decimalcolumn.py +116 -118
- clickhouse_driver/columns/enumcolumn.py +119 -119
- clickhouse_driver/columns/exceptions.py +12 -12
- clickhouse_driver/columns/floatcolumn.py +34 -34
- clickhouse_driver/columns/intcolumn.py +157 -157
- clickhouse_driver/columns/intervalcolumn.py +33 -33
- clickhouse_driver/columns/ipcolumn.py +118 -118
- clickhouse_driver/columns/jsoncolumn.py +37 -0
- clickhouse_driver/columns/largeint.cp39-win_amd64.pyd +0 -0
- clickhouse_driver/columns/lowcardinalitycolumn.py +142 -123
- clickhouse_driver/columns/mapcolumn.py +73 -58
- clickhouse_driver/columns/nestedcolumn.py +10 -0
- clickhouse_driver/columns/nothingcolumn.py +13 -13
- clickhouse_driver/columns/nullablecolumn.py +7 -7
- clickhouse_driver/columns/nullcolumn.py +15 -15
- clickhouse_driver/columns/numpy/base.py +47 -14
- clickhouse_driver/columns/numpy/boolcolumn.py +8 -0
- clickhouse_driver/columns/numpy/datecolumn.py +19 -12
- clickhouse_driver/columns/numpy/datetimecolumn.py +143 -145
- clickhouse_driver/columns/numpy/floatcolumn.py +24 -13
- clickhouse_driver/columns/numpy/intcolumn.py +43 -43
- clickhouse_driver/columns/numpy/lowcardinalitycolumn.py +96 -83
- clickhouse_driver/columns/numpy/service.py +58 -80
- clickhouse_driver/columns/numpy/stringcolumn.py +78 -76
- clickhouse_driver/columns/numpy/tuplecolumn.py +37 -0
- clickhouse_driver/columns/service.py +185 -131
- clickhouse_driver/columns/simpleaggregatefunctioncolumn.py +7 -7
- clickhouse_driver/columns/stringcolumn.py +73 -73
- clickhouse_driver/columns/tuplecolumn.py +63 -65
- clickhouse_driver/columns/util.py +60 -0
- clickhouse_driver/columns/uuidcolumn.py +64 -64
- clickhouse_driver/compression/__init__.py +28 -28
- clickhouse_driver/compression/base.py +87 -52
- clickhouse_driver/compression/lz4.py +21 -55
- clickhouse_driver/compression/lz4hc.py +9 -9
- clickhouse_driver/compression/zstd.py +20 -51
- clickhouse_driver/connection.py +784 -632
- clickhouse_driver/context.py +36 -36
- clickhouse_driver/dbapi/__init__.py +62 -62
- clickhouse_driver/dbapi/connection.py +99 -96
- clickhouse_driver/dbapi/cursor.py +370 -368
- clickhouse_driver/dbapi/errors.py +40 -40
- clickhouse_driver/dbapi/extras.py +73 -0
- clickhouse_driver/defines.py +55 -42
- clickhouse_driver/errors.py +453 -446
- clickhouse_driver/log.py +48 -44
- clickhouse_driver/numpy/block.py +8 -8
- clickhouse_driver/numpy/helpers.py +25 -25
- clickhouse_driver/numpy/result.py +123 -123
- clickhouse_driver/opentelemetry.py +43 -0
- clickhouse_driver/progress.py +38 -32
- clickhouse_driver/protocol.py +114 -105
- clickhouse_driver/queryprocessingstage.py +8 -8
- clickhouse_driver/reader.py +69 -69
- clickhouse_driver/readhelpers.py +26 -26
- clickhouse_driver/result.py +144 -144
- clickhouse_driver/settings/available.py +405 -405
- clickhouse_driver/settings/types.py +50 -50
- clickhouse_driver/settings/writer.py +34 -29
- clickhouse_driver/streams/compressed.py +88 -88
- clickhouse_driver/streams/native.py +102 -90
- clickhouse_driver/util/compat.py +39 -0
- clickhouse_driver/util/escape.py +94 -55
- clickhouse_driver/util/helpers.py +57 -57
- clickhouse_driver/varint.cp39-win_amd64.pyd +0 -0
- clickhouse_driver/writer.py +67 -67
- {clickhouse_driver-0.2.1.dist-info → clickhouse_driver-0.2.8.dist-info}/LICENSE +21 -21
- clickhouse_driver-0.2.8.dist-info/METADATA +201 -0
- clickhouse_driver-0.2.8.dist-info/RECORD +89 -0
- {clickhouse_driver-0.2.1.dist-info → clickhouse_driver-0.2.8.dist-info}/WHEEL +1 -1
- clickhouse_driver-0.2.1.dist-info/METADATA +0 -24
- clickhouse_driver-0.2.1.dist-info/RECORD +0 -80
- {clickhouse_driver-0.2.1.dist-info → clickhouse_driver-0.2.8.dist-info}/top_level.txt +0 -0
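Before relying on any of the changes below, it can help to confirm which version is actually installed in a given environment. A small sketch using only the standard library; nothing package-specific is assumed beyond the PyPI distribution name:

```python
from importlib.metadata import version

# The distribution name on PyPI is 'clickhouse-driver'; the import package
# seen throughout this diff is 'clickhouse_driver'.
print(version('clickhouse-driver'))  # expected to report 0.2.8 after the upgrade
```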
clickhouse_driver/columns/intervalcolumn.py
@@ -1,33 +1,33 @@
-from .intcolumn import Int64Column
-
-
-class IntervalColumn(Int64Column):
-    pass
-
-
-class IntervalDayColumn(IntervalColumn):
-    ch_type = 'IntervalDay'
-
-
-class IntervalWeekColumn(IntervalColumn):
-    ch_type = 'IntervalWeek'
-
-
-class IntervalMonthColumn(IntervalColumn):
-    ch_type = 'IntervalMonth'
-
-
-class IntervalYearColumn(IntervalColumn):
-    ch_type = 'IntervalYear'
-
-
-class IntervalHourColumn(IntervalColumn):
-    ch_type = 'IntervalHour'
-
-
-class IntervalMinuteColumn(IntervalColumn):
-    ch_type = 'IntervalMinute'
-
-
-class IntervalSecondColumn(IntervalColumn):
-    ch_type = 'IntervalSecond'
+from .intcolumn import Int64Column
+
+
+class IntervalColumn(Int64Column):
+    pass
+
+
+class IntervalDayColumn(IntervalColumn):
+    ch_type = 'IntervalDay'
+
+
+class IntervalWeekColumn(IntervalColumn):
+    ch_type = 'IntervalWeek'
+
+
+class IntervalMonthColumn(IntervalColumn):
+    ch_type = 'IntervalMonth'
+
+
+class IntervalYearColumn(IntervalColumn):
+    ch_type = 'IntervalYear'
+
+
+class IntervalHourColumn(IntervalColumn):
+    ch_type = 'IntervalHour'
+
+
+class IntervalMinuteColumn(IntervalColumn):
+    ch_type = 'IntervalMinute'
+
+
+class IntervalSecondColumn(IntervalColumn):
+    ch_type = 'IntervalSecond'
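The IntervalColumn family above simply reuses Int64Column with per-type ch_type names, so interval results come back to Python as plain integers. A minimal usage sketch, assuming a ClickHouse server reachable at localhost; the query literals are only illustrative:

```python
from clickhouse_driver import Client

client = Client('localhost')  # assumed local server

# IntervalHourColumn and friends subclass Int64Column, so the driver
# returns each interval's numeric value as a plain Python int.
rows = client.execute('SELECT INTERVAL 3 HOUR, INTERVAL 1 DAY')
print(rows)  # e.g. [(3, 1)]
```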
clickhouse_driver/columns/ipcolumn.py
@@ -1,118 +1,118 @@
-from ipaddress import IPv4Address, IPv6Address, AddressValueError
-
-from .. import errors
-from .exceptions import ColumnTypeMismatchException
-from .stringcolumn import ByteFixedString
-from .intcolumn import UInt32Column
-
-
-class IPv4Column(UInt32Column):
-    ch_type = "IPv4"
-    py_types = (str, IPv4Address, int)
-
-    def __init__(self, types_check=False, **kwargs):
-        # UIntColumn overrides before_write_item and check_item
-        # in its __init__ when types_check is True so we force
-        # __init__ without it then add the appropriate check method for IPv4
-        super(UInt32Column, self).__init__(types_check=False, **kwargs)
-
-        self.types_check_enabled = types_check
-        if types_check:
-
-            def check_item(value):
-                if isinstance(value, int) and value < 0:
-                    raise ColumnTypeMismatchException(value)
-
-                if not isinstance(value, IPv4Address):
-                    try:
-                        value = IPv4Address(value)
-                    except AddressValueError:
-                        # Cannot parse input in a valid IPv4
-                        raise ColumnTypeMismatchException(value)
-
-            self.check_item = check_item
-
-    def after_read_items(self, items, nulls_map=None):
-        if nulls_map is None:
-            return tuple(IPv4Address(item) for item in items)
-        else:
-            return tuple(
-                (None if is_null else IPv4Address(items[i]))
-                for i, is_null in enumerate(nulls_map)
-            )
-
-    def before_write_items(self, items, nulls_map=None):
-        null_value = self.null_value
-
-        for i, item in enumerate(items):
-            if nulls_map and nulls_map[i]:
-                items[i] = null_value
-                continue
-
-            # allow Ipv4 in integer, string or IPv4Address object
-            try:
-                if isinstance(item, int):
-                    continue
-
-                if not isinstance(item, IPv4Address):
-                    item = IPv4Address(item)
-
-                items[i] = int(item)
-            except AddressValueError:
-                raise errors.CannotParseDomainError(
-                    "Cannot parse IPv4 '{}'".format(item)
-                )
-
-
-class IPv6Column(ByteFixedString):
-    ch_type = "IPv6"
-    py_types = (str, IPv6Address, bytes)
-
-    def __init__(self, types_check=False, **kwargs):
-        super(IPv6Column, self).__init__(16, types_check=types_check, **kwargs)
-
-        if types_check:
-
-            def check_item(value):
-                if isinstance(value, bytes) and len(value) != 16:
-                    raise ColumnTypeMismatchException(value)
-
-                if not isinstance(value, IPv6Address):
-                    try:
-                        value = IPv6Address(value)
-                    except AddressValueError:
-                        # Cannot parse input in a valid IPv6
-                        raise ColumnTypeMismatchException(value)
-
-            self.check_item = check_item
-
-    def after_read_items(self, items, nulls_map=None):
-        if nulls_map is None:
-            return tuple(IPv6Address(item) for item in items)
-        else:
-            return tuple(
-                (None if is_null else IPv6Address(items[i]))
-                for i, is_null in enumerate(nulls_map)
-            )
-
-    def before_write_items(self, items, nulls_map=None):
-        null_value = self.null_value
-
-        for i, item in enumerate(items):
-            if nulls_map and nulls_map[i]:
-                items[i] = null_value
-                continue
-
-            # allow Ipv6 in bytes or python IPv6Address
-            # this is raw bytes (not encoded) in order to fit FixedString(16)
-            try:
-                if isinstance(item, bytes):
-                    continue
-
-                if not isinstance(item, IPv6Address):
-                    item = IPv6Address(item)
-                items[i] = item.packed
-            except AddressValueError:
-                raise errors.CannotParseDomainError(
-                    "Cannot parse IPv6 '{}'".format(item)
-                )
+from ipaddress import IPv4Address, IPv6Address, AddressValueError
+
+from .. import errors
+from .exceptions import ColumnTypeMismatchException
+from .stringcolumn import ByteFixedString
+from .intcolumn import UInt32Column
+
+
+class IPv4Column(UInt32Column):
+    ch_type = "IPv4"
+    py_types = (str, IPv4Address, int)
+
+    def __init__(self, types_check=False, **kwargs):
+        # UIntColumn overrides before_write_item and check_item
+        # in its __init__ when types_check is True so we force
+        # __init__ without it then add the appropriate check method for IPv4
+        super(UInt32Column, self).__init__(types_check=False, **kwargs)
+
+        self.types_check_enabled = types_check
+        if types_check:
+
+            def check_item(value):
+                if isinstance(value, int) and value < 0:
+                    raise ColumnTypeMismatchException(value)
+
+                if not isinstance(value, IPv4Address):
+                    try:
+                        value = IPv4Address(value)
+                    except AddressValueError:
+                        # Cannot parse input in a valid IPv4
+                        raise ColumnTypeMismatchException(value)
+
+            self.check_item = check_item
+
+    def after_read_items(self, items, nulls_map=None):
+        if nulls_map is None:
+            return tuple(IPv4Address(item) for item in items)
+        else:
+            return tuple(
+                (None if is_null else IPv4Address(items[i]))
+                for i, is_null in enumerate(nulls_map)
+            )
+
+    def before_write_items(self, items, nulls_map=None):
+        null_value = self.null_value
+
+        for i, item in enumerate(items):
+            if nulls_map and nulls_map[i]:
+                items[i] = null_value
+                continue
+
+            # allow Ipv4 in integer, string or IPv4Address object
+            try:
+                if isinstance(item, int):
+                    continue
+
+                if not isinstance(item, IPv4Address):
+                    item = IPv4Address(item)
+
+                items[i] = int(item)
+            except AddressValueError:
+                raise errors.CannotParseDomainError(
+                    "Cannot parse IPv4 '{}'".format(item)
+                )
+
+
+class IPv6Column(ByteFixedString):
+    ch_type = "IPv6"
+    py_types = (str, IPv6Address, bytes)
+
+    def __init__(self, types_check=False, **kwargs):
+        super(IPv6Column, self).__init__(16, types_check=types_check, **kwargs)
+
+        if types_check:
+
+            def check_item(value):
+                if isinstance(value, bytes) and len(value) != 16:
+                    raise ColumnTypeMismatchException(value)
+
+                if not isinstance(value, IPv6Address):
+                    try:
+                        value = IPv6Address(value)
+                    except AddressValueError:
+                        # Cannot parse input in a valid IPv6
+                        raise ColumnTypeMismatchException(value)
+
+            self.check_item = check_item
+
+    def after_read_items(self, items, nulls_map=None):
+        if nulls_map is None:
+            return tuple(IPv6Address(item) for item in items)
+        else:
+            return tuple(
+                (None if is_null else IPv6Address(items[i]))
+                for i, is_null in enumerate(nulls_map)
+            )
+
+    def before_write_items(self, items, nulls_map=None):
+        null_value = self.null_value
+
+        for i, item in enumerate(items):
+            if nulls_map and nulls_map[i]:
+                items[i] = null_value
+                continue
+
+            # allow Ipv6 in bytes or python IPv6Address
+            # this is raw bytes (not encoded) in order to fit FixedString(16)
+            try:
+                if isinstance(item, bytes):
+                    continue
+
+                if not isinstance(item, IPv6Address):
+                    item = IPv6Address(item)
+                items[i] = item.packed
+            except AddressValueError:
+                raise errors.CannotParseDomainError(
+                    "Cannot parse IPv6 '{}'".format(item)
+                )
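The IPv4Column/IPv6Column code above normalizes writes (accepting str, int, bytes or ipaddress objects) and always returns ipaddress objects on reads. A small usage sketch, assuming a local ClickHouse server and a throwaway table name:

```python
from ipaddress import IPv4Address
from clickhouse_driver import Client

client = Client('localhost')  # assumed local server
client.execute('CREATE TABLE IF NOT EXISTS ips (addr IPv4) ENGINE = Memory')

# Per before_write_items above, values may be given as a string, an int
# or an IPv4Address; each is normalized to a UInt32 before writing.
client.execute(
    'INSERT INTO ips (addr) VALUES',
    [('10.0.0.1',), (IPv4Address('192.168.0.1'),), (3232235522,)],
)

# after_read_items converts each stored UInt32 back into an IPv4Address.
rows = client.execute('SELECT addr FROM ips')
```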
clickhouse_driver/columns/jsoncolumn.py
@@ -0,0 +1,37 @@
+from .base import Column
+from .stringcolumn import String
+from ..reader import read_binary_uint8, read_binary_str
+from ..util.compat import json
+from ..writer import write_binary_uint8
+
+
+class JsonColumn(Column):
+    py_types = (dict, )
+
+    # No NULL value actually
+    null_value = {}
+
+    def __init__(self, column_by_spec_getter, **kwargs):
+        self.column_by_spec_getter = column_by_spec_getter
+        self.string_column = String(**kwargs)
+        super(JsonColumn, self).__init__(**kwargs)
+
+    def write_state_prefix(self, buf):
+        # Read in binary format.
+        # Write in text format.
+        write_binary_uint8(1, buf)
+
+    def read_items(self, n_items, buf):
+        read_binary_uint8(buf)
+        spec = read_binary_str(buf)
+        col = self.column_by_spec_getter(spec)
+        col.read_state_prefix(buf)
+        return col.read_data(n_items, buf)
+
+    def write_items(self, items, buf):
+        items = [x if isinstance(x, str) else json.dumps(x) for x in items]
+        self.string_column.write_items(items, buf)
+
+
+def create_json_column(spec, column_by_spec_getter, column_options):
+    return JsonColumn(column_by_spec_getter, **column_options)
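jsoncolumn.py is new in 0.2.8: dicts are JSON-encoded and sent as a String column, while reads delegate to whatever column spec the server announces. A rough usage sketch follows; the JSON column type is an experimental ClickHouse feature, and the table and setting names here are assumptions rather than part of this diff:

```python
from clickhouse_driver import Client

# allow_experimental_object_type is the ClickHouse server flag that enables
# the Object('json') / JSON type; it is not defined anywhere in this package.
client = Client('localhost', settings={'allow_experimental_object_type': 1})

client.execute('CREATE TABLE IF NOT EXISTS events (data JSON) ENGINE = Memory')

# write_items() above JSON-encodes dicts (or passes through ready-made
# strings) and ships them as a String column.
client.execute('INSERT INTO events (data) VALUES', [({'key': 'a', 'value': 1},)])

rows = client.execute('SELECT data FROM events')
```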
clickhouse_driver/columns/largeint.cp39-win_amd64.pyd
Binary file
clickhouse_driver/columns/lowcardinalitycolumn.py
@@ -1,123 +1,142 @@
-from math import log
-
-from ..reader import read_binary_uint64
-from ..writer import write_binary_int64
-from .base import Column
-from .intcolumn import UInt8Column, UInt16Column, UInt32Column, UInt64Column
-
-
-def create_low_cardinality_column(spec, column_by_spec_getter):
-    inner = spec[15:-1]
-    nested = column_by_spec_getter(inner)
-    return LowCardinalityColumn(nested)
-
-
-class LowCardinalityColumn(Column):
-    """
-    Stores column as index (unique elements) and keys.
-    Good for de-duplication of large values with low cardinality.
-    """
-    int_types = {
-        0: UInt8Column,
-        1: UInt16Column,
-        2: UInt32Column,
-        3: UInt64Column
-    }
-
-    # Need to read additional keys.
-    # Additional keys are stored before indexes as value N and N keys
-    # after them.
-    has_additional_keys_bit = 1 << 9
-    # Need to update dictionary.
-    # It means that previous granule has different dictionary.
-    need_update_dictionary = 1 << 10
-
-    serialization_type = has_additional_keys_bit | need_update_dictionary
-
-    def __init__(self, nested_column, **kwargs):
+from math import log
+
+from ..reader import read_binary_uint64
+from ..writer import write_binary_int64
+from .base import Column
+from .intcolumn import UInt8Column, UInt16Column, UInt32Column, UInt64Column
+
+
+def create_low_cardinality_column(spec, column_by_spec_getter, column_options):
+    inner = spec[15:-1]
+    nested = column_by_spec_getter(inner)
+    return LowCardinalityColumn(nested, **column_options)
+
+
+class LowCardinalityColumn(Column):
+    """
+    Stores column as index (unique elements) and keys.
+    Good for de-duplication of large values with low cardinality.
+    """
+    int_types = {
+        0: UInt8Column,
+        1: UInt16Column,
+        2: UInt32Column,
+        3: UInt64Column
+    }
+
+    # Need to read additional keys.
+    # Additional keys are stored before indexes as value N and N keys
+    # after them.
+    has_additional_keys_bit = 1 << 9
+    # Need to update dictionary.
+    # It means that previous granule has different dictionary.
+    need_update_dictionary = 1 << 10
+
+    serialization_type = has_additional_keys_bit | need_update_dictionary
+
+    def __init__(self, nested_column, **kwargs):
+        self.init_kwargs = kwargs
+        self.nested_column = nested_column
+        super(LowCardinalityColumn, self).__init__(**kwargs)
+
+    def read_state_prefix(self, buf):
+        super(LowCardinalityColumn, self).read_state_prefix(buf)
+
+        read_binary_uint64(buf)
+
+    def write_state_prefix(self, buf):
+        super(LowCardinalityColumn, self).write_state_prefix(buf)
+
+        # KeysSerializationVersion. See ClickHouse docs.
+        write_binary_int64(1, buf)
+
+    def _write_data(self, items, buf):
+        index, keys = [], []
+        key_by_index_element = {}
+        nested_is_nullable = False
+
+        if self.nested_column.nullable:
+            # First element represents NULL if column is nullable.
+            index.append(self.nested_column.null_value)
+            # Prevent null map writing. Reset nested column nullable flag.
+            self.nested_column.nullable = False
+            nested_is_nullable = True
+
+            for x in items:
+                if x is None:
+                    # Zero element for null.
+                    keys.append(0)
+
+                else:
+                    key = key_by_index_element.get(x)
+                    # Get key from index or add it to index.
+                    if key is None:
+                        key = len(key_by_index_element)
+                        key_by_index_element[x] = key
+                        index.append(x)
+
+                    keys.append(key + 1)
+        else:
+            for x in items:
+                key = key_by_index_element.get(x)
+
+                # Get key from index or add it to index.
+                if key is None:
+                    key = len(key_by_index_element)
+                    key_by_index_element[x] = len(key_by_index_element)
+                    index.append(x)
+
+                keys.append(key)
+
+        # Do not write anything for empty column.
+        # May happen while writing empty arrays.
+        if not len(index):
+            return
+
+        int_type = int(log(len(index), 2) / 8)
+        int_column = self.int_types[int_type](**self.init_kwargs)
+
+        serialization_type = self.serialization_type | int_type
+
+        write_binary_int64(serialization_type, buf)
+        write_binary_int64(len(index), buf)
+
+        if nested_is_nullable:
+            # Given we reset nested column nullable flag above,
+            # we need to write null map manually. If to invoke
+            # write_data method, it will cause an exception,
+            # because `prepare_data` may not be able to handle
+            # null value correctly.
+            self.nested_column.write_items(
+                [self.nested_column.null_value], buf)
+            # Remove null map from index, because it is already written.
+            index_to_write = index[1:]
+            self.nested_column.write_data(index_to_write, buf)
+        else:
+            self.nested_column.write_data(index, buf)
+        write_binary_int64(len(items), buf)
+        int_column.write_items(keys, buf)
+
+    def _read_data(self, n_items, buf, nulls_map=None):
+        if not n_items:
+            return tuple()
+
+        serialization_type = read_binary_uint64(buf)
+
+        # Lowest byte contains info about key type.
+        key_type = serialization_type & 0xf
+        keys_column = self.int_types[key_type](**self.init_kwargs)
+
+        nullable = self.nested_column.nullable
+        # Prevent null map reading. Reset nested column nullable flag.
+        self.nested_column.nullable = False
+
+        index_size = read_binary_uint64(buf)
+        index = self.nested_column.read_data(index_size, buf)
+        if nullable:
+            index = (None, ) + index[1:]
+
+        read_binary_uint64(buf)  # number of keys
+        keys = keys_column.read_data(n_items, buf)
+
+        return tuple(index[x] for x in keys)
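The key-width selection in _write_data above is just a base-2 logarithm of the dictionary size. The following standalone sketch (plain Python, no server required) reproduces that calculation to show which integer column the driver would pick for a given number of distinct values; the sizes chosen here are arbitrary examples:

```python
from math import log

# Mirrors LowCardinalityColumn.int_types and the expression
# int(log(len(index), 2) / 8) from _write_data above.
INT_TYPES = {0: 'UInt8', 1: 'UInt16', 2: 'UInt32', 3: 'UInt64'}

for index_size in (2, 255, 256, 65_536, 1_000_000):
    int_type = int(log(index_size, 2) / 8)
    print(f'{index_size} distinct values -> keys written as {INT_TYPES[int_type]}')
```

The keys then index into the deduplicated dictionary that is written first, which is what makes LowCardinality cheap for columns of long, frequently repeated values.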