vastdb 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/internal_commands.py +25 -70
- vastdb/session.py +2 -2
- vastdb/table.py +31 -3
- vastdb/tests/test_nested.py +75 -1
- vastdb/tests/test_tables.py +81 -78
- vastdb/transaction.py +1 -1
- {vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/METADATA +1 -1
- {vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/RECORD +11 -11
- {vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/LICENSE +0 -0
- {vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/WHEEL +0 -0
- {vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/top_level.txt +0 -0
vastdb/internal_commands.py
CHANGED
```diff
@@ -130,46 +130,13 @@ def get_unit_to_flatbuff_time_unit(type):
 class Predicate:
     def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
         self.schema = schema
+        index = itertools.count()  # used to generate leaf column positions for VAST QueryData RPC
+        # Arrow schema contains the top-level columns, where each column may include multiple subfields
+        # DFS is used to enumerate all the sub-columns, using `index` as an ID allocator
+        nodes = [FieldNode(field, index) for field in schema]
+        self.nodes_map = {node.field.name: node for node in nodes}
         self.expr = expr
 
-    def get_field_indexes(self, field: 'pa.Field', field_name_per_index: list) -> None:
-        field_name_per_index.append(field.name)
-
-        if isinstance(field.type, pa.StructType):
-            flat_fields = field.flatten()
-        elif isinstance(field.type, pa.MapType):
-            flat_fields = [pa.field(f'{field.name}.entries', pa.struct([field.type.key_field, field.type.item_field]))]
-        elif isinstance(field.type, pa.ListType):
-            flat_fields = [pa.field(f'{field.name}.{field.type.value_field.name}', field.type.value_field.type)]
-        else:
-            return
-
-        for flat_field in flat_fields:
-            self.get_field_indexes(flat_field, field_name_per_index)
-
-    @property
-    def field_name_per_index(self):
-        if self._field_name_per_index is None:
-            _field_name_per_index = []
-            for field in self.schema:
-                self.get_field_indexes(field, _field_name_per_index)
-            self._field_name_per_index = {field: index for index, field in enumerate(_field_name_per_index)}
-        return self._field_name_per_index
-
-    def get_projections(self, builder: 'flatbuffers.builder.Builder', field_names: Optional[List[str]] = None):
-        if field_names is None:
-            field_names = self.field_name_per_index.keys()
-        projection_fields = []
-        for field_name in field_names:
-            fb_field_index.Start(builder)
-            fb_field_index.AddPosition(builder, self.field_name_per_index[field_name])
-            offset = fb_field_index.End(builder)
-            projection_fields.append(offset)
-        fb_source.StartProjectionVector(builder, len(projection_fields))
-        for offset in reversed(projection_fields):
-            builder.PrependUOffsetTRelative(offset)
-        return builder.EndVector()
-
     def serialize(self, builder: 'flatbuffers.builder.Builder'):
         from ibis.expr.operations.generic import (
             IsNull,
```
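The new constructor enumerates every sub-column up front via DFS instead of flattening names on demand. `FieldNode` itself is defined elsewhere in internal_commands.py and is not part of this diff; the sketch below is a hypothetical reconstruction of such an index allocator, shown only to illustrate the enumeration order (the real allocation scheme may differ):

```python
import itertools

import pyarrow as pa


class FieldNode:
    """Hypothetical sketch of a DFS index allocator (the real class is not shown in this diff)."""

    def __init__(self, field: pa.Field, index):
        self.field = field
        self.index = next(index)  # position allocated in DFS pre-order
        if isinstance(field.type, pa.StructType):
            children = list(field.type)  # a StructType iterates over its child fields
        elif isinstance(field.type, pa.MapType):
            children = [pa.field('entries', pa.struct([field.type.key_field, field.type.item_field]))]
        elif isinstance(field.type, pa.ListType):
            children = [field.type.value_field]
        else:
            children = []  # leaf column
        self.children = [FieldNode(child, index) for child in children]


index = itertools.count()
schema = pa.schema([('x', pa.int64()),
                    ('s', pa.struct([('a', pa.int16()), ('b', pa.int32())]))])
nodes = [FieldNode(field, index) for field in schema]
# DFS order: 'x' -> 0, 's' -> 1, 's.a' -> 2, 's.b' -> 3
```

With a map like this, `serialize()` can reject pushdown on any column that has children and use `node.index` as the flatbuffer column position for plain leaf columns, as the next hunks show.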
```diff
@@ -204,8 +171,6 @@ class Predicate:
             Between: self.build_between,
         }
 
-        positions_map = dict((f.name, index) for index, f in enumerate(self.schema))  # TODO: BFS
-
         self.builder = builder
 
         offsets = []
@@ -261,7 +226,11 @@
             elif prev_field_name != field_name:
                 raise NotImplementedError(self.expr)
 
-
+            node = self.nodes_map[field_name]
+            # TODO: support predicate pushdown for leaf nodes (ORION-160338)
+            if node.children:
+                raise NotImplementedError(node.field)  # no predicate pushdown for nested columns
+            column_offset = self.build_column(position=node.index)
             field = self.schema.field(field_name)
             for literal in literals:
                 args_offsets = [column_offset]
```
```diff
@@ -839,12 +808,13 @@ class VastdbApi:
         return prefix
 
     def _fill_common_headers(self, txid=0, client_tags=[], version_id=1):
-        common_headers = {
-
-
-
+        common_headers = {
+            'tabular-txid': str(txid),
+            'tabular-api-version-id': str(version_id),
+            'tabular-client-name': 'tabular-api'
+        }
 
-        return common_headers
+        return common_headers | {f'tabular-client-tags-{index}': tag for index, tag in enumerate(client_tags)}
 
     def _check_res(self, res, cmd="", expected_retvals=[]):
         if exc := errors.from_response(res):
```
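The rewritten `_fill_common_headers` merges the fixed headers with one numbered `tabular-client-tags-N` header per tag, using the dict-union operator (Python 3.9+). A minimal illustration of just the merge semantics, with made-up values:

```python
common_headers = {
    'tabular-txid': '42',
    'tabular-api-version-id': '1',
    'tabular-client-name': 'tabular-api',
}
client_tags = ['ci', 'nightly']

# dict | dict returns a new dict; keys from the right-hand side win on conflict
headers = common_headers | {f'tabular-client-tags-{index}': tag
                            for index, tag in enumerate(client_tags)}
assert headers['tabular-client-tags-0'] == 'ci'
assert headers['tabular-client-tags-1'] == 'nightly'
```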
```diff
@@ -952,8 +922,7 @@ class VastdbApi:
         res_headers = res.headers
         next_key = int(res_headers['tabular-next-key'])
         is_truncated = res_headers['tabular-is-truncated'] == 'true'
-
-        lists = list_schemas.GetRootAs(flatbuf)
+        lists = list_schemas.GetRootAs(res.content)
         bucket_name = lists.BucketName().decode()
         if not bucket.startswith(bucket_name):
             raise ValueError(f'bucket: {bucket} did not start from {bucket_name}')
@@ -976,8 +945,7 @@ class VastdbApi:
         res = self.session.get(self._api_prefix(bucket=bucket, command="list", url_params=url_params), headers={}, stream=True)
         self._check_res(res, "list_snapshots")
 
-
-        xml_str = out.decode()
+        xml_str = res.content.decode()
         xml_dict = xmltodict.parse(xml_str)
         list_res = xml_dict['ListBucketResult']
         is_truncated = list_res['IsTruncated'] == 'true'
@@ -1059,8 +1027,7 @@ class VastdbApi:
         res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=name, command="stats", url_params=url_params), headers=headers)
         self._check_res(res, "get_table_stats", expected_retvals)
 
-
-        stats = get_table_stats.GetRootAs(flatbuf)
+        stats = get_table_stats.GetRootAs(res.content)
         num_rows = stats.NumRows()
         size_in_bytes = stats.SizeInBytes()
         is_external_rowid_alloc = stats.IsExternalRowidAlloc()
@@ -1159,8 +1126,7 @@ class VastdbApi:
         res_headers = res.headers
         next_key = int(res_headers['tabular-next-key'])
         is_truncated = res_headers['tabular-is-truncated'] == 'true'
-
-        lists = list_tables.GetRootAs(flatbuf)
+        lists = list_tables.GetRootAs(res.content)
         bucket_name = lists.BucketName().decode()
         schema_name = lists.SchemaName().decode()
         if not bucket.startswith(bucket_name):  # ignore snapshot name
@@ -1288,11 +1254,7 @@ class VastdbApi:
         next_key = int(res_headers['tabular-next-key'])
         is_truncated = res_headers['tabular-is-truncated'] == 'true'
         count = int(res_headers['tabular-list-count'])
-        columns = []
-        if not count_only:
-            schema_buf = b''.join(res.iter_content(chunk_size=128))
-            schema_out = pa.ipc.open_stream(schema_buf).schema
-            columns = schema_out
+        columns = [] if count_only else pa.ipc.open_stream(res.content).schema
 
         return columns, next_key, is_truncated, count
 
@@ -1692,8 +1654,7 @@ class VastdbApi:
         res = self.session.get(self._api_prefix(bucket=bucket, schema=schema, table=table, command="projection-stats", url_params=url_params),
                                headers=headers)
         if res.status_code == 200:
-
-            stats = get_projection_table_stats.GetRootAs(flatbuf)
+            stats = get_projection_table_stats.GetRootAs(res.content)
             num_rows = stats.NumRows()
             size_in_bytes = stats.SizeInBytes()
             dirty_blocks_percentage = stats.DirtyBlocksPercentage()
@@ -1779,8 +1740,7 @@ class VastdbApi:
         next_key = int(res_headers['tabular-next-key'])
         is_truncated = res_headers['tabular-is-truncated'] == 'true'
         count = int(res_headers['tabular-list-count'])
-
-        lists = list_projections.GetRootAs(flatbuf)
+        lists = list_projections.GetRootAs(res.content)
         bucket_name = lists.BucketName().decode()
         schema_name = lists.SchemaName().decode()
         table_name = lists.TableName().decode()
@@ -1827,13 +1787,8 @@ class VastdbApi:
         next_key = int(res_headers['tabular-next-key'])
         is_truncated = res_headers['tabular-is-truncated'] == 'true'
         count = int(res_headers['tabular-list-count'])
-        columns = []
-
-        schema_buf = b''.join(res.iter_content(chunk_size=128))
-        schema_out = pa.ipc.open_stream(schema_buf).schema
-        for f in schema_out:
-            columns.append([f.name, f.type, f.metadata])
-            # sort_type = f.metadata[b'VAST:sort_type'].decode()
+        columns = [] if count_only else [[f.name, f.type, f.metadata] for f in
+                                         pa.ipc.open_stream(res.content).schema]
 
         return columns, next_key, is_truncated, count
```
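Several of the hunks above replace chunked `res.iter_content()` accumulation with a direct parse of the buffered `res.content`. `pa.ipc.open_stream` accepts any bytes-like source, so the one-liner form is equivalent; a self-contained round-trip (no HTTP involved) for reference:

```python
import io

import pyarrow as pa

schema = pa.schema([('name', pa.utf8()), ('size', pa.int64())])

# Write an empty Arrow IPC stream, standing in for an HTTP response body.
sink = io.BytesIO()
with pa.ipc.new_stream(sink, schema):
    pass
buf = sink.getvalue()

# open_stream works on the raw bytes and exposes the embedded schema.
assert pa.ipc.open_stream(buf).schema == schema
```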
vastdb/session.py
CHANGED
```diff
@@ -35,7 +35,7 @@ class Features:
 class Session:
     """VAST database session."""
 
-    def __init__(self, access=None, secret=None, endpoint=None):
+    def __init__(self, access=None, secret=None, endpoint=None, ssl_verify=True):
         """Connect to a VAST Database endpoint, using specified credentials."""
         if access is None:
             access = os.environ['AWS_ACCESS_KEY_ID']
@@ -44,7 +44,7 @@ class Session:
         if endpoint is None:
             endpoint = os.environ['AWS_S3_ENDPOINT_URL']
 
-        self.api = internal_commands.VastdbApi(endpoint, access, secret)
+        self.api = internal_commands.VastdbApi(endpoint, access, secret, ssl_verify=ssl_verify)
         version_tuple = tuple(int(part) for part in self.api.vast_version.split('.'))
         self.features = Features(version_tuple)
         self.s3 = boto3.client('s3',
```
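The new `ssl_verify` flag is passed through to `VastdbApi`, making it possible to connect to endpoints with self-signed certificates. Hypothetical usage; the endpoint and credentials below are placeholders:

```python
from vastdb.session import Session

# WARNING: disabling certificate verification is only reasonable for trusted test clusters.
session = Session(
    access='my-access-key',   # placeholder; defaults to $AWS_ACCESS_KEY_ID when omitted
    secret='my-secret-key',   # placeholder
    endpoint='https://my-vast-endpoint.local',
    ssl_verify=False,
)
```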
vastdb/table.py
CHANGED
```diff
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import backoff
 import ibis
 import pyarrow as pa
+import requests
 
 from . import errors, internal_commands, schema, util
 
@@ -39,20 +40,44 @@ class TableStats:
     endpoints: Tuple[str, ...] = ()
 
 
+RETRIABLE_ERRORS = (
+    errors.Slowdown,
+    requests.exceptions.ConnectionError,
+)
+
+
 @dataclass
 class QueryConfig:
     """Query execution configuration."""
 
+    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
     num_sub_splits: int = 4
+
+    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
     num_splits: int = 1
+
+    # each endpoint will be handled by a separate worker thread
+    # a single endpoint can be specified more than once to benefit from multithreaded execution
     data_endpoints: Optional[List[str]] = None
+
+    # a subsplit fiber will finish after sending this number of rows back to the client
    limit_rows_per_sub_split: int = 128 * 1024
+
+    # each fiber will read the following number of rowgroups continuously before skipping
+    # in order to use semi-sorted projections this value must be 8
    num_row_groups_per_sub_split: int = 8
+
+    # can be disabled for benchmarking purposes
    use_semi_sorted_projections: bool = True
+
+    # used to estimate the number of splits, given the table rows' count
    rows_per_split: int = 4000000
+
+    # used for worker threads' naming
    query_id: str = ""
-
-
+
+    # allows retrying QueryData when the server is overloaded
+    backoff_func: Any = field(default=backoff.on_exception(backoff.expo, RETRIABLE_ERRORS, max_tries=10))
 
 
 @dataclass
```
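The new comments document each knob, and `backoff_func` turns the QueryData retry policy into configuration. A sketch of overriding the defaults, assuming the same `backoff` library; the tuning values below are arbitrary:

```python
import backoff

from vastdb.table import QueryConfig, RETRIABLE_ERRORS

config = QueryConfig(
    num_splits=8,                       # split the table into 8 disjoint row subsets
    num_sub_splits=4,                   # server-side parallelism within each RPC
    use_semi_sorted_projections=False,  # e.g. when benchmarking full scans
    # retry on Slowdown/ConnectionError for up to 60 seconds instead of 10 tries
    backoff_func=backoff.on_exception(backoff.expo, RETRIABLE_ERRORS, max_time=60),
)
```

The config would then be passed to a query as `table.select(config=config)`.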
```diff
@@ -271,7 +296,7 @@ class Table:
         return TableStats(**stats_tuple._asdict())
 
     def select(self, columns: Optional[List[str]] = None,
-               predicate: ibis.expr.types.BooleanColumn = None,
+               predicate: Union[ibis.expr.types.BooleanColumn, ibis.common.deferred.Deferred] = None,
                config: Optional[QueryConfig] = None,
                *,
                internal_row_id: bool = False) -> pa.RecordBatchReader:
@@ -310,6 +335,9 @@ class Table:
             response_schema = internal_commands.get_response_schema(schema=query_schema, field_names=columns)
             return pa.RecordBatchReader.from_batches(response_schema, [])
 
+        if isinstance(predicate, ibis.common.deferred.Deferred):
+            predicate = predicate.resolve(self._ibis_table)  # may raise if the predicate is invalid (e.g. wrong types / missing column)
+
         query_data_request = internal_commands.build_query_data_request(
             schema=query_schema,
             predicate=predicate,
```
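`select()` now accepts either a bound ibis expression (`t['a'] > 2`) or a deferred one built from `ibis._`, which is resolved against the table's own ibis schema before the QueryData request is built. A sketch of the deferred spelling; the bucket/schema/table names are placeholders:

```python
import ibis


def read_filtered(tx):
    # hypothetical names; tx is assumed to be an open vastdb transaction
    t = tx.bucket('my-bucket').schema('my-schema').table('my-table')

    # deferred predicate: resolved against the table's schema inside select()
    reader = t.select(columns=['a', 'b'], predicate=(ibis._['a'] > 2))
    return reader.read_all()  # drain the RecordBatchReader into a pyarrow Table
```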
vastdb/tests/test_nested.py
CHANGED
```diff
@@ -1,11 +1,15 @@
+import functools
 import itertools
+import operator
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
 
 from .util import prepare_data
 
 
-def test_nested(session, clean_bucket_name):
+def test_nested_select(session, clean_bucket_name):
     columns = pa.schema([
         ('l', pa.list_(pa.int8())),
         ('m', pa.map_(pa.utf8(), pa.float64())),
@@ -26,3 +30,73 @@ def test_nested(session, clean_bucket_name):
         for cols in itertools.permutations(names, n):
             actual = pa.Table.from_batches(t.select(columns=cols))
             assert actual == expected.select(cols)
+
+
+def test_nested_filter(session, clean_bucket_name):
+    columns = pa.schema([
+        ('x', pa.int64()),
+        ('l', pa.list_(pa.int8())),
+        ('y', pa.int64()),
+        ('m', pa.map_(pa.utf8(), pa.float64())),
+        ('z', pa.int64()),
+        ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
+        ('w', pa.int64()),
+    ])
+    expected = pa.table(schema=columns, data=[
+        [1, 2, 3, None],
+        [[1], [], [2, 3], None],
+        [1, 2, None, 3],
+        [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
+        [1, None, 2, 3],
+        [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
+        [None, 1, 2, 3],
+    ])
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        actual = pa.Table.from_batches(t.select())
+        assert actual == expected
+
+        names = list('xyzw')
+        for n in range(1, len(names) + 1):
+            for cols in itertools.permutations(names, n):
+                ibis_predicate = functools.reduce(
+                    operator.and_,
+                    (t[col] > 2 for col in cols))
+                actual = pa.Table.from_batches(t.select(predicate=ibis_predicate), t.arrow_schema)
+
+                arrow_predicate = functools.reduce(
+                    operator.and_,
+                    (pc.field(col) > 2 for col in cols))
+                assert actual == expected.filter(arrow_predicate)
+
+
+def test_nested_unsupported_filter(session, clean_bucket_name):
+    columns = pa.schema([
+        ('x', pa.int64()),
+        ('l', pa.list_(pa.int8())),
+        ('y', pa.int64()),
+        ('m', pa.map_(pa.utf8(), pa.float64())),
+        ('z', pa.int64()),
+        ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
+        ('w', pa.int64()),
+    ])
+    expected = pa.table(schema=columns, data=[
+        [1, 2, 3, None],
+        [[1], [], [2, 3], None],
+        [1, 2, None, 3],
+        [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
+        [1, None, 2, 3],
+        [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
+        [None, 1, 2, 3],
+    ])
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+
+        with pytest.raises(NotImplementedError):
+            list(t.select(predicate=(t['l'].isnull())))
+
+        with pytest.raises(NotImplementedError):
+            list(t.select(predicate=(t['m'].isnull())))
+
+        with pytest.raises(NotImplementedError):
+            list(t.select(predicate=(t['s'].isnull())))
```
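The new filter test builds an N-way conjunction identically for ibis and pyarrow: `functools.reduce(operator.and_, ...)` folds `&` over one comparison per column. The pyarrow half of the pattern runs standalone:

```python
import functools
import operator

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({'x': [1, 2, 3, None], 'y': [3, 2, 1, 0]})

# Fold `&` over per-column comparisons: (x > 1) & (y > 1)
predicate = functools.reduce(operator.and_, (pc.field(c) > 1 for c in ['x', 'y']))
assert table.filter(predicate).num_rows == 1  # only the row x=2, y=2 passes
```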
vastdb/tests/test_tables.py
CHANGED
```diff
@@ -7,6 +7,7 @@ import time
 from contextlib import closing
 from tempfile import NamedTemporaryFile
 
+import ibis
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
@@ -215,46 +216,47 @@ def test_types(session, clean_bucket_name):
         [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
     ])
 
-    with prepare_data(session, clean_bucket_name, 's', 't', expected) as
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as table:
         def select(predicate):
-            return pa.Table.from_batches(
+            return pa.Table.from_batches(table.select(predicate=predicate))
 
         assert select(None) == expected
-
-
-
-
-
-
-
-
+        for t in [table, ibis._]:
+            assert select(t['tb'] == False) == expected.filter(pc.field('tb') == False)  # noqa: E712
+            assert select(t['a1'] == 2) == expected.filter(pc.field('a1') == 2)
+            assert select(t['a2'] == 2000) == expected.filter(pc.field('a2') == 2000)
+            assert select(t['a4'] == 222111122) == expected.filter(pc.field('a4') == 222111122)
+            assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
+            assert select(t['s'] == "v") == expected.filter(pc.field('s') == "v")
+            assert select(t['d'] == 231.15) == expected.filter(pc.field('d') == 231.15)
+            assert select(t['bin'] == b"\x01\x02") == expected.filter(pc.field('bin') == b"\x01\x02")
 
-
-
+            date_literal = dt.date(2024, 4, 10)
+            assert select(t['date'] == date_literal) == expected.filter(pc.field('date') == date_literal)
 
-
-
+            time_literal = dt.time(12, 34, 56)
+            assert select(t['t0'] == time_literal) == expected.filter(pc.field('t0') == time_literal)
 
-
-
+            time_literal = dt.time(12, 34, 56, 789000)
+            assert select(t['t3'] == time_literal) == expected.filter(pc.field('t3') == time_literal)
 
-
-
+            time_literal = dt.time(12, 34, 56, 789789)
+            assert select(t['t6'] == time_literal) == expected.filter(pc.field('t6') == time_literal)
 
-
-
+            time_literal = dt.time(12, 34, 56, 789789)
+            assert select(t['t9'] == time_literal) == expected.filter(pc.field('t9') == time_literal)
 
-
-
+            ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56)
+            assert select(t['ts0'] == ts_literal) == expected.filter(pc.field('ts0') == ts_literal)
 
-
-
+            ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789000)
+            assert select(t['ts3'] == ts_literal) == expected.filter(pc.field('ts3') == ts_literal)
 
-
-
+            ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+            assert select(t['ts6'] == ts_literal) == expected.filter(pc.field('ts6') == ts_literal)
 
-
-
+            ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+            assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)
 
 
 def test_filters(session, clean_bucket_name):
@@ -270,62 +272,63 @@ def test_filters(session, clean_bucket_name):
         ['a', 'bb', 'ccc', None, 'xyz'],
     ])
 
-    with prepare_data(session, clean_bucket_name, 's', 't', expected) as
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as table:
         def select(predicate):
-            return pa.Table.from_batches(
+            return pa.Table.from_batches(table.select(predicate=predicate), table.arrow_schema)
 
         assert select(None) == expected
         assert select(True) == expected
         assert select(False) == pa.Table.from_batches([], schema=columns)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        for t in [table, ibis._]:
+            assert select(t['a'].between(222, 444)) == expected.filter((pc.field('a') >= 222) & (pc.field('a') <= 444))
+            assert select((t['a'].between(222, 444)) & (t['b'] > 2.5)) == expected.filter((pc.field('a') >= 222) & (pc.field('a') <= 444) & (pc.field('b') > 2.5))
+
+            assert select(t['a'] > 222) == expected.filter(pc.field('a') > 222)
+            assert select(t['a'] < 222) == expected.filter(pc.field('a') < 222)
+            assert select(t['a'] == 222) == expected.filter(pc.field('a') == 222)
+            assert select(t['a'] != 222) == expected.filter(pc.field('a') != 222)
+            assert select(t['a'] <= 222) == expected.filter(pc.field('a') <= 222)
+            assert select(t['a'] >= 222) == expected.filter(pc.field('a') >= 222)
+
+            assert select(t['b'] > 1.5) == expected.filter(pc.field('b') > 1.5)
+            assert select(t['b'] < 1.5) == expected.filter(pc.field('b') < 1.5)
+            assert select(t['b'] == 1.5) == expected.filter(pc.field('b') == 1.5)
+            assert select(t['b'] != 1.5) == expected.filter(pc.field('b') != 1.5)
+            assert select(t['b'] <= 1.5) == expected.filter(pc.field('b') <= 1.5)
+            assert select(t['b'] >= 1.5) == expected.filter(pc.field('b') >= 1.5)
+
+            assert select(t['s'] > 'bb') == expected.filter(pc.field('s') > 'bb')
+            assert select(t['s'] < 'bb') == expected.filter(pc.field('s') < 'bb')
+            assert select(t['s'] == 'bb') == expected.filter(pc.field('s') == 'bb')
+            assert select(t['s'] != 'bb') == expected.filter(pc.field('s') != 'bb')
+            assert select(t['s'] <= 'bb') == expected.filter(pc.field('s') <= 'bb')
+            assert select(t['s'] >= 'bb') == expected.filter(pc.field('s') >= 'bb')
+
+            assert select((t['a'] > 111) & (t['b'] > 0) & (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) & (pc.field('b') > 0) & (pc.field('s') < 'ccc'))
+            assert select((t['a'] > 111) & (t['b'] < 2.5)) == expected.filter((pc.field('a') > 111) & (pc.field('b') < 2.5))
+            assert select((t['a'] > 111) & (t['a'] < 333)) == expected.filter((pc.field('a') > 111) & (pc.field('a') < 333))
+
+            assert select((t['a'] > 111) | (t['a'] < 333)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333))
+            assert select(((t['a'] > 111) | (t['a'] < 333)) & (t['b'] < 2.5)) == expected.filter(((pc.field('a') > 111) | (pc.field('a') < 333)) & (pc.field('b') < 2.5))
+            with pytest.raises(NotImplementedError):
+                assert select((t['a'] > 111) | (t['b'] > 0) | (t['s'] < 'ccc')) == expected.filter((pc.field('a') > 111) | (pc.field('b') > 0) | (pc.field('s') < 'ccc'))
+            assert select((t['a'] > 111) | (t['a'] < 333) | (t['a'] == 777)) == expected.filter((pc.field('a') > 111) | (pc.field('a') < 333) | (pc.field('a') == 777))
+
+            assert select(t['s'].isnull()) == expected.filter(pc.field('s').is_null())
+            assert select((t['s'].isnull()) | (t['s'] == 'bb')) == expected.filter((pc.field('s').is_null()) | (pc.field('s') == 'bb'))
+            assert select((t['s'].isnull()) & (t['b'] == 3.5)) == expected.filter((pc.field('s').is_null()) & (pc.field('b') == 3.5))
+
+            assert select(~t['s'].isnull()) == expected.filter(~pc.field('s').is_null())
+            assert select(t['s'].contains('b')) == expected.filter(pc.field('s') == 'bb')
+            assert select(t['s'].contains('y')) == expected.filter(pc.field('s') == 'xyz')
+
+            assert select(t['a'].isin([555])) == expected.filter(pc.field('a').isin([555]))
+            assert select(t['a'].isin([111, 222, 999])) == expected.filter(pc.field('a').isin([111, 222, 999]))
+            assert select((t['a'] == 111) | t['a'].isin([333, 444]) | (t['a'] > 600)) == expected.filter((pc.field('a') == 111) | pc.field('a').isin([333, 444]) | (pc.field('a') > 600))
+
+            with pytest.raises(NotImplementedError):
+                select(t['a'].isin([]))
 
 
 def test_parquet_export(session, clean_bucket_name):
```
vastdb/transaction.py
CHANGED
```diff
@@ -63,7 +63,7 @@ class Transaction:
         except botocore.exceptions.ClientError as e:
             log.warning("res: %s", e.response)
             if e.response['Error']['Code'] == '404':
-                raise errors.MissingBucket(name)
+                raise errors.MissingBucket(name) from e
             raise
         return bucket.Bucket(name, self)
 
```
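`raise ... from e` records the original botocore error as `__cause__`, so the 404 that triggered the `MissingBucket` stays visible in tracebacks. A minimal illustration of the chaining semantics, using stand-in exception types rather than the real ones:

```python
class MissingBucket(Exception):
    pass


def lookup(bucket_exists):
    try:
        if not bucket_exists:
            raise KeyError('404')  # stand-in for botocore's ClientError
    except KeyError as e:
        raise MissingBucket('my-bucket') from e


try:
    lookup(bucket_exists=False)
except MissingBucket as exc:
    assert isinstance(exc.__cause__, KeyError)  # the original error is chained
```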
{vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/RECORD
CHANGED
```diff
@@ -152,26 +152,26 @@ vastdb/__init__.py,sha256=cMJtZuJ0IL9aKyM3DUWqTCzuP1H1MXXVivKKE1-q0DY,292
 vastdb/bucket.py,sha256=4rPEm9qlPTg7ccWO6VGmd4LKb8w-BDhJYwzXGjn03sc,3566
 vastdb/conftest.py,sha256=pKpo_46Vq4QHzTDQAFxasrVhnZ2V2L-y6IMLxojxaFM,2132
 vastdb/errors.py,sha256=fj8IlPnGi1lbJWIl1-8MSjLavL9bYQ-YUoboWbXCo54,4047
-vastdb/internal_commands.py,sha256=
+vastdb/internal_commands.py,sha256=kIdkLHabW8r4-GSygGl1Gdrr4puxD79WPO8Jkx8aszg,98490
 vastdb/schema.py,sha256=ql4TPB1W_FQ_BHov3CKHI8JX3krXMlcKWz7dTrjpQ1w,3346
-vastdb/session.py,sha256=
-vastdb/table.py,sha256=
-vastdb/transaction.py,sha256=
+vastdb/session.py,sha256=UTaz1Fh3u71Bnay2r6IyCHNMDrAszbzjnwylPURzhsk,2603
+vastdb/table.py,sha256=1ikj6toITImFowI2WHiimmqSiObmTfAohCdWC89q71Y,30031
+vastdb/transaction.py,sha256=u4pJBLooZQ_YGjsRgEWVL6RPAlt3lgm5oOpPHzPcayM,2852
 vastdb/util.py,sha256=rs7nLL2Qz-OVEZDSVIqAvS-uETMq-zxQs5jBksB5-JA,4276
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/bench/test_perf.py,sha256=iHE3E60fvyU5SBDHPi4h03Dj6QcY6VI9l9mMhgNMtPc,1117
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/tests/test_duckdb.py,sha256=KDuv4PrjGEwChCGHG36xNT2JiFlBOt6K3DQ3L06Kq-A,1913
 vastdb/tests/test_imports.py,sha256=48kbJKsa_MrEXcBYQUbUDr1e9wzjG4FHQ7C3wUEQfXA,5705
-vastdb/tests/test_nested.py,sha256=
+vastdb/tests/test_nested.py,sha256=FHYMmaKYvqVh0NvsocUFLr2LDVlSfXZYgqUSopWOSM0,3512
 vastdb/tests/test_projections.py,sha256=_cDNfD5zTwbCXLk6uGpPUWGN0P-4HElu5OjubWu-Jg0,1255
 vastdb/tests/test_sanity.py,sha256=ixx0QPo73hLHjAa7bByFXjS1XST0WvmSwLEpgnHh_JY,2960
 vastdb/tests/test_schemas.py,sha256=qoHTLX51D-0S4bMxdCpRh9gaYQd-BkZdT_agGOwFwTM,1739
-vastdb/tests/test_tables.py,sha256=
+vastdb/tests/test_tables.py,sha256=Q3N5P-7mOPVcfAFEfpAzomqkyCJ5gKZmfE4SUW5jehk,27859
 vastdb/tests/test_util.py,sha256=owRAU3TCKMq-kz54NRdA5wX2O_bZIHqG5ucUR77jm5k,1046
 vastdb/tests/util.py,sha256=dpRJYbboDnlqL4qIdvScpp8--5fxRUBIcIYitrfcj9o,555
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
-vastdb-0.1.
+vastdb-0.1.5.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-0.1.5.dist-info/METADATA,sha256=NJzrnkyfPs4lliFamaEdJy2elLYLzYJtlCxEMRSiLtg,1350
+vastdb-0.1.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+vastdb-0.1.5.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-0.1.5.dist-info/RECORD,,
```
{vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/LICENSE
File without changes

{vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/WHEEL
File without changes

{vastdb-0.1.4.dist-info → vastdb-0.1.5.dist-info}/top_level.txt
File without changes