vastdb 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the changes between publicly released package versions, as they appear in their respective public registries, and is provided for informational purposes only.
- vastdb/bench/__init__.py +0 -0
- vastdb/bench/test_perf.py +29 -0
- vastdb/bucket.py +14 -6
- vastdb/{tests/conftest.py → conftest.py} +21 -7
- vastdb/errors.py +20 -3
- vastdb/internal_commands.py +105 -141
- vastdb/schema.py +14 -6
- vastdb/session.py +2 -3
- vastdb/table.py +16 -13
- vastdb/tests/test_imports.py +3 -5
- vastdb/tests/test_nested.py +28 -0
- vastdb/tests/test_projections.py +2 -1
- vastdb/tests/test_sanity.py +3 -4
- vastdb/tests/test_schemas.py +18 -0
- vastdb/tests/test_tables.py +78 -31
- vastdb/tests/util.py +18 -0
- vastdb/transaction.py +3 -4
- vastdb/util.py +4 -3
- {vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/METADATA +1 -1
- {vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/RECORD +23 -19
- {vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/LICENSE +0 -0
- {vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/WHEEL +0 -0
- {vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/top_level.txt +0 -0
vastdb/bench/__init__.py
ADDED
File without changes

vastdb/bench/test_perf.py
ADDED
@@ -0,0 +1,29 @@
+import logging
+import time
+
+import pyarrow as pa
+import pytest
+
+from vastdb import util
+from vastdb.table import ImportConfig, QueryConfig
+
+log = logging.getLogger(__name__)
+
+
+@pytest.mark.benchmark
+def test_bench(session, clean_bucket_name, parquets_path, crater_path):
+    files = [str(parquets_path/f) for f in (parquets_path.glob('**/*.pq'))]
+
+    with session.transaction() as tx:
+        b = tx.bucket(clean_bucket_name)
+        s = b.create_schema('s1')
+        t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+        config = QueryConfig(num_splits=8, num_sub_splits=4)
+        s = time.time()
+        pa_table = pa.Table.from_batches(t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config))
+        e = time.time()
+        log.info("'SELECT sid from TABLE WHERE sid = 10033007' returned in %s seconds.", e-s)
+        if crater_path:
+            with open(f'{crater_path}/bench_results', 'a') as f:
+                f.write(f"'SELECT sid FROM TABLE WHERE sid = 10033007' returned in {e-s} seconds")
+    assert pa_table.num_rows == 255_075
vastdb/bucket.py
CHANGED
@@ -4,10 +4,10 @@ VAST S3 buckets can be used to create Database schemas and tables.
 It is possible to list and access VAST snapshots generated over a bucket.
 """
 
-from . import errors, schema, transaction
-
-from dataclasses import dataclass
 import logging
+from dataclasses import dataclass
+
+from . import errors, schema, transaction
 
 log = logging.getLogger(__name__)
 
@@ -27,18 +27,26 @@ class Bucket:
     name: str
     tx: "transaction.Transaction"
 
-    def create_schema(self, path: str) -> "schema.Schema":
+    def create_schema(self, path: str, fail_if_exists=True) -> "schema.Schema":
         """Create a new schema (a container of tables) under this bucket."""
+        if current := self.schema(path, fail_if_missing=False):
+            if fail_if_exists:
+                raise errors.SchemaExists(self.name, path)
+            else:
+                return current
         self.tx._rpc.api.create_schema(self.name, path, txid=self.tx.txid)
         log.info("Created schema: %s", path)
         return self.schema(path)
 
-    def schema(self, path: str) -> "schema.Schema":
+    def schema(self, path: str, fail_if_missing=True) -> "schema.Schema":
         """Get a specific schema (a container of tables) under this bucket."""
         s = self.schemas(path)
         log.debug("schema: %s", s)
         if not s:
-            raise errors.MissingSchema(self.name, path)
+            if fail_if_missing:
+                raise errors.MissingSchema(self.name, path)
+            else:
+                return None
         assert len(s) == 1, f"Expected to receive only a single schema, but got: {len(s)}. ({s})"
         log.debug("Found schema: %s", s[0].name)
         return s[0]
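Note: create_schema() and schema() now take fail_if_exists/fail_if_missing flags, so existence checks and get-or-create no longer require catching exceptions. A minimal sketch of the pattern this enables (bucket and schema names are placeholders):

    with session.transaction() as tx:
        b = tx.bucket('my-bucket')                       # placeholder bucket name
        s = b.schema('s1', fail_if_missing=False)        # returns None instead of raising MissingSchema
        if s is None:
            s = b.create_schema('s1')                    # raises errors.SchemaExists on conflict
        # equivalently, get-or-create in one call:
        s = b.create_schema('s1', fail_if_exists=False)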
vastdb/{tests/conftest.py → conftest.py}
RENAMED
@@ -1,15 +1,19 @@
-import …
+import os
+from pathlib import Path
 
-import pytest
 import boto3
-import …
+import pytest
+
+import vastdb
 
 
 def pytest_addoption(parser):
-    parser.addoption("--tabular-bucket-name", help="Name of the S3 bucket with Tabular enabled", default…
-    parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)", default…
-    parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)"…
-    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default…
+    parser.addoption("--tabular-bucket-name", help="Name of the S3 bucket with Tabular enabled", default="vastdb")
+    parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)", default=os.environ.get("AWS_ACCESS_KEY_ID", None))
+    parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)", default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
+    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="http://localhost:9090")
+    parser.addoption("--data-path", help="Data files location", default=None)
+    parser.addoption("--crater-path", help="Save benchmark results in a dedicated location", default=None)
 
 
 @pytest.fixture(scope="session")
@@ -44,3 +48,13 @@ def s3(request):
         aws_access_key_id=request.config.getoption("--tabular-access-key"),
         aws_secret_access_key=request.config.getoption("--tabular-secret-key"),
         endpoint_url=request.config.getoption("--tabular-endpoint-url"))
+
+
+@pytest.fixture(scope="function")
+def parquets_path(request):
+    return Path(request.config.getoption("--data-path"))
+
+
+@pytest.fixture(scope="function")
+def crater_path(request):
+    return request.config.getoption("--crater-path")
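With conftest.py moved to the package root, the new --data-path and --crater-path options feed the parquets_path and crater_path fixtures used by the benchmark. A sketch of an invocation, assuming the benchmark marker is registered and a local server is running (paths are placeholders):

    pytest vastdb/bench -m benchmark \
        --tabular-endpoint-url=http://localhost:9090 \
        --data-path=/path/to/parquet/files \
        --crater-path=/tmp/bench-results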
vastdb/errors.py
CHANGED
@@ -1,9 +1,9 @@
 import logging
-import requests
 import xml.etree.ElementTree
-
-from enum import Enum
 from dataclasses import dataclass
+from enum import Enum
+
+import requests
 
 
 class HttpStatus(Enum):
@@ -114,6 +114,23 @@ class MissingProjection(Missing):
     projection: str
 
 
+class Exists(Exception):
+    pass
+
+
+@dataclass
+class SchemaExists(Exists):
+    bucket: str
+    schema: str
+
+
+@dataclass
+class TableExists(Exists):
+    bucket: str
+    schema: str
+    table: str
+
+
 ERROR_TYPES_MAP = {
     HttpStatus.BAD_REQUEST: BadRequest,
     HttpStatus.FOBIDDEN: Forbidden,
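SchemaExists and TableExists share the new Exists base class, so both kinds of creation conflict can be handled by a single except clause. A small sketch (the bucket handle b, the schema/table names and the handler are placeholders):

    from vastdb import errors

    try:
        s = b.create_schema('s1')
        t = s.create_table('t1', columns)
    except errors.Exists as e:  # catches SchemaExists and TableExists alike
        log.warning('already exists: %s', e)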
vastdb/internal_commands.py
CHANGED
@@ -1,26 +1,23 @@
+import itertools
+import json
 import logging
+import math
+import re
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
-from datetime import datetime
 from enum import Enum
-from …
-import …
-
-import math
-from functools import cmp_to_key
-import pyarrow.parquet as pq
+from ipaddress import IPv4Address, IPv6Address
+from typing import Iterator, Optional, Union
+
 import flatbuffers
+import ibis
 import pyarrow as pa
+import pyarrow.parquet as pq
 import requests
-import json
-import itertools
-from aws_requests_auth.aws_auth import AWSRequestsAuth
 import urllib3
-import …
-
-from . import errors
-from ipaddress import IPv4Address, IPv6Address
+import xmltodict
+from aws_requests_auth.aws_auth import AWSRequestsAuth
 
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -32,10 +29,10 @@ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldIndex as fb_field_in
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldRef as fb_field_ref
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float32Literal as fb_float32_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float64Literal as fb_float64_lit
+import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int16Literal as fb_int16_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int32Literal as fb_int32_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int64Literal as fb_int64_lit
-import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Literal as fb_literal
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation as fb_relation
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelationImpl as rel_impl
@@ -48,38 +45,47 @@ import vast_flatbuf.org.apache.arrow.flatbuf.Bool as fb_bool
 import vast_flatbuf.org.apache.arrow.flatbuf.Date as fb_date
 import vast_flatbuf.org.apache.arrow.flatbuf.Decimal as fb_decimal
 import vast_flatbuf.org.apache.arrow.flatbuf.Field as fb_field
+import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
 import vast_flatbuf.org.apache.arrow.flatbuf.FloatingPoint as fb_floating_point
 import vast_flatbuf.org.apache.arrow.flatbuf.Int as fb_int
-import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
-import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
-import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
 import vast_flatbuf.org.apache.arrow.flatbuf.List as fb_list
 import vast_flatbuf.org.apache.arrow.flatbuf.Map as fb_map
-import vast_flatbuf.org.apache.arrow.flatbuf.…
+import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
+import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
+import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
 import vast_flatbuf.org.apache.arrow.flatbuf.Timestamp as fb_timestamp
 import vast_flatbuf.org.apache.arrow.flatbuf.Utf8 as fb_utf8
 import vast_flatbuf.tabular.AlterColumnRequest as tabular_alter_column
+import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
 import vast_flatbuf.tabular.AlterSchemaRequest as tabular_alter_schema
 import vast_flatbuf.tabular.AlterTableRequest as tabular_alter_table
-import vast_flatbuf.tabular.…
+import vast_flatbuf.tabular.Column as tabular_projecion_column
+import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
+import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
 import vast_flatbuf.tabular.CreateSchemaRequest as tabular_create_schema
 import vast_flatbuf.tabular.ImportDataRequest as tabular_import_data
 import vast_flatbuf.tabular.S3File as tabular_s3_file
-import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
-import vast_flatbuf.tabular.Column as tabular_projecion_column
-import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
-
 from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Deref import Deref
-from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import …
+from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import (
+    ExpressionImpl,
+)
 from vast_flatbuf.org.apache.arrow.computeir.flatbuf.LiteralImpl import LiteralImpl
 from vast_flatbuf.org.apache.arrow.flatbuf.DateUnit import DateUnit
 from vast_flatbuf.org.apache.arrow.flatbuf.TimeUnit import TimeUnit
 from vast_flatbuf.org.apache.arrow.flatbuf.Type import Type
+from vast_flatbuf.tabular.GetProjectionTableStatsResponse import (
+    GetProjectionTableStatsResponse as get_projection_table_stats,
+)
+from vast_flatbuf.tabular.GetTableStatsResponse import (
+    GetTableStatsResponse as get_table_stats,
+)
+from vast_flatbuf.tabular.ListProjectionsResponse import (
+    ListProjectionsResponse as list_projections,
+)
 from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list_schemas
 from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
-
-from …
-from vast_flatbuf.tabular.ListProjectionsResponse import ListProjectionsResponse as list_projections
+
+from . import errors
 
 UINT64_MAX = 18446744073709551615
 
@@ -122,13 +128,6 @@ def get_unit_to_flatbuff_time_unit(type):
     return unit_to_flatbuff_time_unit[type]
 
 class Predicate:
-    unit_to_epoch = {
-        'ns': 1_000_000,
-        'us': 1_000,
-        'ms': 1,
-        's': 0.001
-    }
-
     def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
         self.schema = schema
         self.expr = expr
@@ -173,8 +172,18 @@ class Predicate:
         return builder.EndVector()
 
     def serialize(self, builder: 'flatbuffers.builder.Builder'):
-        from ibis.expr.operations.generic import …
-        from ibis.expr.operations.logical import …
+        from ibis.expr.operations.generic import IsNull, Literal, TableColumn
+        from ibis.expr.operations.logical import (
+            And,
+            Equals,
+            Greater,
+            GreaterEqual,
+            Less,
+            LessEqual,
+            Not,
+            NotEquals,
+            Or,
+        )
         from ibis.expr.operations.strings import StringContains
 
         builder_map = {
@@ -403,7 +412,7 @@ class Predicate:
            field_type = fb_utf8.End(self.builder)

            value = self.builder.CreateString(value)
-        elif field.type.equals(pa.date32()):  # pa.date64()
+        elif field.type.equals(pa.date32()):  # pa.date64() is not supported
            literal_type = fb_date32_lit
            literal_impl = LiteralImpl.DateLiteral

@@ -411,37 +420,49 @@ class Predicate:
            fb_date.Start(self.builder)
            fb_date.AddUnit(self.builder, DateUnit.DAY)
            field_type = fb_date.End(self.builder)
-
-            start_date = datetime.fromtimestamp(0).date()
-            date_delta = value - start_date
-            value = date_delta.days
+            value, = pa.array([value], field.type).cast(pa.int32()).to_pylist()
        elif isinstance(field.type, pa.TimestampType):
            literal_type = fb_timestamp_lit
            literal_impl = LiteralImpl.TimestampLiteral

+            if field.type.equals(pa.timestamp('s')):
+                unit = TimeUnit.SECOND
+            if field.type.equals(pa.timestamp('ms')):
+                unit = TimeUnit.MILLISECOND
+            if field.type.equals(pa.timestamp('us')):
+                unit = TimeUnit.MICROSECOND
+            if field.type.equals(pa.timestamp('ns')):
+                unit = TimeUnit.NANOSECOND
+
            field_type_type = Type.Timestamp
            fb_timestamp.Start(self.builder)
-            fb_timestamp.AddUnit(self.builder, …
+            fb_timestamp.AddUnit(self.builder, unit)
            field_type = fb_timestamp.End(self.builder)
-…
-…
-        elif field.type.equals(pa.time32('s')) or field.type.equals(pa.time32('ms')) or field.type.equals(pa.time64('us')) or field.type.equals(pa.time64('ns')):
-…
+            value, = pa.array([value], field.type).cast(pa.int64()).to_pylist()
+        elif isinstance(field.type, (pa.Time32Type, pa.Time64Type)):
            literal_type = fb_time_lit
            literal_impl = LiteralImpl.TimeLiteral

-…
-…
-…
-…
+            if field.type.equals(pa.time32('s')):
+                target_type = pa.int32()
+                unit = TimeUnit.SECOND
+            if field.type.equals(pa.time32('ms')):
+                target_type = pa.int32()
+                unit = TimeUnit.MILLISECOND
+            if field.type.equals(pa.time64('us')):
+                target_type = pa.int64()
+                unit = TimeUnit.MICROSECOND
+            if field.type.equals(pa.time64('ns')):
+                target_type = pa.int64()
+                unit = TimeUnit.NANOSECOND

            field_type_type = Type.Time
            fb_time.Start(self.builder)
            fb_time.AddBitWidth(self.builder, field.type.bit_width)
-            fb_time.AddUnit(self.builder, …
+            fb_time.AddUnit(self.builder, unit)
            field_type = fb_time.End(self.builder)

-            value = …
+            value, = pa.array([value], field.type).cast(target_type).to_pylist()
        elif field.type.equals(pa.bool_()):
            literal_type = fb_bool_lit
            literal_impl = LiteralImpl.BooleanLiteral
@@ -558,8 +579,6 @@ class FieldNode:
        # will be set during by the parser (see below)
        self.buffers = None  # a list of Arrow buffers (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
        self.length = None  # each array must have it's length specified (https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.from_buffers)
-        self.is_projected = False
-        self.projected_field = self.field

    def _iter_to_root(self) -> Iterator['FieldNode']:
        yield self
@@ -580,15 +599,13 @@ class FieldNode:
        for child in self.children:
            yield from child._iter_leaves()

-    def …
+    def _iter_leaves(self) -> Iterator['FieldNode']:
        """Generate only leaf nodes (i.e. columns having scalar types)."""
        if not self.children:
-…
-            yield self
+            yield self
        else:
            for child in self.children:
-…
-                yield from child._iter_projected_leaves()
+                yield from child._iter_leaves()

    def debug_log(self, level=0):
        """Recursively dump this node state to log."""
@@ -625,27 +642,17 @@ class FieldNode:

    def build(self) -> pa.Array:
        """Construct an Arrow array from the collected buffers (recursively)."""
-        children = self.children and [node.build() for node in self.children …
-…
-            self.field.name, self.projected_field.type, self.length, self.buffers, children)
-        result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
+        children = self.children and [node.build() for node in self.children]
+        result = pa.Array.from_buffers(self.type, self.length, buffers=self.buffers, children=children)
        if self.debug:
            _logger.debug('%s result=%s', self.field, result)
        return result

-    def build_projected_field(self):
-        if isinstance(self.type, pa.StructType):
-            [child.build_projected_field() for child in self.children if child.is_projected]
-            self.projected_field = pa.field(self.field.name,
-                                            pa.struct([child.projected_field for child in self.children if child.is_projected]),
-                                            self.field.nullable,
-                                            self.field.metadata)

class QueryDataParser:
    """Used to parse VAST QueryData RPC response."""
-    def __init__(self, arrow_schema: pa.Schema, *, debug=False…
+    def __init__(self, arrow_schema: pa.Schema, *, debug=False):
        self.arrow_schema = arrow_schema
-        self.projection_positions = projection_positions
        index = itertools.count()  # used to generate leaf column positions for VAST QueryData RPC
        self.nodes = [FieldNode(field, index, debug=debug) for field in arrow_schema]
        self.debug = debug
@@ -653,24 +660,15 @@ class QueryDataParser:
        for node in self.nodes:
            node.debug_log()
        self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
-        self.mark_projected_nodes()
-        [node.build_projected_field() for node in self.nodes]
-        self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]

        self.leaf_offset = 0

-    def mark_projected_nodes(self):
-        for leaf in self.leaves:
-            if self.projection_positions is None or leaf.index in self.projection_positions:
-                for node in leaf._iter_to_root():
-                    node.is_projected = True
-
    def parse(self, column: pa.Array):
        """Parse a single column response from VAST (see FieldNode.set for details)"""
-        if not self.leaf_offset < len(self.…
+        if not self.leaf_offset < len(self.leaves):
            raise ValueError(f'self.leaf_offset: {self.leaf_offset} are not < '
                             f'than len(self.leaves): {len(self.leaves)}')
-        leaf = self.…
+        leaf = self.leaves[self.leaf_offset]

        # A column response may be sent in multiple chunks, therefore we need to combine
        # it into a single chunk to allow reconstruction using `Array.from_buffers()`.
@@ -691,32 +689,19 @@ class QueryDataParser:

        self.leaf_offset += 1

-    def build(self…
+    def build(self) -> Optional[pa.Table]:
        """Try to build the resulting Table object (if all columns were parsed)"""
-        if self.…
-…
-            return None
-        else:
-            if self.leaf_offset < len(self.leaves):
-                return None
+        if self.leaf_offset < len(self.leaves):
+            return None

        if self.debug:
            for node in self.nodes:
                node.debug_log()

-        # sort resulting table according to the output field names
-        projected_nodes = [node for node in self.nodes if node.is_projected]
-        if output_field_names is not None:
-            def key_func(projected_node):
-                return output_field_names.index(projected_node.field.name)
-            sorted_projected_nodes = sorted(projected_nodes, key=key_func)
-        else:
-            sorted_projected_nodes = projected_nodes
-
        result = pa.Table.from_arrays(
-            arrays=[node.build() for node in …
-            schema…
-        result.validate(full=…
+            arrays=[node.build() for node in self.nodes],
+            schema=self.arrow_schema)
+        result.validate(full=self.debug)  # does expensive validation checks only if debug is enabled
        return result

def _iter_nested_arrays(column: pa.Array) -> Iterator[pa.Array]:
@@ -1661,7 +1646,8 @@ class VastdbApi:
                                  data=record_batch, headers=headers)
        return self._check_res(res, "update_rows", expected_retvals)

-    def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]…
+    def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[],
+                    delete_from_imports_table=False):
        """
        DELETE /mybucket/myschema/mytable?rows HTTP/1.1
        Content-Length: ContentLength
@@ -1673,8 +1659,10 @@ class VastdbApi:
        """
        headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
        headers['Content-Length'] = str(len(record_batch))
-…
-…
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if delete_from_imports_table else {}
+
+        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows", url_params=url_params),
+                                  data=record_batch, headers=headers)
        return self._check_res(res, "delete_rows", expected_retvals)

    def create_projection(self, bucket, schema, table, name, columns, txid=0, client_tags=[], expected_retvals=[]):
@@ -1943,18 +1931,16 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
    """
    if start_row_ids is None:
        start_row_ids = {}
-…
-…
-…
-…
-    is_empty_projection = (len(projection_positions) == 0)
-    parsers = defaultdict(lambda: QueryDataParser(arrow_schema, debug=debug, projection_positions=projection_positions))  # {stream_id: QueryDataParser}
+
+    is_empty_projection = (len(schema) == 0)
+    parsers = defaultdict(lambda: QueryDataParser(schema, debug=debug))  # {stream_id: QueryDataParser}
+
    for stream_id, next_row_id, table in _iter_query_data_response_columns(conn, stream_ids):
        parser = parsers[stream_id]
        for column in table.columns:
            parser.parse(column)

-        parsed_table = parser.build(…
+        parsed_table = parser.build()
        if parsed_table is not None:  # when we got all columns (and before starting a new "select_rows" cycle)
            parsers.pop(stream_id)
            if is_empty_projection:  # VAST returns an empty RecordBatch, with the correct rows' count
@@ -2042,7 +2028,7 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
        fb_utf8.Start(builder)
        field_type = fb_utf8.End(builder)

-    elif field.type.equals(pa.date32()):  # pa.date64()
+    elif field.type.equals(pa.date32()):  # pa.date64() is not supported
        field_type_type = Type.Date
        fb_date.Start(builder)
        fb_date.AddUnit(builder, DateUnit.DAY)
@@ -2155,12 +2141,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
    return fb_field.End(builder)


-class VastDBResponseSchema:
-    def __init__(self, arrow_schema, projection_positions, output_field_names):
-        self.arrow_schema = arrow_schema
-        self.projection_positions = projection_positions
-        self.output_field_names = output_field_names
-
class QueryDataRequest:
    def __init__(self, serialized, response_schema):
        self.serialized = serialized
@@ -2187,31 +2167,17 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibi
    filter_obj = predicate.serialize(builder)

    parser = QueryDataParser(schema)
-…
-    for node in parser.nodes…
-…
-        if descendent.parent and isinstance(descendent.parent.type, (pa.ListType, pa.MapType)):
-            continue
-        iter_from_root = reversed(list(descendent._iter_to_root()))
-        descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
-        descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
-        leaves_map[descendent_full_name] = descendent_leaves
-
-    output_field_names = None
+    fields_map = {node.field.name: node.field for node in parser.nodes}
+    leaves_map = {node.field.name: [leaf.index for leaf in node._iter_leaves()] for node in parser.nodes}
+
    if field_names is None:
        field_names = [field.name for field in schema]
-    else:
-        output_field_names = [f.split('.')[0] for f in field_names]
-        # sort projected field_names according to positions to maintain ordering according to the schema
-        def compare_field_names_by_pos(field_name1, field_name2):
-            return leaves_map[field_name1][0]-leaves_map[field_name2][0]
-        field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))

+    response_schema = pa.schema([fields_map[name] for name in field_names])
    projection_fields = []
-    projection_positions = []
    for field_name in field_names:
+        # TODO: only root-level projection pushdown is supported (i.e. no support for SELECT s.x FROM t)
        positions = leaves_map[field_name]
-        projection_positions.extend(positions)
        for leaf_position in positions:
            fb_field_index.Start(builder)
            fb_field_index.AddPosition(builder, leaf_position)
@@ -2222,8 +2188,6 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibi
        builder.PrependUOffsetTRelative(offset)
    projection = builder.EndVector()

-    response_schema = VastDBResponseSchema(schema, projection_positions, output_field_names=output_field_names)
-
    fb_source.Start(builder)
    fb_source.AddName(builder, source_name)
    fb_source.AddSchema(builder, schema_obj)
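The predicate serializer no longer converts date/time/timestamp literals by hand (the removed unit_to_epoch table and datetime arithmetic); it round-trips each Python literal through a one-element Arrow array of the column's type and casts it to the matching integer representation. The same conversion, standalone:

    import datetime as dt
    import pyarrow as pa

    # date32 literal -> days since epoch, as the serializer now computes it
    days, = pa.array([dt.date(2024, 4, 10)], pa.date32()).cast(pa.int32()).to_pylist()

    # time64('us') literal -> microseconds since midnight
    micros, = pa.array([dt.time(12, 34, 56)], pa.time64('us')).cast(pa.int64()).to_pylist()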
vastdb/schema.py
CHANGED
@@ -4,12 +4,12 @@ VAST S3 buckets can be used to create Database schemas and tables.
 It is possible to list and access VAST snapshots generated over a bucket.
 """
 
-…
+import logging
+from dataclasses import dataclass
 
 import pyarrow as pa
 
-from …
-import logging
+from . import bucket, errors, schema, table
 
 log = logging.getLogger(__name__)
 
@@ -26,17 +26,25 @@ class Schema:
         """VAST transaction used for this schema."""
         return self.bucket.tx
 
-    def create_table(self, table_name: str, columns: pa.Schema) -> "table.Table":
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "table.Table":
         """Create a new table under this schema."""
+        if current := self.table(table_name, fail_if_missing=False):
+            if fail_if_exists:
+                raise errors.TableExists(self.bucket.name, self.name, table_name)
+            else:
+                return current
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
         log.info("Created table: %s", table_name)
         return self.table(table_name)
 
-    def table(self, name: str) -> "table.Table":
+    def table(self, name: str, fail_if_missing=True) -> "table.Table":
         """Get a specific table under this schema."""
         t = self.tables(table_name=name)
         if not t:
-            raise errors.MissingTable(self.bucket.name, self.name, name)
+            if fail_if_missing:
+                raise errors.MissingTable(self.bucket.name, self.name, name)
+            else:
+                return None
         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
         log.debug("Found table: %s", t[0])
         return t[0]
vastdb/session.py
CHANGED
@@ -7,12 +7,11 @@ For more details see:
 - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
 """
 
-…
-from . import transaction
+import os
 
 import boto3
 
-import …
+from . import internal_commands, transaction
 
 
 class Session:
vastdb/table.py
CHANGED
@@ -1,19 +1,22 @@
-from . import errors, schema
-from .internal_commands import build_query_data_request, parse_query_data_response, \
-    TABULAR_INVALID_ROW_ID, VastdbApi
-
-import pyarrow as pa
-import ibis
-
 import concurrent.futures
+import logging
+import os
 import queue
-from threading import Event
-from math import ceil
-
 from dataclasses import dataclass, field
+from math import ceil
+from threading import Event
 from typing import List, Union
-
-import …
+
+import ibis
+import pyarrow as pa
+
+from . import errors, schema
+from .internal_commands import (
+    TABULAR_INVALID_ROW_ID,
+    VastdbApi,
+    build_query_data_request,
+    parse_query_data_response,
+)
 
 log = logging.getLogger(__name__)
 
@@ -327,7 +330,7 @@ class Table:
             if record_batches_queue.get() is None:
                 tasks_running -= 1
 
-        return pa.RecordBatchReader.from_batches(query_data_request.response_schema…
+        return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
     def _combine_chunks(self, col):
         if hasattr(col, "combine_chunks"):
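Table.select() now returns a pa.RecordBatchReader built from the request's response schema, so results can be streamed batch by batch or materialized at once. A usage sketch (the table handle t and the column name are placeholders, in the style of the tests):

    reader = t.select(columns=['a'], predicate=t['a'] > 0)
    result = reader.read_all()      # materialize into a pa.Table
    # or stream it:
    for batch in t.select(columns=['a']):
        ...                         # handle each pa.RecordBatch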
vastdb/tests/test_imports.py
CHANGED
@@ -1,14 +1,12 @@
-import pytest
-
-from tempfile import NamedTemporaryFile
 import logging
+from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
 import pyarrow.parquet as pq
+import pytest
 
-from vastdb.errors import InvalidArgument, ImportFilesError
 from vastdb import util
-
+from vastdb.errors import ImportFilesError, InvalidArgument
 
 log = logging.getLogger(__name__)
 
vastdb/tests/test_nested.py
ADDED
@@ -0,0 +1,28 @@
+import itertools
+
+import pyarrow as pa
+
+from .util import prepare_data
+
+
+def test_nested(session, clean_bucket_name):
+    columns = pa.schema([
+        ('l', pa.list_(pa.int8())),
+        ('m', pa.map_(pa.utf8(), pa.float64())),
+        ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
+    ])
+    expected = pa.table(schema=columns, data=[
+        [[1], [], [2, 3], None],
+        [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
+        [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
+    ])
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        actual = pa.Table.from_batches(t.select())
+        assert actual == expected
+
+        names = [f.name for f in columns]
+        for n in range(len(names) + 1):
+            for cols in itertools.permutations(names, n):
+                actual = pa.Table.from_batches(t.select(columns=cols))
+                assert actual == expected.select(cols)
vastdb/tests/test_projections.py
CHANGED
vastdb/tests/test_sanity.py
CHANGED
@@ -1,15 +1,14 @@
-
-from itertools import cycle
+import contextlib
 import logging
 import threading
-import …
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from itertools import cycle
 
 import pytest
 import requests
 
 import vastdb
 
-
 log = logging.getLogger(__name__)
 
 
vastdb/tests/test_schemas.py
CHANGED
@@ -1,5 +1,7 @@
 import pytest
 
+from .. import errors
+
 
 def test_schemas(session, clean_bucket_name):
     with session.transaction() as tx:
@@ -19,6 +21,22 @@ def test_schemas(session, clean_bucket_name):
     assert b.schemas() == []
 
 
+def test_exists(session, clean_bucket_name):
+    with session.transaction() as tx:
+        b = tx.bucket(clean_bucket_name)
+        assert b.schemas() == []
+
+        s = b.create_schema('s1')
+
+        assert b.schemas() == [s]
+        with pytest.raises(errors.SchemaExists):
+            b.create_schema('s1')
+
+        assert b.schemas() == [s]
+        assert b.create_schema('s1', fail_if_exists=False) == s
+        assert b.schemas() == [s]
+
+
 def test_commits_and_rollbacks(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
vastdb/tests/test_tables.py
CHANGED
@@ -1,41 +1,25 @@
-import …
-import …
-import …
+import datetime as dt
+import decimal
+import logging
 import random
+import threading
+from contextlib import closing
+from tempfile import NamedTemporaryFile
+
+import duckdb
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
-import …
-import datetime as dt
-
-from tempfile import NamedTemporaryFile
-from contextlib import contextmanager, closing
-
+import pytest
 from requests.exceptions import HTTPError
-import logging
 
-from ..table import INTERNAL_ROW_ID, QueryConfig
 from .. import errors
-
+from ..table import INTERNAL_ROW_ID, QueryConfig
+from .util import prepare_data
 
 log = logging.getLogger(__name__)
 
 
-@contextmanager
-def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
-    with session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
-        t = s.create_table(table_name, arrow_table.schema)
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        log.debug("row_ids=%s" % row_ids)
-        assert row_ids == list(range(arrow_table.num_rows))
-        yield t
-        t.drop()
-        s.drop()
-
-log = logging.getLogger(__name__)
-
 def test_tables(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int64()),
@@ -86,6 +70,28 @@ def test_tables(session, clean_bucket_name):
         's': ['ccc']
     }
 
+
+def test_exists(session, clean_bucket_name):
+    with session.transaction() as tx:
+        s = tx.bucket(clean_bucket_name).create_schema('s1')
+        assert s.tables() == []
+
+        t = s.create_table('t', pa.schema([('x', pa.int64())]))
+
+        assert s.tables() == [t]
+        with pytest.raises(errors.TableExists):
+            s.create_table('t', pa.schema([('x', pa.int64())]))
+
+        assert s.tables() == [t]
+        assert s.create_table('t', pa.schema([('x', pa.int64())]), fail_if_exists=False) == t
+        assert s.tables() == [t]
+        assert s.create_table('t', pa.schema([('y', pa.int64())]), fail_if_exists=False) == t
+        assert s.tables() == [t]
+        assert s.create_table('t', pa.schema([('x', pa.int64())]), fail_if_exists=False) == t
+        assert s.tables() == [t]
+
+
+
 def test_update_table(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int64()),
@@ -169,7 +175,14 @@ def test_types(session, clean_bucket_name):
         ('d', pa.decimal128(7, 3)),
         ('bin', pa.binary()),
         ('date', pa.date32()),
-        ('…
+        ('t0', pa.time32('s')),
+        ('t3', pa.time32('ms')),
+        ('t6', pa.time64('us')),
+        ('t9', pa.time64('ns')),
+        ('ts0' ,pa.timestamp('s')),
+        ('ts3' ,pa.timestamp('ms')),
+        ('ts6' ,pa.timestamp('us')),
+        ('ts9' ,pa.timestamp('ns')),
     ])
 
     expected = pa.table(schema=columns, data=[
@@ -181,9 +194,17 @@ def test_types(session, clean_bucket_name):
         ["a", "v", "s"],
         [decimal.Decimal('110.52'), decimal.Decimal('231.15'), decimal.Decimal('3332.44')],
         [b"\x01\x02", b"\x01\x05", b"\x01\x07"],
-        [dt.…
-        [dt.…
+        [dt.date(2024, 4, 10), dt.date(2024, 4, 11), dt.date(2024, 4, 12)],
+        [dt.time(12, 34, 56), dt.time(12, 34, 57), dt.time(12, 34, 58)],
+        [dt.time(12, 34, 56, 789000), dt.time(12, 34, 57, 789000), dt.time(12, 34, 58, 789000)],
+        [dt.time(12, 34, 56, 789789), dt.time(12, 34, 57, 789789), dt.time(12, 34, 58, 789789)],
+        [dt.time(12, 34, 56, 789789), dt.time(12, 34, 57, 789789), dt.time(12, 34, 58, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56), dt.datetime(2025, 4, 10, 12, 34, 56), dt.datetime(2026, 4, 10, 12, 34, 56)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789000), dt.datetime(2025, 4, 10, 12, 34, 56, 789000), dt.datetime(2026, 4, 10, 12, 34, 56, 789000)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
     ])
+
     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
         def select(predicate):
             return pa.Table.from_batches(t.select(predicate=predicate))
@@ -197,7 +218,33 @@ def test_types(session, clean_bucket_name):
         assert select(t['s'] == "v") == expected.filter(pc.field('s') == "v")
         assert select(t['d'] == 231.15) == expected.filter(pc.field('d') == 231.15)
         assert select(t['bin'] == b"\x01\x02") == expected.filter(pc.field('bin') == b"\x01\x02")
-…
+
+        date_literal = dt.date(2024, 4, 10)
+        assert select(t['date'] == date_literal) == expected.filter(pc.field('date') == date_literal)
+
+        time_literal = dt.time(12, 34, 56)
+        assert select(t['t0'] == time_literal) == expected.filter(pc.field('t0') == time_literal)
+
+        time_literal = dt.time(12, 34, 56, 789000)
+        assert select(t['t3'] == time_literal) == expected.filter(pc.field('t3') == time_literal)
+
+        time_literal = dt.time(12, 34, 56, 789789)
+        assert select(t['t6'] == time_literal) == expected.filter(pc.field('t6') == time_literal)
+
+        time_literal = dt.time(12, 34, 56, 789789)
+        assert select(t['t9'] == time_literal) == expected.filter(pc.field('t9') == time_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56)
+        assert select(t['ts0'] == ts_literal) == expected.filter(pc.field('ts0') == ts_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789000)
+        assert select(t['ts3'] == ts_literal) == expected.filter(pc.field('ts3') == ts_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+        assert select(t['ts6'] == ts_literal) == expected.filter(pc.field('ts6') == ts_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+        assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)
 
 
 def test_filters(session, clean_bucket_name):
vastdb/tests/util.py
ADDED
@@ -0,0 +1,18 @@
+import logging
+from contextlib import contextmanager
+
+log = logging.getLogger(__name__)
+
+
+@contextmanager
+def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
+    with session.transaction() as tx:
+        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
+        t = s.create_table(table_name, arrow_table.schema)
+        row_ids_array = t.insert(arrow_table)
+        row_ids = row_ids_array.to_pylist()
+        log.debug("row_ids=%s" % row_ids)
+        assert row_ids == list(range(arrow_table.num_rows))
+        yield t
+        t.drop()
+        s.drop()
vastdb/transaction.py
CHANGED
@@ -6,13 +6,12 @@ A transcation is used as a context manager, since every Database-related operati
     tx.bucket("bucket").create_schema("schema")
 """
 
-…
+import logging
+from dataclasses import dataclass
 
 import botocore
 
-from …
-import logging
-
+from . import bucket, errors, session
 
 log = logging.getLogger(__name__)
 
vastdb/util.py
CHANGED
@@ -6,13 +6,14 @@ import pyarrow.parquet as pq
 
 from .errors import InvalidArgument
 from .schema import Schema
-from .table import Table
+from .table import ImportConfig, Table
 
 log = logging.getLogger(__name__)
 
 
 def create_table_from_files(
-        schema: Schema, table_name: str, parquet_files: [str], schema_merge_func: Callable = None) -> Table:
+        schema: Schema, table_name: str, parquet_files: [str], schema_merge_func: Callable = None,
+        config: ImportConfig = None) -> Table:
     if not schema_merge_func:
         schema_merge_func = default_schema_merge
     else:
@@ -32,7 +33,7 @@ def create_table_from_files(
     table = schema.create_table(table_name, current_schema)
 
     log.info("Starting import of %d files to table: %s", len(parquet_files), table)
-    table.import_files(parquet_files)
+    table.import_files(parquet_files, config=config)
     log.info("Finished import of %d files to table: %s", len(parquet_files), table)
     return table
 
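create_table_from_files() now accepts an optional ImportConfig and forwards it to Table.import_files(), so import concurrency can be tuned at creation time, as the new benchmark does. A sketch (bucket, schema and file paths are placeholders):

    from vastdb import util
    from vastdb.table import ImportConfig

    with session.transaction() as tx:
        s = tx.bucket('my-bucket').schema('s1')
        t = util.create_table_from_files(
            s, 't1', ['/my-bucket/data/a.pq', '/my-bucket/data/b.pq'],
            config=ImportConfig(import_concurrency=8))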
{vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/RECORD
RENAMED
@@ -149,23 +149,27 @@ vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI
 vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
 vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/__init__.py,sha256=GY30IfZQApfl7HfcFmfTzFpx48oHgQIrDcUQCiTnxpo,206
-vastdb/bucket.py,sha256=…
-vastdb/…
-vastdb/…
-vastdb/…
-vastdb/…
-vastdb/…
-vastdb/…
-vastdb/…
+vastdb/bucket.py,sha256=5J8KBdRViaz5bZ8WEPggQj7DfJaIhY7CqpoWP6reaDo,2854
+vastdb/conftest.py,sha256=pKpo_46Vq4QHzTDQAFxasrVhnZ2V2L-y6IMLxojxaFM,2132
+vastdb/errors.py,sha256=wCJp70QyBW8US2KMwhB6e5ZnKRft4GiN8jyJ36f1Yuo,3315
+vastdb/internal_commands.py,sha256=rmxOjIq229gsxFFZ4nKXwVIFJcu8WR1DVsE-91w4-BY,101564
+vastdb/schema.py,sha256=x9Yn4tFTFkSpJbQqpqlecKUSOK214XsRLdOUrNW0jzM,3192
+vastdb/session.py,sha256=VZOFGZbAdr5Tl4cp88VRQYnR4Q16UNuYjSmX_QPW1II,1718
+vastdb/table.py,sha256=eALN5YpUfDFqZNF_lp6lZD5RJkBKqp5Mlc6hpwGI8Rg,20443
+vastdb/transaction.py,sha256=2I5k81YvcgDsp07BrAWkmXf41qUP6m88Y40rFfuIKvI,1796
+vastdb/util.py,sha256=VR0UJ1D0WUpqS5edG_mkxDZYZJ_qqce8y7iJOvqeyWE,2974
+vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vastdb/bench/test_perf.py,sha256=X7BIo60L5Oj7H-56e8pDFtXY9rNLerkywKexXWiqvrY,1111
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/tests/…
-vastdb/tests/…
-vastdb/tests/test_projections.py,sha256=…
-vastdb/tests/test_sanity.py,sha256=…
-vastdb/tests/test_schemas.py,sha256=…
-vastdb/tests/test_tables.py,sha256=…
-vastdb…
-vastdb-0.1.…
-vastdb-0.1.…
-vastdb-0.1.…
-vastdb-0.1.…
+vastdb/tests/test_imports.py,sha256=fDUjO5U-5i4QTIMoNnSSW4X_ZnOStLbx0mJkNq2pj9Q,5033
+vastdb/tests/test_nested.py,sha256=3kejEvtSqV0LrUgb1QglRjrlxnKI4_AXTFw2nE7Q520,951
+vastdb/tests/test_projections.py,sha256=0ZiFya6rzGvnKOrdb1xxxv-BEerNmiK_ymfZM6eIvvw,1254
+vastdb/tests/test_sanity.py,sha256=kaOmZWDGBc-XhZ8eFQ3sks2Mo9De8q41Z5pqYWzJsHM,2958
+vastdb/tests/test_schemas.py,sha256=8ZlEvnU7Fyg-TDQDxD65GAql4rU8R2_SFWVGrdv564o,1721
+vastdb/tests/test_tables.py,sha256=o_JPqr2GX1DDpPB4Zq4E1YPFgmlsiXyVe1S3TcCjF-w,26226
+vastdb/tests/util.py,sha256=_euE3fKJqgNssT9gVxlcHjdE61mnsNQcwDPzn1tTe9g,597
+vastdb-0.1.1.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-0.1.1.dist-info/METADATA,sha256=e84OEOXS09DEXniHJAU2aeK80-1h2rIZmYNBCMLa1AM,1331
+vastdb-0.1.1.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+vastdb-0.1.1.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-0.1.1.dist-info/RECORD,,

{vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/LICENSE
File without changes

{vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/WHEEL
File without changes

{vastdb-0.1.0.dist-info → vastdb-0.1.1.dist-info}/top_level.txt
File without changes