vastdb 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/bench/test_perf.py ADDED
@@ -0,0 +1,29 @@
+import logging
+import time
+
+import pyarrow as pa
+import pytest
+
+from vastdb import util
+from vastdb.table import ImportConfig, QueryConfig
+
+log = logging.getLogger(__name__)
+
+
+@pytest.mark.benchmark
+def test_bench(session, clean_bucket_name, parquets_path, crater_path):
+    files = [str(parquets_path/f) for f in (parquets_path.glob('**/*.pq'))]
+
+    with session.transaction() as tx:
+        b = tx.bucket(clean_bucket_name)
+        s = b.create_schema('s1')
+        t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
+        config = QueryConfig(num_splits=8, num_sub_splits=4)
+        s = time.time()
+        pa_table = pa.Table.from_batches(t.select(columns=['sid'], predicate=t['sid'] == 10033007, config=config))
+        e = time.time()
+        log.info("'SELECT sid from TABLE WHERE sid = 10033007' returned in %s seconds.", e-s)
+        if crater_path:
+            with open(f'{crater_path}/bench_results', 'a') as f:
+                f.write(f"'SELECT sid FROM TABLE WHERE sid = 10033007' returned in {e-s} seconds")
+    assert pa_table.num_rows == 255_075
vastdb/bucket.py CHANGED
@@ -4,10 +4,10 @@ VAST S3 buckets can be used to create Database schemas and tables.
 It is possible to list and access VAST snapshots generated over a bucket.
 """
 
-from . import errors, schema, transaction
-
-from dataclasses import dataclass
 import logging
+from dataclasses import dataclass
+
+from . import errors, schema, transaction
 
 log = logging.getLogger(__name__)
 
@@ -27,18 +27,26 @@ class Bucket:
     name: str
     tx: "transaction.Transaction"
 
-    def create_schema(self, path: str) -> "schema.Schema":
+    def create_schema(self, path: str, fail_if_exists=True) -> "schema.Schema":
         """Create a new schema (a container of tables) under this bucket."""
+        if current := self.schema(path, fail_if_missing=False):
+            if fail_if_exists:
+                raise errors.SchemaExists(self.name, path)
+            else:
+                return current
         self.tx._rpc.api.create_schema(self.name, path, txid=self.tx.txid)
         log.info("Created schema: %s", path)
         return self.schema(path)
 
-    def schema(self, path: str) -> "schema.Schema":
+    def schema(self, path: str, fail_if_missing=True) -> "schema.Schema":
         """Get a specific schema (a container of tables) under this bucket."""
        s = self.schemas(path)
        log.debug("schema: %s", s)
        if not s:
-            raise errors.MissingSchema(self.name, path)
+            if fail_if_missing:
+                raise errors.MissingSchema(self.name, path)
+            else:
+                return None
        assert len(s) == 1, f"Expected to receive only a single schema, but got: {len(s)}. ({s})"
        log.debug("Found schema: %s", s[0].name)
        return s[0]
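The two new keyword flags above make schema creation and lookup idempotent. A minimal usage sketch (not part of the diff; the session object and bucket name are placeholders):

    with session.transaction() as tx:
        b = tx.bucket('my-bucket')  # placeholder bucket name
        s = b.create_schema('s1')                          # raises errors.SchemaExists if 's1' already exists
        s = b.create_schema('s1', fail_if_exists=False)    # returns the existing schema instead of raising
        maybe = b.schema('absent', fail_if_missing=False)  # returns None instead of raising errors.MissingSchema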
vastdb/tests/conftest.py → vastdb/conftest.py RENAMED
@@ -1,15 +1,19 @@
-import vastdb
+import os
+from pathlib import Path
 
-import pytest
 import boto3
-import os
+import pytest
+
+import vastdb
 
 
 def pytest_addoption(parser):
-    parser.addoption("--tabular-bucket-name", help="Name of the S3 bucket with Tabular enabled", default = "vastdb")
-    parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)", default = os.environ.get("AWS_ACCESS_KEY_ID", None))
-    parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)" , default = os.environ.get("AWS_SECRET_ACCESS_KEY", None))
-    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default = "http://localhost:9090")
+    parser.addoption("--tabular-bucket-name", help="Name of the S3 bucket with Tabular enabled", default="vastdb")
+    parser.addoption("--tabular-access-key", help="Access key with Tabular permissions (AWS_ACCESS_KEY_ID)", default=os.environ.get("AWS_ACCESS_KEY_ID", None))
+    parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)", default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
+    parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="http://localhost:9090")
+    parser.addoption("--data-path", help="Data files location", default=None)
+    parser.addoption("--crater-path", help="Save benchmark results in a dedicated location", default=None)
 
 
 @pytest.fixture(scope="session")
@@ -44,3 +48,13 @@ def s3(request):
        aws_access_key_id=request.config.getoption("--tabular-access-key"),
        aws_secret_access_key=request.config.getoption("--tabular-secret-key"),
        endpoint_url=request.config.getoption("--tabular-endpoint-url"))
+
+
+@pytest.fixture(scope="function")
+def parquets_path(request):
+    return Path(request.config.getoption("--data-path"))
+
+
+@pytest.fixture(scope="function")
+def crater_path(request):
+    return request.config.getoption("--crater-path")
vastdb/errors.py CHANGED
@@ -1,9 +1,9 @@
 import logging
-import requests
 import xml.etree.ElementTree
-
-from enum import Enum
 from dataclasses import dataclass
+from enum import Enum
+
+import requests
 
 
 class HttpStatus(Enum):
@@ -114,6 +114,23 @@ class MissingProjection(Missing):
     projection: str
 
 
+class Exists(Exception):
+    pass
+
+
+@dataclass
+class SchemaExists(Exists):
+    bucket: str
+    schema: str
+
+
+@dataclass
+class TableExists(Exists):
+    bucket: str
+    schema: str
+    table: str
+
+
 ERROR_TYPES_MAP = {
     HttpStatus.BAD_REQUEST: BadRequest,
     HttpStatus.FOBIDDEN: Forbidden,
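The new Exists hierarchy mirrors the existing Missing one, and since SchemaExists and TableExists are dataclasses, their fields are available on the caught exception. A hedged sketch of catching them (the bucket object and logger are placeholders):

    from vastdb import errors

    try:
        s = b.create_schema('s1')
    except errors.SchemaExists as e:
        log.warning("schema already exists: %s/%s", e.bucket, e.schema)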
vastdb/internal_commands.py CHANGED
@@ -1,26 +1,23 @@
+import itertools
+import json
 import logging
+import math
+import re
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
-from datetime import datetime
 from enum import Enum
-from typing import Union, Optional, Iterator
-import ibis
-import xmltodict
-import math
-from functools import cmp_to_key
-import pyarrow.parquet as pq
+from ipaddress import IPv4Address, IPv6Address
+from typing import Iterator, Optional, Union
+
 import flatbuffers
+import ibis
 import pyarrow as pa
+import pyarrow.parquet as pq
 import requests
-import json
-import itertools
-from aws_requests_auth.aws_auth import AWSRequestsAuth
 import urllib3
-import re
-
-from . import errors
-from ipaddress import IPv4Address, IPv6Address
+import xmltodict
+from aws_requests_auth.aws_auth import AWSRequestsAuth
 
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BinaryLiteral as fb_binary_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.BooleanLiteral as fb_bool_lit
@@ -32,10 +29,10 @@ import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldIndex as fb_field_in
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.FieldRef as fb_field_ref
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float32Literal as fb_float32_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Float64Literal as fb_float64_lit
+import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int16Literal as fb_int16_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int32Literal as fb_int32_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int64Literal as fb_int64_lit
-import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Int8Literal as fb_int8_lit
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Literal as fb_literal
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation as fb_relation
 import vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelationImpl as rel_impl
@@ -48,38 +45,47 @@ import vast_flatbuf.org.apache.arrow.flatbuf.Bool as fb_bool
 import vast_flatbuf.org.apache.arrow.flatbuf.Date as fb_date
 import vast_flatbuf.org.apache.arrow.flatbuf.Decimal as fb_decimal
 import vast_flatbuf.org.apache.arrow.flatbuf.Field as fb_field
+import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
 import vast_flatbuf.org.apache.arrow.flatbuf.FloatingPoint as fb_floating_point
 import vast_flatbuf.org.apache.arrow.flatbuf.Int as fb_int
-import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
-import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
-import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
 import vast_flatbuf.org.apache.arrow.flatbuf.List as fb_list
 import vast_flatbuf.org.apache.arrow.flatbuf.Map as fb_map
-import vast_flatbuf.org.apache.arrow.flatbuf.FixedSizeBinary as fb_fixed_size_binary
+import vast_flatbuf.org.apache.arrow.flatbuf.Schema as fb_schema
+import vast_flatbuf.org.apache.arrow.flatbuf.Struct_ as fb_struct
+import vast_flatbuf.org.apache.arrow.flatbuf.Time as fb_time
 import vast_flatbuf.org.apache.arrow.flatbuf.Timestamp as fb_timestamp
 import vast_flatbuf.org.apache.arrow.flatbuf.Utf8 as fb_utf8
 import vast_flatbuf.tabular.AlterColumnRequest as tabular_alter_column
+import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
 import vast_flatbuf.tabular.AlterSchemaRequest as tabular_alter_schema
 import vast_flatbuf.tabular.AlterTableRequest as tabular_alter_table
-import vast_flatbuf.tabular.AlterProjectionTableRequest as tabular_alter_projection
+import vast_flatbuf.tabular.Column as tabular_projecion_column
+import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
+import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
 import vast_flatbuf.tabular.CreateSchemaRequest as tabular_create_schema
 import vast_flatbuf.tabular.ImportDataRequest as tabular_import_data
 import vast_flatbuf.tabular.S3File as tabular_s3_file
-import vast_flatbuf.tabular.CreateProjectionRequest as tabular_create_projection
-import vast_flatbuf.tabular.Column as tabular_projecion_column
-import vast_flatbuf.tabular.ColumnType as tabular_proj_column_type
-
 from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Deref import Deref
-from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import ExpressionImpl
+from vast_flatbuf.org.apache.arrow.computeir.flatbuf.ExpressionImpl import (
+    ExpressionImpl,
+)
 from vast_flatbuf.org.apache.arrow.computeir.flatbuf.LiteralImpl import LiteralImpl
 from vast_flatbuf.org.apache.arrow.flatbuf.DateUnit import DateUnit
 from vast_flatbuf.org.apache.arrow.flatbuf.TimeUnit import TimeUnit
 from vast_flatbuf.org.apache.arrow.flatbuf.Type import Type
+from vast_flatbuf.tabular.GetProjectionTableStatsResponse import (
+    GetProjectionTableStatsResponse as get_projection_table_stats,
+)
+from vast_flatbuf.tabular.GetTableStatsResponse import (
+    GetTableStatsResponse as get_table_stats,
+)
+from vast_flatbuf.tabular.ListProjectionsResponse import (
+    ListProjectionsResponse as list_projections,
+)
 from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list_schemas
 from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables
-from vast_flatbuf.tabular.GetTableStatsResponse import GetTableStatsResponse as get_table_stats
-from vast_flatbuf.tabular.GetProjectionTableStatsResponse import GetProjectionTableStatsResponse as get_projection_table_stats
-from vast_flatbuf.tabular.ListProjectionsResponse import ListProjectionsResponse as list_projections
+
+from . import errors
 
 UINT64_MAX = 18446744073709551615
 
@@ -122,13 +128,6 @@ def get_unit_to_flatbuff_time_unit(type):
     return unit_to_flatbuff_time_unit[type]
 
 class Predicate:
-    unit_to_epoch = {
-        'ns': 1_000_000,
-        'us': 1_000,
-        'ms': 1,
-        's': 0.001
-    }
-
     def __init__(self, schema: 'pa.Schema', expr: ibis.expr.types.BooleanColumn):
         self.schema = schema
         self.expr = expr
@@ -173,8 +172,18 @@
         return builder.EndVector()
 
     def serialize(self, builder: 'flatbuffers.builder.Builder'):
-        from ibis.expr.operations.generic import TableColumn, Literal, IsNull
-        from ibis.expr.operations.logical import Greater, GreaterEqual, Less, LessEqual, Equals, NotEquals, And, Or, Not
+        from ibis.expr.operations.generic import IsNull, Literal, TableColumn
+        from ibis.expr.operations.logical import (
+            And,
+            Equals,
+            Greater,
+            GreaterEqual,
+            Less,
+            LessEqual,
+            Not,
+            NotEquals,
+            Or,
+        )
         from ibis.expr.operations.strings import StringContains
 
         builder_map = {
@@ -403,7 +412,7 @@
             field_type = fb_utf8.End(self.builder)
 
             value = self.builder.CreateString(value)
-        elif field.type.equals(pa.date32()):  # pa.date64()
+        elif field.type.equals(pa.date32()):  # pa.date64() is not supported
             literal_type = fb_date32_lit
             literal_impl = LiteralImpl.DateLiteral
 
@@ -411,37 +420,49 @@
             fb_date.Start(self.builder)
             fb_date.AddUnit(self.builder, DateUnit.DAY)
             field_type = fb_date.End(self.builder)
-
-            start_date = datetime.fromtimestamp(0).date()
-            date_delta = value - start_date
-            value = date_delta.days
+            value, = pa.array([value], field.type).cast(pa.int32()).to_pylist()
         elif isinstance(field.type, pa.TimestampType):
             literal_type = fb_timestamp_lit
             literal_impl = LiteralImpl.TimestampLiteral
 
+            if field.type.equals(pa.timestamp('s')):
+                unit = TimeUnit.SECOND
+            if field.type.equals(pa.timestamp('ms')):
+                unit = TimeUnit.MILLISECOND
+            if field.type.equals(pa.timestamp('us')):
+                unit = TimeUnit.MICROSECOND
+            if field.type.equals(pa.timestamp('ns')):
+                unit = TimeUnit.NANOSECOND
+
             field_type_type = Type.Timestamp
             fb_timestamp.Start(self.builder)
-            fb_timestamp.AddUnit(self.builder, get_unit_to_flatbuff_time_unit(field.type.unit))
+            fb_timestamp.AddUnit(self.builder, unit)
             field_type = fb_timestamp.End(self.builder)
-
-            value = int(int(value) * self.unit_to_epoch[field.type.unit])
-        elif field.type.equals(pa.time32('s')) or field.type.equals(pa.time32('ms')) or field.type.equals(pa.time64('us')) or field.type.equals(pa.time64('ns')):
-
+            value, = pa.array([value], field.type).cast(pa.int64()).to_pylist()
+        elif isinstance(field.type, (pa.Time32Type, pa.Time64Type)):
             literal_type = fb_time_lit
             literal_impl = LiteralImpl.TimeLiteral
 
-            field_type_str = str(field.type)
-            start = field_type_str.index('[')
-            end = field_type_str.index(']')
-            unit = field_type_str[start + 1:end]
+            if field.type.equals(pa.time32('s')):
+                target_type = pa.int32()
+                unit = TimeUnit.SECOND
+            if field.type.equals(pa.time32('ms')):
+                target_type = pa.int32()
+                unit = TimeUnit.MILLISECOND
+            if field.type.equals(pa.time64('us')):
+                target_type = pa.int64()
+                unit = TimeUnit.MICROSECOND
+            if field.type.equals(pa.time64('ns')):
+                target_type = pa.int64()
+                unit = TimeUnit.NANOSECOND
 
             field_type_type = Type.Time
             fb_time.Start(self.builder)
             fb_time.AddBitWidth(self.builder, field.type.bit_width)
-            fb_time.AddUnit(self.builder, get_unit_to_flatbuff_time_unit(unit))
+            fb_time.AddUnit(self.builder, unit)
             field_type = fb_time.End(self.builder)
 
-            value = int(value) * self.unit_to_epoch[unit]
+            value, = pa.array([value], field.type).cast(target_type).to_pylist()
         elif field.type.equals(pa.bool_()):
             literal_type = fb_bool_lit
             literal_impl = LiteralImpl.BooleanLiteral
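The rewritten branches above drop the removed unit_to_epoch table (and the old string parsing of the unit) by letting Arrow itself convert a Python literal into its integer storage value. A standalone sketch of the same cast trick, independent of this diff:

    import datetime as dt

    import pyarrow as pa

    # A one-element array cast to the storage integer type yields the epoch-based
    # value for any date/time/timestamp unit, with no manual epoch arithmetic.
    days, = pa.array([dt.date(2024, 4, 10)], pa.date32()).cast(pa.int32()).to_pylist()
    millis, = pa.array([dt.datetime(2024, 4, 10)], pa.timestamp('ms')).cast(pa.int64()).to_pylist()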
@@ -558,8 +579,6 @@ class FieldNode:
         # will be set during by the parser (see below)
         self.buffers = None  # a list of Arrow buffers (https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout)
         self.length = None  # each array must have it's length specified (https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.from_buffers)
-        self.is_projected = False
-        self.projected_field = self.field
 
     def _iter_to_root(self) -> Iterator['FieldNode']:
         yield self
@@ -580,15 +599,13 @@
             for child in self.children:
                 yield from child._iter_leaves()
 
-    def _iter_projected_leaves(self) -> Iterator['FieldNode']:
+    def _iter_leaves(self) -> Iterator['FieldNode']:
         """Generate only leaf nodes (i.e. columns having scalar types)."""
         if not self.children:
-            if self.is_projected:
-                yield self
+            yield self
         else:
             for child in self.children:
-                if child.is_projected:
-                    yield from child._iter_projected_leaves()
+                yield from child._iter_leaves()
 
     def debug_log(self, level=0):
         """Recursively dump this node state to log."""
@@ -625,27 +642,17 @@
 
     def build(self) -> pa.Array:
         """Construct an Arrow array from the collected buffers (recursively)."""
-        children = self.children and [node.build() for node in self.children if node.is_projected]
-        _logger.debug('build: self.field.name=%s, self.projected_field.type=%s, self.length=%s, self.buffers=%s children=%s',
-                      self.field.name, self.projected_field.type, self.length, self.buffers, children)
-        result = pa.Array.from_buffers(self.projected_field.type, self.length, buffers=self.buffers, children=children)
+        children = self.children and [node.build() for node in self.children]
+        result = pa.Array.from_buffers(self.type, self.length, buffers=self.buffers, children=children)
         if self.debug:
             _logger.debug('%s result=%s', self.field, result)
         return result
 
-    def build_projected_field(self):
-        if isinstance(self.type, pa.StructType):
-            [child.build_projected_field() for child in self.children if child.is_projected]
-            self.projected_field = pa.field(self.field.name,
-                                            pa.struct([child.projected_field for child in self.children if child.is_projected]),
-                                            self.field.nullable,
-                                            self.field.metadata)
 
 class QueryDataParser:
     """Used to parse VAST QueryData RPC response."""
-    def __init__(self, arrow_schema: pa.Schema, *, debug=False, projection_positions=None):
+    def __init__(self, arrow_schema: pa.Schema, *, debug=False):
         self.arrow_schema = arrow_schema
-        self.projection_positions = projection_positions
         index = itertools.count()  # used to generate leaf column positions for VAST QueryData RPC
         self.nodes = [FieldNode(field, index, debug=debug) for field in arrow_schema]
         self.debug = debug
@@ -653,24 +660,15 @@ class QueryDataParser:
            for node in self.nodes:
                node.debug_log()
         self.leaves = [leaf for node in self.nodes for leaf in node._iter_leaves()]
-        self.mark_projected_nodes()
-        [node.build_projected_field() for node in self.nodes]
-        self.projected_leaves = [leaf for node in self.nodes for leaf in node._iter_projected_leaves()]
 
         self.leaf_offset = 0
 
-    def mark_projected_nodes(self):
-        for leaf in self.leaves:
-            if self.projection_positions is None or leaf.index in self.projection_positions:
-                for node in leaf._iter_to_root():
-                    node.is_projected = True
-
     def parse(self, column: pa.Array):
         """Parse a single column response from VAST (see FieldNode.set for details)"""
-        if not self.leaf_offset < len(self.projected_leaves):
+        if not self.leaf_offset < len(self.leaves):
             raise ValueError(f'self.leaf_offset: {self.leaf_offset} are not < '
                              f'than len(self.leaves): {len(self.leaves)}')
-        leaf = self.projected_leaves[self.leaf_offset]
+        leaf = self.leaves[self.leaf_offset]
 
         # A column response may be sent in multiple chunks, therefore we need to combine
         # it into a single chunk to allow reconstruction using `Array.from_buffers()`.
@@ -691,32 +689,19 @@
 
         self.leaf_offset += 1
 
-    def build(self, output_field_names=None) -> Optional[pa.Table]:
+    def build(self) -> Optional[pa.Table]:
         """Try to build the resulting Table object (if all columns were parsed)"""
-        if self.projection_positions is not None:
-            if self.leaf_offset < len(self.projection_positions):
-                return None
-        else:
-            if self.leaf_offset < len(self.leaves):
-                return None
+        if self.leaf_offset < len(self.leaves):
+            return None
 
         if self.debug:
            for node in self.nodes:
                node.debug_log()
 
-        # sort resulting table according to the output field names
-        projected_nodes = [node for node in self.nodes if node.is_projected]
-        if output_field_names is not None:
-            def key_func(projected_node):
-                return output_field_names.index(projected_node.field.name)
-            sorted_projected_nodes = sorted(projected_nodes, key=key_func)
-        else:
-            sorted_projected_nodes = projected_nodes
-
         result = pa.Table.from_arrays(
-            arrays=[node.build() for node in sorted_projected_nodes],
-            schema = pa.schema([node.projected_field for node in sorted_projected_nodes]))
-        result.validate(full=True)  # does expensive validation checks only if debug is enabled
+            arrays=[node.build() for node in self.nodes],
+            schema=self.arrow_schema)
+        result.validate(full=self.debug)  # does expensive validation checks only if debug is enabled
         return result
 
 def _iter_nested_arrays(column: pa.Array) -> Iterator[pa.Array]:
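For context, FieldNode.build() above leans on pyarrow's zero-copy reconstruction API. A self-contained sketch of that API (not part of the diff):

    import pyarrow as pa

    # Rebuild an int32 array from its raw validity and data buffers,
    # the same pa.Array.from_buffers() call used by FieldNode.build().
    src = pa.array([1, 2, None], type=pa.int32())
    validity, data = src.buffers()
    rebuilt = pa.Array.from_buffers(pa.int32(), len(src), buffers=[validity, data])
    assert rebuilt.equals(src)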
@@ -1661,7 +1646,8 @@ class VastdbApi:
                                   data=record_batch, headers=headers)
         return self._check_res(res, "update_rows", expected_retvals)
 
-    def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[]):
+    def delete_rows(self, bucket, schema, table, record_batch, txid=0, client_tags=[], expected_retvals=[],
+                    delete_from_imports_table=False):
         """
         DELETE /mybucket/myschema/mytable?rows HTTP/1.1
         Content-Length: ContentLength
@@ -1673,8 +1659,10 @@
         """
         headers = self._fill_common_headers(txid=txid, client_tags=client_tags)
         headers['Content-Length'] = str(len(record_batch))
-        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows"),
-                                  data=record_batch, headers=headers)
+        url_params = {'sub-table': IMPORTED_OBJECTS_TABLE_NAME} if delete_from_imports_table else {}
+
+        res = self.session.delete(self._api_prefix(bucket=bucket, schema=schema, table=table, command="rows", url_params=url_params),
+                                  data=record_batch, headers=headers)
         return self._check_res(res, "delete_rows", expected_retvals)
 
     def create_projection(self, bucket, schema, table, name, columns, txid=0, client_tags=[], expected_retvals=[]):
@@ -1943,18 +1931,16 @@ def parse_query_data_response(conn, schema, stream_ids=None, start_row_ids=None,
     """
     if start_row_ids is None:
         start_row_ids = {}
-    projection_positions = schema.projection_positions
-    arrow_schema = schema.arrow_schema
-    output_field_names = schema.output_field_names
-    _logger.debug(f'projection_positions={projection_positions} len(arrow_schema)={len(arrow_schema)} arrow_schema={arrow_schema}')
-    is_empty_projection = (len(projection_positions) == 0)
-    parsers = defaultdict(lambda: QueryDataParser(arrow_schema, debug=debug, projection_positions=projection_positions))  # {stream_id: QueryDataParser}
+
+    is_empty_projection = (len(schema) == 0)
+    parsers = defaultdict(lambda: QueryDataParser(schema, debug=debug))  # {stream_id: QueryDataParser}
+
     for stream_id, next_row_id, table in _iter_query_data_response_columns(conn, stream_ids):
         parser = parsers[stream_id]
         for column in table.columns:
             parser.parse(column)
 
-        parsed_table = parser.build(output_field_names)
+        parsed_table = parser.build()
         if parsed_table is not None:  # when we got all columns (and before starting a new "select_rows" cycle)
             parsers.pop(stream_id)
             if is_empty_projection:  # VAST returns an empty RecordBatch, with the correct rows' count
@@ -2042,7 +2028,7 @@ def get_field_type(builder: flatbuffers.Builder, field: pa.Field):
         fb_utf8.Start(builder)
         field_type = fb_utf8.End(builder)
 
-    elif field.type.equals(pa.date32()):  # pa.date64()
+    elif field.type.equals(pa.date32()):  # pa.date64() is not supported
         field_type_type = Type.Date
         fb_date.Start(builder)
         fb_date.AddUnit(builder, DateUnit.DAY)
@@ -2155,12 +2141,6 @@ def build_field(builder: flatbuffers.Builder, f: pa.Field, name: str):
     return fb_field.End(builder)
 
 
-class VastDBResponseSchema:
-    def __init__(self, arrow_schema, projection_positions, output_field_names):
-        self.arrow_schema = arrow_schema
-        self.projection_positions = projection_positions
-        self.output_field_names = output_field_names
-
 class QueryDataRequest:
     def __init__(self, serialized, response_schema):
         self.serialized = serialized
@@ -2187,31 +2167,17 @@ def build_query_data_request(schema: 'pa.Schema' = pa.schema([]), predicate: ibi
     filter_obj = predicate.serialize(builder)
 
     parser = QueryDataParser(schema)
-    leaves_map = {}
-    for node in parser.nodes:
-        for descendent in node._iter_nodes():
-            if descendent.parent and isinstance(descendent.parent.type, (pa.ListType, pa.MapType)):
-                continue
-            iter_from_root = reversed(list(descendent._iter_to_root()))
-            descendent_full_name = '.'.join([n.field.name for n in iter_from_root])
-            descendent_leaves = [leaf.index for leaf in descendent._iter_leaves()]
-            leaves_map[descendent_full_name] = descendent_leaves
-
-    output_field_names = None
+    fields_map = {node.field.name: node.field for node in parser.nodes}
+    leaves_map = {node.field.name: [leaf.index for leaf in node._iter_leaves()] for node in parser.nodes}
+
     if field_names is None:
         field_names = [field.name for field in schema]
-    else:
-        output_field_names = [f.split('.')[0] for f in field_names]
-        # sort projected field_names according to positions to maintain ordering according to the schema
-        def compare_field_names_by_pos(field_name1, field_name2):
-            return leaves_map[field_name1][0]-leaves_map[field_name2][0]
-        field_names = sorted(field_names, key=cmp_to_key(compare_field_names_by_pos))
 
+    response_schema = pa.schema([fields_map[name] for name in field_names])
     projection_fields = []
-    projection_positions = []
     for field_name in field_names:
+        # TODO: only root-level projection pushdown is supported (i.e. no support for SELECT s.x FROM t)
         positions = leaves_map[field_name]
-        projection_positions.extend(positions)
        for leaf_position in positions:
            fb_field_index.Start(builder)
            fb_field_index.AddPosition(builder, leaf_position)
@@ -2222,8 +2188,6 @@
        builder.PrependUOffsetTRelative(offset)
     projection = builder.EndVector()
 
-    response_schema = VastDBResponseSchema(schema, projection_positions, output_field_names=output_field_names)
-
     fb_source.Start(builder)
     fb_source.AddName(builder, source_name)
     fb_source.AddSchema(builder, schema_obj)
vastdb/schema.py CHANGED
@@ -4,12 +4,12 @@ VAST S3 buckets can be used to create Database schemas and tables.
 It is possible to list and access VAST snapshots generated over a bucket.
 """
 
-from . import bucket, errors, schema, table
+import logging
+from dataclasses import dataclass
 
 import pyarrow as pa
 
-from dataclasses import dataclass
-import logging
+from . import bucket, errors, schema, table
 
 log = logging.getLogger(__name__)
 
@@ -26,17 +26,25 @@ class Schema:
        """VAST transaction used for this schema."""
        return self.bucket.tx
 
-    def create_table(self, table_name: str, columns: pa.Schema) -> "table.Table":
+    def create_table(self, table_name: str, columns: pa.Schema, fail_if_exists=True) -> "table.Table":
         """Create a new table under this schema."""
+        if current := self.table(table_name, fail_if_missing=False):
+            if fail_if_exists:
+                raise errors.TableExists(self.bucket.name, self.name, table_name)
+            else:
+                return current
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid)
         log.info("Created table: %s", table_name)
         return self.table(table_name)
 
-    def table(self, name: str) -> "table.Table":
+    def table(self, name: str, fail_if_missing=True) -> "table.Table":
         """Get a specific table under this schema."""
         t = self.tables(table_name=name)
         if not t:
-            raise errors.MissingTable(self.bucket.name, self.name, name)
+            if fail_if_missing:
+                raise errors.MissingTable(self.bucket.name, self.name, name)
+            else:
+                return None
         assert len(t) == 1, f"Expected to receive only a single table, but got: {len(t)}. tables: {t}"
         log.debug("Found table: %s", t[0])
         return t[0]
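Table creation gains the same idempotent behavior; note in the test_exists test further down that fail_if_exists=False returns the existing table even when a different Arrow schema is passed. A hedged sketch (the schema object s is assumed to come from an open transaction):

    import pyarrow as pa

    columns = pa.schema([('x', pa.int64())])
    t = s.create_table('t1', columns)                        # raises errors.TableExists on a second call
    t = s.create_table('t1', columns, fail_if_exists=False)  # returns the existing table instead
    maybe = s.table('absent', fail_if_missing=False)         # None instead of raising errors.MissingTable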
vastdb/session.py CHANGED
@@ -7,12 +7,11 @@ For more details see:
 - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
 """
 
-from . import internal_commands
-from . import transaction
+import os
 
 import boto3
 
-import os
+from . import internal_commands, transaction
 
 
 class Session:
vastdb/table.py CHANGED
@@ -1,19 +1,22 @@
-from . import errors, schema
-from .internal_commands import build_query_data_request, parse_query_data_response, \
-    TABULAR_INVALID_ROW_ID, VastdbApi
-
-import pyarrow as pa
-import ibis
-
 import concurrent.futures
+import logging
+import os
 import queue
-from threading import Event
-from math import ceil
-
 from dataclasses import dataclass, field
+from math import ceil
+from threading import Event
 from typing import List, Union
-import logging
-import os
+
+import ibis
+import pyarrow as pa
+
+from . import errors, schema
+from .internal_commands import (
+    TABULAR_INVALID_ROW_ID,
+    VastdbApi,
+    build_query_data_request,
+    parse_query_data_response,
+)
 
 log = logging.getLogger(__name__)
 
@@ -327,7 +330,7 @@ class Table:
                if record_batches_queue.get() is None:
                    tasks_running -= 1
 
-        return pa.RecordBatchReader.from_batches(query_data_request.response_schema.arrow_schema, batches_iterator())
+        return pa.RecordBatchReader.from_batches(query_data_request.response_schema, batches_iterator())
 
     def _combine_chunks(self, col):
         if hasattr(col, "combine_chunks"):
vastdb/tests/test_imports.py CHANGED
@@ -1,14 +1,12 @@
-import pytest
-
-from tempfile import NamedTemporaryFile
 import logging
+from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
 import pyarrow.parquet as pq
+import pytest
 
-from vastdb.errors import InvalidArgument, ImportFilesError
 from vastdb import util
-
+from vastdb.errors import ImportFilesError, InvalidArgument
 
 log = logging.getLogger(__name__)
 
vastdb/tests/test_nested.py ADDED
@@ -0,0 +1,28 @@
+import itertools
+
+import pyarrow as pa
+
+from .util import prepare_data
+
+
+def test_nested(session, clean_bucket_name):
+    columns = pa.schema([
+        ('l', pa.list_(pa.int8())),
+        ('m', pa.map_(pa.utf8(), pa.float64())),
+        ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
+    ])
+    expected = pa.table(schema=columns, data=[
+        [[1], [], [2, 3], None],
+        [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
+        [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
+    ])
+
+    with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
+        actual = pa.Table.from_batches(t.select())
+        assert actual == expected
+
+        names = [f.name for f in columns]
+        for n in range(len(names) + 1):
+            for cols in itertools.permutations(names, n):
+                actual = pa.Table.from_batches(t.select(columns=cols))
+                assert actual == expected.select(cols)
vastdb/tests/test_projections.py CHANGED
@@ -1,6 +1,7 @@
-import pyarrow as pa
 import logging
 
+import pyarrow as pa
+
 log = logging.getLogger(__name__)
 
 def test_basic_projections(session, clean_bucket_name):
vastdb/tests/test_sanity.py CHANGED
@@ -1,15 +1,14 @@
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from itertools import cycle
+import contextlib
 import logging
 import threading
-import contextlib
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from itertools import cycle
 
 import pytest
 import requests
 
 import vastdb
 
-
 log = logging.getLogger(__name__)
 
 
vastdb/tests/test_schemas.py CHANGED
@@ -1,5 +1,7 @@
 import pytest
 
+from .. import errors
+
 
 def test_schemas(session, clean_bucket_name):
     with session.transaction() as tx:
@@ -19,6 +21,22 @@ def test_schemas(session, clean_bucket_name):
         assert b.schemas() == []
 
 
+def test_exists(session, clean_bucket_name):
+    with session.transaction() as tx:
+        b = tx.bucket(clean_bucket_name)
+        assert b.schemas() == []
+
+        s = b.create_schema('s1')
+
+        assert b.schemas() == [s]
+        with pytest.raises(errors.SchemaExists):
+            b.create_schema('s1')
+
+        assert b.schemas() == [s]
+        assert b.create_schema('s1', fail_if_exists=False) == s
+        assert b.schemas() == [s]
+
+
 def test_commits_and_rollbacks(session, clean_bucket_name):
     with session.transaction() as tx:
         b = tx.bucket(clean_bucket_name)
vastdb/tests/test_tables.py CHANGED
@@ -1,41 +1,25 @@
-import duckdb
-import pytest
-import threading
+import datetime as dt
+import decimal
+import logging
 import random
+import threading
+from contextlib import closing
+from tempfile import NamedTemporaryFile
+
+import duckdb
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq
-import decimal
-import datetime as dt
-
-from tempfile import NamedTemporaryFile
-from contextlib import contextmanager, closing
-
+import pytest
 from requests.exceptions import HTTPError
-import logging
 
-from ..table import INTERNAL_ROW_ID, QueryConfig
 from .. import errors
-
+from ..table import INTERNAL_ROW_ID, QueryConfig
+from .util import prepare_data
 
 log = logging.getLogger(__name__)
 
 
-@contextmanager
-def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
-    with session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
-        t = s.create_table(table_name, arrow_table.schema)
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        log.debug("row_ids=%s" % row_ids)
-        assert row_ids == list(range(arrow_table.num_rows))
-        yield t
-        t.drop()
-        s.drop()
-
-
-log = logging.getLogger(__name__)
 def test_tables(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int64()),
@@ -86,6 +70,28 @@ def test_tables(session, clean_bucket_name):
         's': ['ccc']
     }
 
+
+def test_exists(session, clean_bucket_name):
+    with session.transaction() as tx:
+        s = tx.bucket(clean_bucket_name).create_schema('s1')
+        assert s.tables() == []
+
+        t = s.create_table('t', pa.schema([('x', pa.int64())]))
+
+        assert s.tables() == [t]
+        with pytest.raises(errors.TableExists):
+            s.create_table('t', pa.schema([('x', pa.int64())]))
+
+        assert s.tables() == [t]
+        assert s.create_table('t', pa.schema([('x', pa.int64())]), fail_if_exists=False) == t
+        assert s.tables() == [t]
+        assert s.create_table('t', pa.schema([('y', pa.int64())]), fail_if_exists=False) == t
+        assert s.tables() == [t]
+        assert s.create_table('t', pa.schema([('x', pa.int64())]), fail_if_exists=False) == t
+        assert s.tables() == [t]
+
+
+
 def test_update_table(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int64()),
@@ -169,7 +175,14 @@ def test_types(session, clean_bucket_name):
         ('d', pa.decimal128(7, 3)),
         ('bin', pa.binary()),
         ('date', pa.date32()),
-        ('ts' ,pa.timestamp('s')),
+        ('t0', pa.time32('s')),
+        ('t3', pa.time32('ms')),
+        ('t6', pa.time64('us')),
+        ('t9', pa.time64('ns')),
+        ('ts0' ,pa.timestamp('s')),
+        ('ts3' ,pa.timestamp('ms')),
+        ('ts6' ,pa.timestamp('us')),
+        ('ts9' ,pa.timestamp('ns')),
     ])
 
     expected = pa.table(schema=columns, data=[
@@ -181,9 +194,17 @@
         ["a", "v", "s"],
         [decimal.Decimal('110.52'), decimal.Decimal('231.15'), decimal.Decimal('3332.44')],
         [b"\x01\x02", b"\x01\x05", b"\x01\x07"],
-        [dt.datetime.now().date(), dt.datetime.now().date(), dt.datetime.now().date()],
-        [dt.datetime.fromtimestamp(10000), dt.datetime.fromtimestamp(100), dt.datetime.fromtimestamp(0)]
+        [dt.date(2024, 4, 10), dt.date(2024, 4, 11), dt.date(2024, 4, 12)],
+        [dt.time(12, 34, 56), dt.time(12, 34, 57), dt.time(12, 34, 58)],
+        [dt.time(12, 34, 56, 789000), dt.time(12, 34, 57, 789000), dt.time(12, 34, 58, 789000)],
+        [dt.time(12, 34, 56, 789789), dt.time(12, 34, 57, 789789), dt.time(12, 34, 58, 789789)],
+        [dt.time(12, 34, 56, 789789), dt.time(12, 34, 57, 789789), dt.time(12, 34, 58, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56), dt.datetime(2025, 4, 10, 12, 34, 56), dt.datetime(2026, 4, 10, 12, 34, 56)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789000), dt.datetime(2025, 4, 10, 12, 34, 56, 789000), dt.datetime(2026, 4, 10, 12, 34, 56, 789000)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
     ])
+
 
     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
         def select(predicate):
@@ -197,7 +218,33 @@
         assert select(t['s'] == "v") == expected.filter(pc.field('s') == "v")
         assert select(t['d'] == 231.15) == expected.filter(pc.field('d') == 231.15)
         assert select(t['bin'] == b"\x01\x02") == expected.filter(pc.field('bin') == b"\x01\x02")
-        assert select(t['date'] == dt.datetime.now().date()) == expected.filter(pc.field('date') == dt.datetime.now().date())
+
+        date_literal = dt.date(2024, 4, 10)
+        assert select(t['date'] == date_literal) == expected.filter(pc.field('date') == date_literal)
+
+        time_literal = dt.time(12, 34, 56)
+        assert select(t['t0'] == time_literal) == expected.filter(pc.field('t0') == time_literal)
+
+        time_literal = dt.time(12, 34, 56, 789000)
+        assert select(t['t3'] == time_literal) == expected.filter(pc.field('t3') == time_literal)
+
+        time_literal = dt.time(12, 34, 56, 789789)
+        assert select(t['t6'] == time_literal) == expected.filter(pc.field('t6') == time_literal)
+
+        time_literal = dt.time(12, 34, 56, 789789)
+        assert select(t['t9'] == time_literal) == expected.filter(pc.field('t9') == time_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56)
+        assert select(t['ts0'] == ts_literal) == expected.filter(pc.field('ts0') == ts_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789000)
+        assert select(t['ts3'] == ts_literal) == expected.filter(pc.field('ts3') == ts_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+        assert select(t['ts6'] == ts_literal) == expected.filter(pc.field('ts6') == ts_literal)
+
+        ts_literal = dt.datetime(2024, 4, 10, 12, 34, 56, 789789)
+        assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)
 
 
 def test_filters(session, clean_bucket_name):
vastdb/tests/util.py ADDED
@@ -0,0 +1,18 @@
+import logging
+from contextlib import contextmanager
+
+log = logging.getLogger(__name__)
+
+
+@contextmanager
+def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
+    with session.transaction() as tx:
+        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
+        t = s.create_table(table_name, arrow_table.schema)
+        row_ids_array = t.insert(arrow_table)
+        row_ids = row_ids_array.to_pylist()
+        log.debug("row_ids=%s" % row_ids)
+        assert row_ids == list(range(arrow_table.num_rows))
+        yield t
+        t.drop()
+        s.drop()
vastdb/transaction.py CHANGED
@@ -6,13 +6,12 @@ A transcation is used as a context manager, since every Database-related operati
     tx.bucket("bucket").create_schema("schema")
 """
 
-from . import bucket, errors, session
+import logging
+from dataclasses import dataclass
 
 import botocore
 
-from dataclasses import dataclass
-import logging
-
+from . import bucket, errors, session
 
 
 log = logging.getLogger(__name__)
vastdb/util.py CHANGED
@@ -6,13 +6,14 @@ import pyarrow.parquet as pq
 
 from .errors import InvalidArgument
 from .schema import Schema
-from .table import Table
+from .table import ImportConfig, Table
 
 log = logging.getLogger(__name__)
 
 
 def create_table_from_files(
-        schema: Schema, table_name: str, parquet_files: [str], schema_merge_func: Callable = None) -> Table:
+        schema: Schema, table_name: str, parquet_files: [str], schema_merge_func: Callable = None,
+        config: ImportConfig = None) -> Table:
     if not schema_merge_func:
         schema_merge_func = default_schema_merge
     else:
@@ -32,7 +33,7 @@
     table = schema.create_table(table_name, current_schema)
 
     log.info("Starting import of %d files to table: %s", len(parquet_files), table)
-    table.import_files(parquet_files)
+    table.import_files(parquet_files, config=config)
     log.info("Finished import of %d files to table: %s", len(parquet_files), table)
     return table
 
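The new config argument threads an ImportConfig through to Table.import_files(); the benchmark at the top of this diff uses it to raise import concurrency. A minimal sketch (the paths and the schema object are placeholders):

    from vastdb import util
    from vastdb.table import ImportConfig

    files = ['/data/part-0.pq', '/data/part-1.pq']  # placeholder Parquet paths
    config = ImportConfig(import_concurrency=8)     # same knob as in vastdb/bench/test_perf.py
    table = util.create_table_from_files(schema, 't1', files, config=config)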
vastdb-0.1.0.dist-info/METADATA → vastdb-0.1.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 0.1.0
+Version: 0.1.1
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
vastdb-0.1.0.dist-info/RECORD → vastdb-0.1.1.dist-info/RECORD RENAMED
@@ -149,23 +149,27 @@ vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI
 vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
 vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/__init__.py,sha256=GY30IfZQApfl7HfcFmfTzFpx48oHgQIrDcUQCiTnxpo,206
-vastdb/bucket.py,sha256=Xbgn5Ns7veBL4oKH7EaSj4SxTPDRlicl9Saaz_39ZsU,2526
-vastdb/errors.py,sha256=mveQ2O0fLKOS51V9k5Y-HwY8Y1XiYdE9aJ9j0wlltWQ,3119
-vastdb/internal_commands.py,sha256=FR4rkr-sRvqMc-Y5hW7APOPa41a8d7L8DNJ2ROHRaFM,104441
-vastdb/schema.py,sha256=TbgqaUqAraj43vKCyVQNUSOMuJdw2Y4M06svs5jpcPo,2836
-vastdb/session.py,sha256=qgjT9rt1yUr4GyHOZRsVGFj3HYBoHFAEdczi_R26O8o,1731
-vastdb/table.py,sha256=p0uE0Gw9hen7hLTx9xC_MbxTaG6ZhZEFieaenUnbyUY,20442
-vastdb/transaction.py,sha256=jleiVmg4iui2q1GqMFsPo5GZRNtIa5NzGLFVooBbzkQ,1797
-vastdb/util.py,sha256=pBw4ywNJfkvKik-T7ZKPrWBoZOqGns-WsSZkG7HHa2I,2908
+vastdb/bucket.py,sha256=5J8KBdRViaz5bZ8WEPggQj7DfJaIhY7CqpoWP6reaDo,2854
+vastdb/conftest.py,sha256=pKpo_46Vq4QHzTDQAFxasrVhnZ2V2L-y6IMLxojxaFM,2132
+vastdb/errors.py,sha256=wCJp70QyBW8US2KMwhB6e5ZnKRft4GiN8jyJ36f1Yuo,3315
+vastdb/internal_commands.py,sha256=rmxOjIq229gsxFFZ4nKXwVIFJcu8WR1DVsE-91w4-BY,101564
+vastdb/schema.py,sha256=x9Yn4tFTFkSpJbQqpqlecKUSOK214XsRLdOUrNW0jzM,3192
+vastdb/session.py,sha256=VZOFGZbAdr5Tl4cp88VRQYnR4Q16UNuYjSmX_QPW1II,1718
+vastdb/table.py,sha256=eALN5YpUfDFqZNF_lp6lZD5RJkBKqp5Mlc6hpwGI8Rg,20443
+vastdb/transaction.py,sha256=2I5k81YvcgDsp07BrAWkmXf41qUP6m88Y40rFfuIKvI,1796
+vastdb/util.py,sha256=VR0UJ1D0WUpqS5edG_mkxDZYZJ_qqce8y7iJOvqeyWE,2974
+vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vastdb/bench/test_perf.py,sha256=X7BIo60L5Oj7H-56e8pDFtXY9rNLerkywKexXWiqvrY,1111
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/tests/conftest.py,sha256=VLqImQ1XMr1FLCLCqZpgv8wMjNskfYAtMp-qjZFBqWo,1694
-vastdb/tests/test_imports.py,sha256=OhkbuhTeLgD4I6Vbub-B7wQo-G37TlXoHVQhGCLz9Wo,5035
-vastdb/tests/test_projections.py,sha256=PRi1Jf__95fsL9ZCQ_s2PtszWIO5FIFbniiL6FnV18M,1253
-vastdb/tests/test_sanity.py,sha256=gijOWK4ymGhVRHkf0ecHibVlaJxl92RinPdFUwWj1OQ,2959
-vastdb/tests/test_schemas.py,sha256=YX0lF8FbXzNCNVUAxq3g0L0OCjGq1OwkQaNaBtzDe4Q,1253
-vastdb/tests/test_tables.py,sha256=qnDRBtOPh9qcV4O3kB6xF4WJFj3B3WX2RAmUzkzl05g,23634
-vastdb-0.1.0.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
-vastdb-0.1.0.dist-info/METADATA,sha256=pCY34hVZGzoB51SyIgrzrtxDaOXC_2DGOQbafZdOmQg,1331
-vastdb-0.1.0.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
-vastdb-0.1.0.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
-vastdb-0.1.0.dist-info/RECORD,,
+vastdb/tests/test_imports.py,sha256=fDUjO5U-5i4QTIMoNnSSW4X_ZnOStLbx0mJkNq2pj9Q,5033
+vastdb/tests/test_nested.py,sha256=3kejEvtSqV0LrUgb1QglRjrlxnKI4_AXTFw2nE7Q520,951
+vastdb/tests/test_projections.py,sha256=0ZiFya6rzGvnKOrdb1xxxv-BEerNmiK_ymfZM6eIvvw,1254
+vastdb/tests/test_sanity.py,sha256=kaOmZWDGBc-XhZ8eFQ3sks2Mo9De8q41Z5pqYWzJsHM,2958
+vastdb/tests/test_schemas.py,sha256=8ZlEvnU7Fyg-TDQDxD65GAql4rU8R2_SFWVGrdv564o,1721
+vastdb/tests/test_tables.py,sha256=o_JPqr2GX1DDpPB4Zq4E1YPFgmlsiXyVe1S3TcCjF-w,26226
+vastdb/tests/util.py,sha256=_euE3fKJqgNssT9gVxlcHjdE61mnsNQcwDPzn1tTe9g,597
+vastdb-0.1.1.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-0.1.1.dist-info/METADATA,sha256=e84OEOXS09DEXniHJAU2aeK80-1h2rIZmYNBCMLa1AM,1331
+vastdb-0.1.1.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+vastdb-0.1.1.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-0.1.1.dist-info/RECORD,,