vastdb 0.1.10__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/__init__.py CHANGED
@@ -1,9 +1,6 @@
 """VAST Database Python SDK."""

 import functools
-import importlib.metadata
-
-__version__ = importlib.metadata.distribution(__package__).version

 from . import session

@@ -12,3 +9,9 @@ from . import session
 @functools.wraps(session.Session)
 def connect(*args, **kwargs):  # noqa: D103
     return session.Session(*args, **kwargs)
+
+
+def version():
+    """Return VAST DB SDK version."""
+    import importlib.metadata
+    return importlib.metadata.distribution(__package__).version
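Note: the eager module-level `__version__` is replaced by a lazy `version()` helper, so importing `vastdb` no longer reads distribution metadata at import time. A minimal sketch of the new call:

    import vastdb

    print(vastdb.version())  # resolved on demand via importlib.metadata, e.g. "1.0.0"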
vastdb/_internal.py CHANGED
@@ -5,9 +5,8 @@ import re
 import struct
 import urllib.parse
 from collections import defaultdict, namedtuple
-from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

 import backoff
 import flatbuffers
@@ -104,6 +103,7 @@ from vast_flatbuf.tabular.ListSchemasResponse import ListSchemasResponse as list_schemas
 from vast_flatbuf.tabular.ListTablesResponse import ListTablesResponse as list_tables

 from . import errors
+from .config import BackoffConfig

 UINT64_MAX = 18446744073709551615

@@ -742,46 +742,38 @@ def _backoff_giveup(exc: Exception) -> bool:
     return True  # give up in case of other exceptions


-@dataclass
-class BackoffConfig:
-    wait_gen: Callable = field(default=backoff.expo)
-    max_value: Optional[float] = None  # max duration for a single wait period
-    max_tries: int = 10
-    max_time: float = 60.0  # in seconds
-    backoff_log_level: int = logging.DEBUG
-
-
 class VastdbApi:
     # we expect the vast version to be <major>.<minor>.<patch>.<protocol>
     VAST_VERSION_REGEX = re.compile(r'^vast (\d+\.\d+\.\d+\.\d+)$')

     def __init__(self, endpoint, access_key, secret_key,
                  *,
-                 auth_type=AuthType.SIGV4,
                  ssl_verify=True,
+                 timeout=None,
                  backoff_config: Optional[BackoffConfig] = None):

-        from . import __version__  # import lazily here (to avoid circular dependencies)
-        self.client_sdk_version = f"VAST Database Python SDK {__version__} - 2024 (c)"
+        from . import version  # import lazily here (to avoid circular dependencies)
+        self.client_sdk_version = f"VAST Database Python SDK {version()} - 2024 (c)"

         url = urllib3.util.parse_url(endpoint)
         self.access_key = access_key
         self.secret_key = secret_key

+        self.timeout = timeout
         self.default_max_list_columns_page_size = 1000
         self._session = requests.Session()
         self._session.verify = ssl_verify
         self._session.headers['user-agent'] = self.client_sdk_version

-        backoff_config = backoff_config or BackoffConfig()
+        self.backoff_config = backoff_config or BackoffConfig()
         self._backoff_decorator = backoff.on_exception(
-            wait_gen=backoff_config.wait_gen,
+            wait_gen=self.backoff_config.wait_gen,
             exception=_RETRIABLE_EXCEPTIONS,
             giveup=_backoff_giveup,
-            max_tries=backoff_config.max_tries,
-            max_time=backoff_config.max_time,
-            max_value=backoff_config.max_value,  # passed to `backoff_config.wait_gen`
-            backoff_log_level=backoff_config.backoff_log_level)
+            max_tries=self.backoff_config.max_tries,
+            max_time=self.backoff_config.max_time,
+            max_value=self.backoff_config.max_value,  # passed to `self.backoff_config.wait_gen`
+            backoff_log_level=self.backoff_config.backoff_log_level)
         self._request = self._backoff_decorator(self._single_request)

         if url.port in {80, 443, None}:
@@ -819,10 +811,18 @@ class VastdbApi:
         _logger.critical(msg)
         raise NotImplementedError(msg)

+    def with_endpoint(self, endpoint):
+        return VastdbApi(endpoint=endpoint,
+                         access_key=self.access_key,
+                         secret_key=self.secret_key,
+                         ssl_verify=self._session.verify,
+                         timeout=self.timeout,
+                         backoff_config=self.backoff_config)
+
     def _single_request(self, *, method, url, skip_status_check=False, **kwargs):
-        _logger.debug("Sending request: %s %s %s", method, url, kwargs)
+        _logger.debug("Sending request: %s %s %s timeout=%s", method, url, kwargs, self.timeout)
         try:
-            res = self._session.request(method=method, url=url, **kwargs)
+            res = self._session.request(method=method, url=url, timeout=self.timeout, **kwargs)
         except requests.exceptions.ConnectionError as err:
             # low-level connection issue, it is safe to retry only read-only requests
             may_retry = (method == "GET")
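For context, the retry wiring above uses the `backoff` library; keyword arguments such as `max_value` are forwarded to the wait generator (`backoff.expo` by default). A standalone sketch of the same pattern, with a hypothetical `fetch` function standing in for `_single_request`:

    import logging

    import backoff
    import requests

    def fetch(url):
        # hypothetical request function; retried on connection errors
        return requests.get(url, timeout=5.0)

    retrying_fetch = backoff.on_exception(
        wait_gen=backoff.expo,   # exponential backoff between attempts
        exception=requests.exceptions.ConnectionError,
        max_tries=10,            # defaults mirroring BackoffConfig above
        max_time=60.0,           # overall retry budget, in seconds
        max_value=8.0,           # forwarded to backoff.expo: caps a single wait period
        backoff_log_level=logging.DEBUG)(fetch)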
vastdb/bench/test_perf.py CHANGED
@@ -10,11 +10,11 @@ log = logging.getLogger(__name__)


 @pytest.mark.benchmark
-def test_bench(session, clean_bucket_name, parquets_path, crater_path):
+def test_bench(session, bucket_name, parquets_path, crater_path):
     files = [str(parquets_path / f) for f in (parquets_path.glob('**/*.pq'))]

     with session.transaction() as tx:
-        b = tx.bucket(clean_bucket_name)
+        b = tx.bucket(bucket_name)
         s = b.create_schema('s1')
         t = util.create_table_from_files(s, 't1', files, config=ImportConfig(import_concurrency=8))
         config = QueryConfig(num_splits=8, num_sub_splits=4)
vastdb/bench/test_sample.py CHANGED
@@ -22,7 +22,8 @@ logging.basicConfig(

 log = logging.getLogger()

-log.info("Python SDK version: %s", vastdb.util.version())
+sdk_version = vastdb.version()
+log.info("Python SDK version: %s", sdk_version)

 NUM_COLUMNS = 10_000
 COLUMNS_BATCH = 10
@@ -63,6 +64,8 @@ def load_batch(bucket, session_kwargs, offset, limit):
         # skip already loaded rows
         log.info('skipping [%d..%d)', offset, limit)

+    pid = os.getpid()
+    tid = threading.get_native_id()
     total_nbytes = 0
     calls = 0
     t0 = time.time()
@@ -89,7 +92,7 @@ def load_batch(bucket, session_kwargs, offset, limit):
             metrics_rows.append(metrics.Row(
                 start=start, finish=finish, table_path=table.path, op=op,
                 nbytes=nbytes, rows=len(chunk), cols=len(cols_batch),
-                pid=os.getpid(), tid=threading.get_native_id()))
+                pid=pid, tid=tid, sdk_version=sdk_version))

             total_nbytes += nbytes
             calls += 1
@@ -104,12 +107,12 @@ def load_batch(bucket, session_kwargs, offset, limit):
     return metrics_rows


-def test_ingest(clean_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):
+def test_ingest(test_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):
     session = vastdb.connect(**session_kwargs)
     metrics_table = metrics.Table(perf_metrics_db, "ingest")

     with session.transaction() as tx:
-        b = tx.bucket(clean_bucket_name)
+        b = tx.bucket(test_bucket_name)
         try:
             s = b.schema(SCHEMA)
         except vastdb.errors.MissingSchema:
@@ -127,7 +130,7 @@ def test_ingest(clean_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):

     with ProcessPoolExecutor(max_workers=num_workers) as executor:
         futures = [
-            executor.submit(load_batch, clean_bucket_name, session_kwargs | {'endpoint': url}, offset, limit)
+            executor.submit(load_batch, test_bucket_name, session_kwargs | {'endpoint': url}, offset, limit)
             for (offset, limit), url in zip(ranges, itertools.cycle(tabular_endpoint_urls))
         ]
         log.info("spawned %d futures", len(futures))
@@ -135,7 +138,7 @@ def test_ingest(clean_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):
             metrics_table.insert(future.result())

     with session.transaction() as tx:
-        t = tx.bucket(clean_bucket_name).schema(SCHEMA).table(TABLE)
+        t = tx.bucket(test_bucket_name).schema(SCHEMA).table(TABLE)
         count = sum(len(rb) for rb in t.select([]))
         log.info("%s has %d rows: %s", t, count, t.stats)

@@ -154,6 +157,10 @@ def run_query(session_kwargs, i, bucket_name, endpoint_url):
     r = random.Random(i)
     r.shuffle(row_group_indices)

+    pid = os.getpid()
+    tid = threading.get_native_id()
+    metrics_rows = []
+
     session = vastdb.connect(**(session_kwargs | {"endpoint": endpoint_url}))
     with session.transaction() as tx:
         t = tx.bucket(bucket_name).schema(SCHEMA).table(TABLE)
@@ -174,23 +181,37 @@ def run_query(session_kwargs, i, bucket_name, endpoint_url):

     for j, pred in enumerate(preds):
         log.info("%d) starting query #%d on %s", i, j, endpoint_url)
-        t0 = time.time()
+
+        start = time.perf_counter()
         res = t.select(columns=cols, predicate=pred, config=config)
         rows = 0
         data = 0
         for rb in res:
             rows += len(rb)
             data += rb.nbytes
-            dt = time.time() - t0
+            dt = time.perf_counter() - start
             log.info("%d) got query #%d batch %.3f[s], %.3f[GB] %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)

-        dt = time.time() - t0
+        finish = time.perf_counter()
+        dt = finish - start
         log.info("%d) finished query #%d %.3f[s], %.3f[GB], %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)

+        metrics_rows.append(metrics.Row(
+            start=start, finish=finish, table_path=t.path, op="select",
+            nbytes=data, rows=rows, cols=len(cols),
+            pid=pid, tid=tid, sdk_version=sdk_version))
+
+
+def test_scan(test_bucket_name, session, num_workers, session_kwargs, tabular_endpoint_urls, perf_metrics_db):
+    metrics_table = metrics.Table(perf_metrics_db, "query")

-def test_scan(test_bucket_name, session, num_workers, session_kwargs, tabular_endpoint_urls):
     log.info("starting %d workers, endpoints=%s", num_workers, tabular_endpoint_urls)
     with ProcessPoolExecutor(max_workers=num_workers) as executor:
-        for i, url in zip(range(num_workers), itertools.cycle(tabular_endpoint_urls)):
+        futures = [
             executor.submit(run_query, session_kwargs, i, test_bucket_name, url)
+            for i, url in zip(range(num_workers), itertools.cycle(tabular_endpoint_urls))
+        ]
+        for future in as_completed(futures):
+            metrics_table.insert(future.result())
+
     log.info("finished %d workers", num_workers)
vastdb/config.py ADDED
@@ -0,0 +1,65 @@
+"""Configuration-related dataclasses."""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Callable, List, Optional
+
+import backoff
+
+
+@dataclass
+class BackoffConfig:
+    """Retry configuration."""
+
+    wait_gen: Callable = field(default=backoff.expo)
+    max_value: Optional[float] = None  # max duration for a single wait period
+    max_tries: int = 10
+    max_time: float = 60.0  # in seconds
+    backoff_log_level: int = logging.DEBUG
+
+
+@dataclass
+class QueryConfig:
+    """Query execution configuration."""
+
+    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
+    num_sub_splits: int = 4
+
+    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
+    # will be estimated from the table's row count, if not explicitly set
+    num_splits: Optional[int] = None
+
+    # each endpoint will be handled by a separate worker thread
+    # a single endpoint can be specified more than once to benefit from multithreaded execution
+    data_endpoints: Optional[List[str]] = None
+
+    # a subsplit fiber will finish after sending this number of rows back to the client
+    limit_rows_per_sub_split: int = 128 * 1024
+
+    # each fiber will read the following number of row groups continuously before skipping
+    # in order to use semi-sorted projections, this value must be 8 (the hard-coded number of row groups per row block)
+    num_row_groups_per_sub_split: int = 8
+
+    # can be disabled for benchmarking purposes
+    use_semi_sorted_projections: bool = True
+
+    # enforce using a specific semi-sorted projection (if enabled above)
+    semi_sorted_projection_name: Optional[str] = None
+
+    # used to estimate the number of splits, given the table's row count
+    rows_per_split: int = 4000000
+
+    # used for worker threads' naming
+    query_id: str = ""
+
+    # non-negative integer, used for server-side prioritization of queued requests:
+    # - requests with lower values will be served before requests with higher values.
+    # - if unset, the request will be added to the queue's end.
+    queue_priority: Optional[int] = None
+
+
+@dataclass
+class ImportConfig:
+    """Import execution configuration."""
+
+    import_concurrency: int = 2
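These dataclasses consolidate `QueryConfig`/`ImportConfig` (moved from `vastdb/table.py`) and `BackoffConfig` (moved from `vastdb/_internal.py`) into a single module. A short usage sketch, assuming a reachable endpoint and an existing bucket/schema/table (all names below are placeholders):

    import vastdb
    from vastdb.config import BackoffConfig, QueryConfig

    session = vastdb.connect(
        endpoint="http://localhost:9090",  # placeholder endpoint
        access="...", secret="...",
        backoff_config=BackoffConfig(max_tries=3))

    with session.transaction() as tx:
        table = tx.bucket("bucket").schema("s").table("t")
        config = QueryConfig(num_splits=8, num_sub_splits=4)  # as in the benchmark above
        result = table.select(config=config).read_all()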
vastdb/errors.py CHANGED
@@ -3,7 +3,6 @@ import xml.etree.ElementTree
 from dataclasses import dataclass
 from enum import Enum

-import pyarrow as pa
 import requests


@@ -170,11 +169,6 @@ class NotSupportedCommand(NotSupported):
     table: str


-@dataclass
-class NotSupportedType(NotSupported):
-    field: pa.Field
-
-
 @dataclass
 class NotSupportedVersion(NotSupported):
     err_msg: str
vastdb/features.py ADDED
@@ -0,0 +1,42 @@
+"""Version-dependent features."""
+
+import logging
+
+from .errors import NotSupportedVersion
+
+log = logging.getLogger()
+
+
+class Features:
+    """VAST database features - check whether the server already supports a feature."""
+
+    def __init__(self, vast_version):
+        """Save the server version."""
+        self.vast_version = vast_version
+
+        self.check_imports_table = self._check(
+            "Imported objects' table feature requires 5.2+ VAST release",
+            vast_version >= (5, 2))
+
+        self.check_return_row_ids = self._check(
+            "Returning row IDs requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+        self.check_enforce_semisorted_projection = self._check(
+            "Semi-sorted projection enforcement requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+        self.check_external_row_ids_allocation = self._check(
+            "External row IDs allocation requires 5.1+ VAST release",
+            vast_version >= (5, 1))
+
+    def _check(self, msg, supported):
+        log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
+        if not supported:
+            def fail():
+                raise NotSupportedVersion(msg, self.vast_version)
+            return fail
+
+        def noop():
+            pass
+        return noop
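The class is extracted as-is from `vastdb/session.py` (only the exception import path changes). Each `check_*` attribute is bound to a callable: a no-op when the server supports the feature, otherwise a closure that raises. A small sketch of the resulting gating behavior, with a hypothetical server version tuple:

    from vastdb.errors import NotSupportedVersion
    from vastdb.features import Features

    features = Features(vast_version=(5, 1, 0, 100))  # hypothetical <major>.<minor>.<patch>.<protocol>

    features.check_return_row_ids()     # no-op: supported on 5.1+
    try:
        features.check_imports_table()  # raises: requires 5.2+
    except NotSupportedVersion:
        pass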
vastdb/schema.py CHANGED
@@ -10,7 +10,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional

 import pyarrow as pa

-from . import bucket, errors, schema, table, util
+from . import bucket, errors, schema, table

 if TYPE_CHECKING:
     from .table import Table
@@ -86,7 +86,6 @@ class Schema:
         if use_external_row_ids_allocation:
             self.tx._rpc.features.check_external_row_ids_allocation()

-        util.check_supported_types(columns)
         self.tx._rpc.api.create_table(self.bucket.name, self.name, table_name, columns, txid=self.tx.txid,
                                       use_external_row_ids_allocation=use_external_row_ids_allocation)
         log.info("Created table: %s", table_name)
vastdb/session.py CHANGED
@@ -7,51 +7,11 @@ For more details see:
 - [Tabular identity policy with the proper permissions](https://support.vastdata.com/s/article/UUID-14322b60-d6a2-89ac-3df0-3dfbb6974182)
 """

-import logging
 import os
-from typing import Optional
+from typing import TYPE_CHECKING, Optional

-import boto3
-
-from . import _internal, errors, transaction
-from ._internal import BackoffConfig
-
-log = logging.getLogger()
-
-
-class Features:
-    """VAST database features - check if server is already support a feature."""
-
-    def __init__(self, vast_version):
-        """Save the server version."""
-        self.vast_version = vast_version
-
-        self.check_imports_table = self._check(
-            "Imported objects' table feature requires 5.2+ VAST release",
-            vast_version >= (5, 2))
-
-        self.check_return_row_ids = self._check(
-            "Returning row IDs requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-        self.check_enforce_semisorted_projection = self._check(
-            "Semi-sorted projection enforcement requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-        self.check_external_row_ids_allocation = self._check(
-            "External row IDs allocation requires 5.1+ VAST release",
-            vast_version >= (5, 1))
-
-    def _check(self, msg, supported):
-        log.debug("%s (current version is %s): supported=%s", msg, self.vast_version, supported)
-        if not supported:
-            def fail():
-                raise errors.NotSupportedVersion(msg, self.vast_version)
-            return fail
-
-        def noop():
-            pass
-        return noop
+if TYPE_CHECKING:
+    from .config import BackoffConfig


 class Session:
@@ -60,8 +20,13 @@ class Session:
     def __init__(self, access=None, secret=None, endpoint=None,
                  *,
                  ssl_verify=True,
-                 backoff_config: Optional[BackoffConfig] = None):
+                 timeout=None,
+                 backoff_config: Optional["BackoffConfig"] = None):
         """Connect to a VAST Database endpoint, using specified credentials."""
+        import boto3
+
+        from . import _internal, features
+
         if access is None:
             access = os.environ['AWS_ACCESS_KEY_ID']
         if secret is None:
@@ -74,8 +39,9 @@ class Session:
             access_key=access,
             secret_key=secret,
             ssl_verify=ssl_verify,
+            timeout=timeout,
             backoff_config=backoff_config)
-        self.features = Features(self.api.vast_version)
+        self.features = features.Features(self.api.vast_version)
         self.s3 = boto3.client('s3',
                                aws_access_key_id=access,
                                aws_secret_access_key=secret,
@@ -93,4 +59,5 @@ class Session:
         with session.transaction() as tx:
             tx.bucket("bucket").create_schema("schema")
         """
+        from . import transaction
         return transaction.Transaction(self)
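Besides moving `Features` out and deferring the `boto3`/`_internal` imports into the constructor, this adds a `timeout` keyword that `VastdbApi` forwards to every `requests` call. A hedged sketch (endpoint and credentials are placeholders):

    import vastdb

    session = vastdb.connect(
        endpoint="http://localhost:9090",  # placeholder endpoint
        access="...", secret="...",
        timeout=30.0)  # per-request timeout in seconds; None (the default) keeps the old behavior

    with session.transaction() as tx:
        tx.bucket("bucket").create_schema("schema")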
vastdb/table.py CHANGED
@@ -14,6 +14,7 @@ import pyarrow as pa
 import urllib3

 from . import _internal, errors, schema, util
+from .config import ImportConfig, QueryConfig

 log = logging.getLogger(__name__)

@@ -39,53 +40,6 @@ class TableStats:
     endpoints: Tuple[str, ...] = ()


-@dataclass
-class QueryConfig:
-    """Query execution configiration."""
-
-    # allows server-side parallel processing by issuing multiple reads concurrently for a single RPC
-    num_sub_splits: int = 4
-
-    # used to split the table into disjoint subsets of rows, to be processed concurrently using multiple RPCs
-    # will be estimated from the table's row count, if not explicitly set
-    num_splits: Optional[int] = None
-
-    # each endpoint will be handled by a separate worker thread
-    # a single endpoint can be specified more than once to benefit from multithreaded execution
-    data_endpoints: Optional[List[str]] = None
-
-    # a subsplit fiber will finish after sending this number of rows back to the client
-    limit_rows_per_sub_split: int = 128 * 1024
-
-    # each fiber will read the following number of rowgroups coninuously before skipping
-    # in order to use semi-sorted projections this value must be 8 (this is the hard coded size of a row groups per row block).
-    num_row_groups_per_sub_split: int = 8
-
-    # can be disabled for benchmarking purposes
-    use_semi_sorted_projections: bool = True
-
-    # enforce using a specific semi-sorted projection (if enabled above)
-    semi_sorted_projection_name: Optional[str] = None
-
-    # used to estimate the number of splits, given the table rows' count
-    rows_per_split: int = 4000000
-
-    # used for worker threads' naming
-    query_id: str = ""
-
-    # non-negative integer, used for server-side prioritization of queued requests:
-    # - requests with lower values will be served before requests with higher values.
-    # - if unset, the request will be added to the queue's end.
-    queue_priority: Optional[int] = None
-
-
-@dataclass
-class ImportConfig:
-    """Import execution configiration."""
-
-    import_concurrency: int = 2
-
-
 class SelectSplitState:
     """State of a specific query split execution."""

@@ -299,7 +253,7 @@ class Table:
                 max_workers=config.import_concurrency, thread_name_prefix='import_thread') as pool:
             try:
                 for endpoint in endpoints:
-                    session = _internal.VastdbApi(endpoint, self.tx._rpc.api.access_key, self.tx._rpc.api.secret_key)
+                    session = self.tx._rpc.api.with_endpoint(endpoint)
                     futures.append(pool.submit(import_worker, files_queue, session))

                 log.debug("Waiting for import workers to finish")
@@ -397,7 +351,7 @@ class Table:

         def single_endpoint_worker(endpoint: str):
             try:
-                host_api = _internal.VastdbApi(endpoint=endpoint, access_key=self.tx._rpc.api.access_key, secret_key=self.tx._rpc.api.secret_key)
+                host_api = self.tx._rpc.api.with_endpoint(endpoint)
                 backoff_decorator = self.tx._rpc.api._backoff_decorator
                 while True:
                     check_stop()
@@ -491,7 +445,6 @@ class Table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
         try:
             row_ids = []
-            util.check_supported_types(rows.schema)
             serialized_slices = util.iter_serialized_slices(rows, MAX_INSERT_ROWS_PER_PATCH)
             for slice in serialized_slices:
                 res = self.tx._rpc.api.insert_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -534,7 +487,6 @@ class Table:

         update_rows_rb = util.sort_record_batch_if_needed(update_rows_rb, INTERNAL_ROW_ID)

-        util.check_supported_types(update_rows_rb.schema)
         serialized_slices = util.iter_serialized_slices(update_rows_rb, MAX_ROWS_PER_BATCH)
         for slice in serialized_slices:
             self.tx._rpc.api.update_rows(self.bucket.name, self.schema.name, self.name, record_batch=slice,
@@ -579,7 +531,6 @@ class Table:
         """Add a new column."""
         if self._imports_table:
             raise errors.NotSupportedCommand(self.bucket.name, self.schema.name, self.name)
-        util.check_supported_types(new_column)
         self.tx._rpc.api.add_columns(self.bucket.name, self.schema.name, self.name, new_column, txid=self.tx.txid)
         log.info("Added column(s): %s", new_column)
         self.arrow_schema = self.columns()
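Since `table.py` now re-exports the dataclasses via `from .config import ImportConfig, QueryConfig`, existing imports from `vastdb.table` (as in `test_scale.py` below) should keep resolving to the same classes:

    # both import paths name the same class object after this change
    from vastdb.config import QueryConfig
    from vastdb.table import QueryConfig as LegacyQueryConfig

    assert QueryConfig is LegacyQueryConfig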
vastdb/tests/metrics.py CHANGED
@@ -20,6 +20,7 @@ class Row:
     cols: int
     pid: int
     tid: int
+    sdk_version: str


 class Table:
vastdb/tests/test_sanity.py CHANGED
@@ -24,7 +24,7 @@ def test_bad_credentials(session):


 def test_bad_endpoint(session):
-    backoff_config = vastdb.session.BackoffConfig(max_tries=3)
+    backoff_config = vastdb.config.BackoffConfig(max_tries=3)
     with pytest.raises(vastdb.errors.ConnectionError):
         vastdb.connect(access='BAD', secret='BAD', endpoint='http://invalid-host-name-for-tests:12345', backoff_config=backoff_config)

vastdb/tests/test_tables.py CHANGED
@@ -227,6 +227,35 @@ def test_select_with_priority(session, clean_bucket_name):
         t.select(config=config).read_all()


+def test_timezones(session, clean_bucket_name):
+    columns_with_tz = pa.schema([
+        ('ts0', pa.timestamp('s', tz='+00:00')),
+        ('ts3', pa.timestamp('ms', tz='UTC')),
+        ('ts6', pa.timestamp('us', tz='GMT')),
+        ('ts9', pa.timestamp('ns', tz='Universal')),
+    ])
+
+    # currently timezone information is not stored
+    columns_without_tz = pa.schema([
+        ('ts0', pa.timestamp('s')),
+        ('ts3', pa.timestamp('ms')),
+        ('ts6', pa.timestamp('us')),
+        ('ts9', pa.timestamp('ns')),
+    ])
+
+    data = [
+        [dt.datetime(2024, 4, 10, 12, 34, 56), dt.datetime(2025, 4, 10, 12, 34, 56), dt.datetime(2026, 4, 10, 12, 34, 56)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789000), dt.datetime(2025, 4, 10, 12, 34, 56, 789000), dt.datetime(2026, 4, 10, 12, 34, 56, 789000)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+        [dt.datetime(2024, 4, 10, 12, 34, 56, 789789), dt.datetime(2025, 4, 10, 12, 34, 56, 789789), dt.datetime(2026, 4, 10, 12, 34, 56, 789789)],
+    ]
+
+    inserted = pa.table(schema=columns_with_tz, data=data)
+    with prepare_data(session, clean_bucket_name, 's', 't', inserted) as table:
+        assert table.arrow_schema == columns_without_tz
+        assert table.select().read_all() == pa.table(schema=columns_without_tz, data=data)
+
+
 def test_types(session, clean_bucket_name):
     columns = pa.schema([
         ('tb', pa.bool_()),
@@ -311,46 +340,6 @@ def test_types(session, clean_bucket_name):
     assert select(t['ts9'] == ts_literal) == expected.filter(pc.field('ts9') == ts_literal)


-TIMESTAMP_UNITS = ['s', 'ms', 'us', 'ns']
-
-
-def test_unsupported_timezone(session, clean_bucket_name):
-    with session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).create_schema('s1')
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-            with pytest.raises(errors.NotSupportedType):
-                s.create_table('t1', pa.schema([('ts', col_type)]))
-        assert s.tables() == []
-
-        cols = [('c', pa.int64())]
-        t1 = s.create_table('t1', pa.schema(cols))
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-            with pytest.raises(errors.NotSupportedType):
-                t1.add_column(pa.schema([('ts', col_type)]))
-
-        cols = [(f'c_{unit}', pa.timestamp(unit)) for unit in TIMESTAMP_UNITS]
-        t2 = s.create_table('t2', pa.schema(cols))
-
-        for unit in TIMESTAMP_UNITS:
-            col_type = pa.timestamp(unit, 'UTC')
-
-            rb = pa.record_batch(
-                data=[[None]],
-                schema=pa.schema([(f'c_{unit}', col_type)]))
-            with pytest.raises(errors.NotSupportedType):
-                t2.insert(rb)
-
-            rb = pa.record_batch(
-                data=[[0], [None]],
-                schema=pa.schema([
-                    (INTERNAL_ROW_ID, pa.uint64()),
-                    (f'c_{unit}', col_type)]))
-            with pytest.raises(errors.NotSupportedType):
-                t2.update(rb)
-
-
 def test_filters(session, clean_bucket_name):
     columns = pa.schema([
         ('a', pa.int32()),
vastdb/util.py CHANGED
@@ -1,4 +1,3 @@
-import importlib
 import logging
 import re
 from typing import TYPE_CHECKING, Callable, List, Optional, Union
@@ -7,7 +6,7 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.parquet as pq

-from .errors import InvalidArgument, NotSupportedType, TooWideRow
+from .errors import InvalidArgument, TooWideRow

 log = logging.getLogger(__name__)

@@ -153,14 +152,3 @@ def sort_record_batch_if_needed(record_batch, sort_column):
         return record_batch.sort_by(sort_column)
     else:
         return record_batch
-
-
-def check_supported_types(fields: pa.Schema):
-    for f in fields:
-        if isinstance(f.type, pa.TimestampType):
-            if f.type.tz:
-                raise NotSupportedType(f)
-
-
-def version():
-    return importlib.metadata.distribution('vastdb').version
vastdb/vast_tests/test_scale.py ADDED
@@ -0,0 +1,68 @@
+import logging
+import random
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import pyarrow as pa
+
+from vastdb.table import QueryConfig
+
+logger = logging.getLogger(__name__)
+
+
+def test_concurrent_query(session, test_bucket_name, schema_name, table_name):
+    """
+    This test runs several selective queries in parallel. It is used to check various internal VAST scenarios.
+    """
+    amount_of_queries_in_parallel = 10  # due to limit on requests connection-pool
+    config = QueryConfig(num_splits=1, num_sub_splits=1)
+
+    def _execute_single_query():
+        with session.transaction() as tx:
+            t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+            pred = (t["a"] == 0)  # 0 is in the min-max range
+            s = time.time()
+            t.select(config=config, predicate=pred).read_all()
+            e = time.time()
+            logger.info(f"Query took {e - s}")
+
+    logger.info(f"about to submit {amount_of_queries_in_parallel} queries in parallel")
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(_execute_single_query) for _ in range(amount_of_queries_in_parallel)]
+        for future in futures:
+            future.result()
+    logger.info(f"finished running {amount_of_queries_in_parallel} queries")
+
+
+def test_table_stats(session, test_bucket_name, schema_name, table_name):
+    """
+    Testing stats integrity while altering table
+    """
+    NUM_TIMES_TO_INSERT = 1000
+    seed = random.randint(0, 10)
+    logger.info(f"random seed is {seed}")
+    r = random.Random(seed)
+
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        initial_stat = t.get_stats()
+        table_fields = t.columns()
+
+    rand_values = {}  # create a dict with a random value from each column
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        for col in table_fields:
+            res = t.select(columns=[col.name]).read_all().column(col.name)
+            rand_values[col.name] = res[int(r.uniform(0, len(res)))].as_py()
+
+    logger.info(f"rand row to insert to the table - {rand_values}, {NUM_TIMES_TO_INSERT} times")
+    rb = pa.RecordBatch.from_pylist([rand_values] * NUM_TIMES_TO_INSERT)
+    with session.transaction() as tx:
+        t = tx.bucket(test_bucket_name).schema(schema_name).table(table_name)
+        t.insert(rb)
+        time.sleep(2)  # waiting for stats to get updated
+        new_stat = t.get_stats()
+
+    logger.info("inserted to table")
+    assert new_stat.size_in_bytes != initial_stat.size_in_bytes
+    assert new_stat.num_rows - NUM_TIMES_TO_INSERT == initial_stat.num_rows
vastdb-0.1.10.dist-info/METADATA → vastdb-1.0.0.dist-info/METADATA RENAMED
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: vastdb
-Version: 0.1.10
+Version: 1.0.0
 Summary: VAST Data SDK
 Home-page: https://github.com/vast-data/vastdb_sdk
 Author: VAST DATA
 Author-email: hello@vastdata.com
 License: Copyright (C) VAST Data Ltd.
-Classifier: Development Status :: 4 - Beta
+Classifier: Development Status :: 5 - Production/Stable
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
vastdb-0.1.10.dist-info/RECORD → vastdb-1.0.0.dist-info/RECORD RENAMED
@@ -148,34 +148,37 @@ vast_flatbuf/tabular/ObjectDetails.py,sha256=qW0WtbkCYYE_L-Kw6VNRDCLYaRm5lKvTbLN
 vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI0,4450
 vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
 vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/__init__.py,sha256=8PLcZowy_vM0zuiYSQPXuxIEMcwHD7IRFpgcPK-03bk,386
-vastdb/_internal.py,sha256=4vi6KgkfHnDOSZUrU3oQcNKKdeMH3alODLGk7Yt59Gk,90001
+vastdb/__init__.py,sha256=J1JjKiFkKC95BHowfh9kJfQFTjRce-QMsc6zF_FfxC0,432
+vastdb/_internal.py,sha256=jVQuXsqk7CDuzNsR4cmxBTYOYE09og9EUYPcJ0UA_7s,90096
 vastdb/bucket.py,sha256=5KuKhPjZOevznZqWHDVVocejvAy7dcwobPuV6BJCfPc,2544
+vastdb/config.py,sha256=1tMYtzKXerGcIUjH4tIGEvZNWvO4fviCEdcNCnELJZo,2269
 vastdb/conftest.py,sha256=ePzQiEQmlNGcM2T4GZevE4XuvcnFWfnTSzr8IVZpVKk,3438
-vastdb/errors.py,sha256=jER5RQYsBRlQsjym1ItQYRukggMypATOo_sKvsJtMbo,4278
-vastdb/schema.py,sha256=yaueil92MSMYJf6bWseov_8fXTdW5zaKLXNjP5uuyzI,5963
-vastdb/session.py,sha256=3YHhG7IamFOKuy-Fkq_IVtPNriSfI6IN_4z4arBFbDU,3349
-vastdb/table.py,sha256=C0kgV8CJVgoRxVx83SPTn75mgbTz9OWgYwK_RzLPZ5Q,32994
+vastdb/errors.py,sha256=2XR1ko7J5nkfiHSAgwuVAADw0SsyqxOwSeFaGgKZEXM,4186
+vastdb/features.py,sha256=DxV746LSkORwVSD6MP2hdXRfnyoLkJwtOwGmp1dnquo,1322
+vastdb/schema.py,sha256=X7IRrogXH7Z0kes-DsDh1bRqIhvjH6owlFigGBXy7XQ,5913
+vastdb/session.py,sha256=ZrQf8cecVIHIBUOPNg4ed8ZCnEEu0QW1OBxQgz_ia80,2241
+vastdb/table.py,sha256=7U4UbooR1DUSpnZo8t8hzAMP78hF7IVvZpqZRrDDAQw,30953
 vastdb/transaction.py,sha256=qu2rOlR7AS1ojMOzgWapQMpcorrutelZZLH1mLmTHxk,3186
-vastdb/util.py,sha256=KQ2CjML-ipWxrJjwiaBbj4bxpTCtL24Pr2Co9woyw3Y,5983
+vastdb/util.py,sha256=4LTYBBR13na376AmDm5lQILJzLcfelIKdkNPy0IqI0o,5684
 vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/bench/test_perf.py,sha256=yn5gE7t_nzmJHBl9bCs1hxQOgzhvFphuYElsWGko8ts,1084
-vastdb/bench/test_sample.py,sha256=0qsKPj3i88J-YTrOrGvsP19xsyWGZy_-ptIt3oXBbSw,7181
+vastdb/bench/test_perf.py,sha256=gZIqfHva6lNFpD-9bHAe7M8COBjUyrPkHu3E7F8J2L0,1072
+vastdb/bench/test_sample.py,sha256=bFmw7BOCr5FoGn4TY9pQGd6_cVNK4uBeSRi33tTubyk,7847
 vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vastdb/tests/metrics.py,sha256=f1oOPKDsu-BzBLin0IQvjG-ueRDHTY-Hzl357TuoxCQ,989
+vastdb/tests/metrics.py,sha256=N6ELJUmZubhAMmUtDbisXr6TFhSDgVCTTU05gBVxHRA,1010
 vastdb/tests/test_duckdb.py,sha256=STw_1PwTQR8Naz6s0p6lQTV1ZTKKhe3LPBUbhqzTCu0,1880
 vastdb/tests/test_imports.py,sha256=xKub3-bisFjH0BsZM8COfiUWuMrtoOoQKprF6VQT9RI,5669
 vastdb/tests/test_nested.py,sha256=22NAxBTm7Aq-Vn6AIYbi5Cb1ET8W0XeLK3pp4D8BYWI,3448
 vastdb/tests/test_projections.py,sha256=3y1kubwVrzO-xoR0hyps7zrjOJI8niCYspaFTN16Q9w,4540
-vastdb/tests/test_sanity.py,sha256=V6dO5Y44B6pG8Eet6atTTGGH1yPz75_k0ZybHY-IiF8,3039
+vastdb/tests/test_sanity.py,sha256=oiV2gb05aPyG5RMNUQZlyjNlg3T7Fig1_8OJzpAgcsk,3038
 vastdb/tests/test_schemas.py,sha256=l70YQMlx2UL1KRQhApriiG2ZM7GJF-IzWU31H3Yqn1U,3312
-vastdb/tests/test_tables.py,sha256=RlwVfzs2hjfs2gchiRY0hnWoOAu4MV_9NbQCeHR6_us,31590
+vastdb/tests/test_tables.py,sha256=qWicD0BYuhrh1kRVqkHMJNsxcHxDcCprbEXuZJm1wm4,31529
 vastdb/tests/test_util.py,sha256=Ok_sAEBJsRGF5Voa_v5eu3eAd52GWu8jMjjQbadwW-s,1260
 vastdb/tests/util.py,sha256=dpRJYbboDnlqL4qIdvScpp8--5fxRUBIcIYitrfcj9o,555
 vastdb/vast_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vastdb/vast_tests/test_ha.py,sha256=744P4G6VJ09RIkHhMQL4wlipCBJWQVMhyvUrSc4k1HQ,975
-vastdb-0.1.10.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
-vastdb-0.1.10.dist-info/METADATA,sha256=Pjw1EZvwnKhfEjuRVVMR0DBOSkmVo5wcHftqddxqRNY,1351
-vastdb-0.1.10.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
-vastdb-0.1.10.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
-vastdb-0.1.10.dist-info/RECORD,,
+vastdb/vast_tests/test_scale.py,sha256=EpjCJmVAQrNBxVnHGJ-KHCoxevhqOcyqYFPMIIY9s60,2714
+vastdb-1.0.0.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
+vastdb-1.0.0.dist-info/METADATA,sha256=iEjwUBDXLAZ3RC64iEHRiQkz0afSR5PPnHGmsthuzRI,1363
+vastdb-1.0.0.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+vastdb-1.0.0.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
+vastdb-1.0.0.dist-info/RECORD,,
vastdb-0.1.10.dist-info/WHEEL → vastdb-1.0.0.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (70.2.0)
+Generator: setuptools (70.3.0)
 Root-Is-Purelib: true
 Tag: py3-none-any