vastdb 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vastdb/_internal.py CHANGED
@@ -1807,7 +1807,7 @@ def _iter_query_data_response_columns(fileobj, stream_ids=None):
1807
1807
  batches.append(batch)
1808
1808
  except StopIteration: # we got an end-of-stream IPC message for a given stream ID
1809
1809
  reader, batches = readers.pop(stream_id) # end of column
1810
- table = pa.Table.from_batches(batches) # concatenate all column chunks (as a single)
1810
+ table = pa.Table.from_batches(batches=batches, schema=reader.schema) # concatenate all column chunks (as a single)
1811
1811
  _logger.debug("stream_id=%d rows=%d column=%s", stream_id, len(table), table)
1812
1812
  yield (stream_id, next_row_id, table)
1813
1813
 
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env python3
2
+
3
import functools
import itertools
import logging
import os
import random
import threading
import time
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pyarrow as pa

import vastdb.errors
import vastdb.util  # needed by the module-level version() log; not otherwise imported
from vastdb.table import INTERNAL_ROW_ID
from vastdb.tests import metrics
18
+
19
+ logging.basicConfig(
20
+ level="INFO",
21
+ format="%(asctime)s %(levelname)-10s %(process)d/%(thread)d %(filename)s:%(lineno)d %(message)s")
22
+
23
+ log = logging.getLogger()
24
+
25
+ log.info("Python SDK version: %s", vastdb.util.version())
26
+
27
+ NUM_COLUMNS = 10_000
28
+ COLUMNS_BATCH = 10
29
+
30
+ NUM_ROW_GROUPS = 100
31
+ ROW_GROUP_SIZE = 100_000
32
+
33
+
34
+ INTERNAL_ROWID_FIELD = pa.field(INTERNAL_ROW_ID, pa.uint64()) # used for UPDATE
35
+ EXTERNAL_ROWID_FIELD = pa.field("vastdb_rowid", pa.int64()) # used for INSERT & SELECT
36
+
37
+ SCHEMA = "perf"
38
+ TABLE = "sample"
39
+
40
+ SCHEMA_ARROW = pa.schema(
41
+ [pa.field(f'c{i}', pa.float32()) for i in range(NUM_COLUMNS)]
42
+ )
43
+
44
+
45
def load_batch(bucket, session_kwargs, offset, limit):
    """Insert/update one row-group covering rowids [offset..limit) into the sample table.

    Returns a list of metrics.Row records, one per insert/update RPC issued.
    """
    log.info('loading into [%d..%d)', offset, limit)

    # Iterate over all row-groups in this file
    rowids_range = range(offset, limit)
    rowids = pa.array(rowids_range, INTERNAL_ROWID_FIELD.type)

    session = vastdb.connect(**session_kwargs)
    metrics_rows = []

    with session.transaction() as tx:
        table = tx.bucket(bucket).schema(SCHEMA).table(TABLE)

        # Idempotency check: count rows already present in this rowid range.
        col = table[EXTERNAL_ROWID_FIELD.name]
        pred = (col >= rowids_range[0]) & (col <= rowids_range[-1])
        count = sum(len(rb) for rb in table.select(columns=[], predicate=pred))
        log.info("%d rows exist at %s", count, rowids_range)
        if count == len(rowids_range):
            # skip already loaded rows
            # BUGFIX: previously this only logged "skipping" and then fell
            # through, re-inserting the rows anyway; now it actually skips.
            log.info('skipping [%d..%d)', offset, limit)
            return metrics_rows

        total_nbytes = 0
        calls = 0
        t0 = time.time()
        # Insert/update every chunk of columns in this rowgroup
        for j in range(0, len(SCHEMA_ARROW), COLUMNS_BATCH):
            cols_batch = list(SCHEMA_ARROW)[j:j + COLUMNS_BATCH]
            arrays = [
                pa.array(np.float32(np.random.uniform(size=[ROW_GROUP_SIZE])))
                for _ in cols_batch
            ]
            chunk = pa.table(data=arrays, schema=pa.schema(cols_batch))
            nbytes = chunk.get_total_buffer_size()
            start = time.perf_counter()
            if j == 0:
                # First column-chunk creates the rows (INSERT) keyed by the
                # external rowid column.
                chunk = chunk.add_column(0, EXTERNAL_ROWID_FIELD, rowids.cast(EXTERNAL_ROWID_FIELD.type))
                op = 'insert'
                table.insert(chunk)
            else:
                # Subsequent column-chunks fill more columns (UPDATE) keyed by
                # the internal rowid column.
                chunk = chunk.add_column(0, INTERNAL_ROWID_FIELD, rowids)
                op = 'update'
                table.update(chunk)
            finish = time.perf_counter()

            metrics_rows.append(metrics.Row(
                start=start, finish=finish, table_path=table.path, op=op,
                nbytes=nbytes, rows=len(chunk), cols=len(cols_batch),
                pid=os.getpid(), tid=threading.get_native_id()))

            total_nbytes += nbytes
            calls += 1
            log.debug("%s into %s: %d rows x %d cols, %.3f MB",
                      op, rowids_range, len(chunk), len(chunk.schema),
                      chunk.get_total_buffer_size() / 1e6)

        dt = time.time() - t0

        log.info('loaded into [%d..%d): %d rows x %d cols, %.3f MB, %d RPCs, %.3f seconds',
                 offset, limit, limit - offset, NUM_COLUMNS, total_nbytes / 1e6, calls, dt)
    return metrics_rows
105
+
106
+
107
def test_ingest(clean_bucket_name, session_kwargs, tabular_endpoint_urls, num_workers, perf_metrics_db):
    """Load NUM_ROW_GROUPS row-groups concurrently, recording per-RPC metrics."""
    session = vastdb.connect(**session_kwargs)
    metrics_table = metrics.Table(perf_metrics_db, "ingest")

    # Create the schema and table on first run; reuse them otherwise.
    with session.transaction() as tx:
        bucket = tx.bucket(clean_bucket_name)
        try:
            schema = bucket.schema(SCHEMA)
        except vastdb.errors.MissingSchema:
            schema = bucket.create_schema(SCHEMA)

        try:
            schema.table(TABLE)
        except vastdb.errors.MissingTable:
            schema.create_table(TABLE, pa.schema([EXTERNAL_ROWID_FIELD] + list(SCHEMA_ARROW)))

    # One (offset, limit) pair per row-group.
    ranges = [
        (index * ROW_GROUP_SIZE, (index + 1) * ROW_GROUP_SIZE)
        for index in range(NUM_ROW_GROUPS)
    ]

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        # Round-robin the row-groups across the available endpoints.
        for (offset, limit), url in zip(ranges, itertools.cycle(tabular_endpoint_urls)):
            worker_kwargs = session_kwargs | {'endpoint': url}
            futures.append(executor.submit(load_batch, clean_bucket_name, worker_kwargs, offset, limit))
        log.info("spawned %d futures", len(futures))
        for future in as_completed(futures):
            metrics_table.insert(future.result())

    with session.transaction() as tx:
        t = tx.bucket(clean_bucket_name).schema(SCHEMA).table(TABLE)
        count = sum(len(rb) for rb in t.select([]))
        log.info("%s has %d rows: %s", t, count, t.stats)
141
+
142
+
143
def run_query(session_kwargs, i, bucket_name, endpoint_url):
    """Worker #i: run randomized column/row-group scan queries against TABLE."""
    num_columns = 2000
    row_groups_per_query = 10

    config = vastdb.table.QueryConfig(
        num_sub_splits=1,
        num_splits=1,
        limit_rows_per_sub_split=ROW_GROUP_SIZE,
        num_row_groups_per_sub_split=1)

    # Deterministic per-worker shuffle: seed with the worker index.
    row_group_indices = list(range(NUM_ROW_GROUPS))
    rng = random.Random(i)
    rng.shuffle(row_group_indices)

    session = vastdb.connect(**(session_kwargs | {"endpoint": endpoint_url}))
    with session.transaction() as tx:
        t = tx.bucket(bucket_name).schema(SCHEMA).table(TABLE)

        # Skip the leading rowid field; pick a random subset of data columns.
        fields = list(t.arrow_schema)[1:]
        rng.shuffle(fields)
        cols = [f.name for f in fields[:num_columns]]

        # Build one OR-of-rowid-ranges predicate per batch of row-groups.
        vastdb_rowid = t['vastdb_rowid']
        preds = []
        for offset in range(0, len(row_group_indices), row_groups_per_query):
            rowid_ranges = (
                vastdb_rowid.between(j * ROW_GROUP_SIZE, (j + 1) * ROW_GROUP_SIZE - 1)
                for j in row_group_indices[offset:offset + row_groups_per_query]
            )
            preds.append(functools.reduce((lambda x, y: x | y), rowid_ranges))

        for j, pred in enumerate(preds):
            log.info("%d) starting query #%d on %s", i, j, endpoint_url)
            t0 = time.time()
            res = t.select(columns=cols, predicate=pred, config=config)
            rows = 0
            data = 0
            for rb in res:
                rows += len(rb)
                data += rb.nbytes
                dt = time.time() - t0
                log.info("%d) got query #%d batch %.3f[s], %.3f[GB] %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)

            dt = time.time() - t0
            log.info("%d) finished query #%d %.3f[s], %.3f[GB], %.3f[MB/s], %.3f[Mrows]", i, j, dt, data / 1e9, data / 1e6 / dt, rows / 1e6)
189
+
190
+
191
def test_scan(test_bucket_name, session, num_workers, session_kwargs, tabular_endpoint_urls):
    """Run num_workers concurrent scan workers, round-robined over the endpoints."""
    log.info("starting %d workers, endpoints=%s", num_workers, tabular_endpoint_urls)
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            executor.submit(run_query, session_kwargs, i, test_bucket_name, url)
            for i, url in zip(range(num_workers), itertools.cycle(tabular_endpoint_urls))
        ]
        # BUGFIX: previously the Future results were never consumed, so any
        # exception raised inside run_query was silently swallowed at executor
        # shutdown and the test passed regardless. Re-raise them here.
        for future in as_completed(futures):
            future.result()
    log.info("finished %d workers", num_workers)
vastdb/conftest.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import sqlite3
2
3
  from pathlib import Path
3
4
 
4
5
  import boto3
@@ -13,27 +14,43 @@ def pytest_addoption(parser):
13
14
  default=os.environ.get("AWS_ACCESS_KEY_ID", None))
14
15
  parser.addoption("--tabular-secret-key", help="Secret key with Tabular permissions (AWS_SECRET_ACCESS_KEY)",
15
16
  default=os.environ.get("AWS_SECRET_ACCESS_KEY", None))
16
- parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default="http://localhost:9090")
17
+ parser.addoption("--tabular-endpoint-url", help="Tabular server endpoint", default=[], action="append")
17
18
  parser.addoption("--data-path", help="Data files location", default=None)
18
19
  parser.addoption("--crater-path", help="Save benchmark results in a dedicated location", default=None)
19
20
  parser.addoption("--schema-name", help="Name of schema for the test to operate on", default=None)
20
21
  parser.addoption("--table-name", help="Name of table for the test to operate on", default=None)
22
+ parser.addoption("--num-workers", help="Number of concurrent workers", default=1)
21
23
 
22
24
 
23
25
  @pytest.fixture(scope="session")
24
- def session(request):
25
- return vastdb.connect(
26
+ def session_kwargs(request, tabular_endpoint_urls):
27
+ return dict(
26
28
  access=request.config.getoption("--tabular-access-key"),
27
29
  secret=request.config.getoption("--tabular-secret-key"),
28
- endpoint=request.config.getoption("--tabular-endpoint-url"),
30
+ endpoint=tabular_endpoint_urls[0],
29
31
  )
30
32
 
31
33
 
34
@pytest.fixture(scope="session")
def session(session_kwargs):
    """One VAST DB session shared across the whole pytest session."""
    connect_kwargs = dict(session_kwargs)
    return vastdb.connect(**connect_kwargs)
37
+
38
+
39
@pytest.fixture(scope="session")
def num_workers(request):
    """Concurrency level, taken from the --num-workers option (as an int)."""
    raw = request.config.getoption("--num-workers")
    return int(raw)
42
+
43
+
32
44
  @pytest.fixture(scope="session")
33
45
  def test_bucket_name(request):
34
46
  return request.config.getoption("--tabular-bucket-name")
35
47
 
36
48
 
49
@pytest.fixture(scope="session")
def tabular_endpoint_urls(request):
    """Endpoint URLs from --tabular-endpoint-url, defaulting to localhost."""
    urls = request.config.getoption("--tabular-endpoint-url")
    if not urls:
        urls = ["http://localhost:9090"]
    return urls
52
+
53
+
37
54
  def iter_schemas(s):
38
55
  """Recursively scan all schemas."""
39
56
  children = s.schemas()
@@ -55,12 +72,12 @@ def clean_bucket_name(request, test_bucket_name, session):
55
72
 
56
73
 
57
74
  @pytest.fixture(scope="session")
58
- def s3(request):
75
+ def s3(request, tabular_endpoint_urls):
59
76
  return boto3.client(
60
77
  's3',
61
78
  aws_access_key_id=request.config.getoption("--tabular-access-key"),
62
79
  aws_secret_access_key=request.config.getoption("--tabular-secret-key"),
63
- endpoint_url=request.config.getoption("--tabular-endpoint-url"))
80
+ endpoint_url=tabular_endpoint_urls[0])
64
81
 
65
82
 
66
83
  @pytest.fixture(scope="function")
@@ -81,3 +98,8 @@ def schema_name(request):
81
98
  @pytest.fixture(scope="function")
82
99
  def table_name(request):
83
100
  return request.config.getoption("--table-name")
101
+
102
+
103
@pytest.fixture(scope="function")
def perf_metrics_db(crater_path):
    """SQLite connection for recording perf metrics, closed at test teardown.

    BUGFIX: the connection was previously returned and never closed, leaking a
    file handle per test; yield it instead and close it on teardown.
    """
    conn = sqlite3.connect(f"{crater_path}/metrics.sqlite")
    yield conn
    conn.close()
vastdb/table.py CHANGED
@@ -167,8 +167,13 @@ class Table:
167
167
  """Also, load columns' metadata."""
168
168
  self.arrow_schema = self.columns()
169
169
 
170
- table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
171
- self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), table_path)
170
+ self._table_path = f'{self.schema.bucket.name}/{self.schema.name}/{self.name}'
171
+ self._ibis_table = ibis.table(ibis.Schema.from_pyarrow(self.arrow_schema), self._table_path)
172
+
173
+ @property
174
+ def path(self):
175
+ """Return table's path."""
176
+ return self._table_path
172
177
 
173
178
  @property
174
179
  def tx(self):
@@ -0,0 +1,42 @@
1
import dataclasses
import sqlite3
from typing import List

# Map dataclass field annotations to SQLite column types.
_MAP_SQLITE_TYPES = {
    str: "TEXT",
    float: "REAL",
    int: "INTEGER",
}


@dataclasses.dataclass
class Row:
    """One measured RPC: its timing, payload size, and issuing worker identity."""

    start: float       # perf_counter timestamp at RPC start
    finish: float      # perf_counter timestamp at RPC end
    table_path: str    # "bucket/schema/table" path of the target table
    op: str            # operation name (e.g. 'insert' / 'update')
    nbytes: int        # payload size in bytes
    rows: int          # number of rows in the chunk
    cols: int          # number of columns in the chunk
    pid: int           # worker process ID
    tid: int           # worker native thread ID


class Table:
    """Thin helper for appending `Row` records into an SQLite table."""

    def __init__(self, conn: sqlite3.Connection, name: str):
        """Create the backing table (if missing) with columns derived from `Row`."""
        self.fields = dataclasses.fields(Row)
        self.conn = conn
        self.name = name
        columns = ", ".join(
            f"{f.name} {_MAP_SQLITE_TYPES[f.type]}"
            for f in self.fields
        )
        # BUGFIX: use IF NOT EXISTS so that reconnecting to an existing metrics
        # database (e.g. a later test reusing the same file) does not fail with
        # "table already exists". Also drop the pointless fetchall() on DDL.
        cmd = f"CREATE TABLE IF NOT EXISTS {self.name} ({columns})"
        self.conn.execute(cmd)

    def insert(self, rows: List[Row]):
        """Append `rows` to the table and commit."""
        placeholders = ", ".join(["?"] * len(self.fields))
        cmd = f"INSERT INTO {self.name} VALUES ({placeholders})"
        data = [dataclasses.astuple(row) for row in rows]
        self.conn.executemany(cmd, data)
        self.conn.commit()
vastdb/util.py CHANGED
@@ -1,3 +1,4 @@
1
+ import importlib
1
2
  import logging
2
3
  import re
3
4
  from typing import TYPE_CHECKING, Callable, List, Optional, Union
@@ -159,3 +160,7 @@ def check_supported_types(fields: pa.Schema):
159
160
  if isinstance(f.type, pa.TimestampType):
160
161
  if f.type.tz:
161
162
  raise NotSupportedType(f)
163
+
164
+
165
def version():
    """Return the installed `vastdb` distribution version string."""
    # BUGFIX: a bare `import importlib` does not make `importlib.metadata`
    # available -- submodules must be imported explicitly. Import it locally so
    # this works regardless of what else happens to have been imported.
    import importlib.metadata
    return importlib.metadata.distribution('vastdb').version
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vastdb
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary: VAST Data SDK
5
5
  Home-page: https://github.com/vast-data/vastdb_sdk
6
6
  Author: VAST DATA
@@ -149,18 +149,20 @@ vast_flatbuf/tabular/S3File.py,sha256=KC9c2oS5-JXwTTriUVFdjOvRG0B54Cq9kviSDZY3NI
149
149
  vast_flatbuf/tabular/VipRange.py,sha256=_BJd1RRZAcK76T9vlsHzXKYVsPVaz6WTEAqStMQCAUQ,2069
150
150
  vast_flatbuf/tabular/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
151
151
  vastdb/__init__.py,sha256=8PLcZowy_vM0zuiYSQPXuxIEMcwHD7IRFpgcPK-03bk,386
152
- vastdb/_internal.py,sha256=bENTnMZFAXolkFaWa7op9bsPWMk3wVE6oKKU--0ukXk,89971
152
+ vastdb/_internal.py,sha256=4vi6KgkfHnDOSZUrU3oQcNKKdeMH3alODLGk7Yt59Gk,90001
153
153
  vastdb/bucket.py,sha256=5KuKhPjZOevznZqWHDVVocejvAy7dcwobPuV6BJCfPc,2544
154
- vastdb/conftest.py,sha256=wnnPXjeLems2zpOBB1UGbZ_YW5S169NhGA5UZu7H5SM,2831
154
+ vastdb/conftest.py,sha256=ePzQiEQmlNGcM2T4GZevE4XuvcnFWfnTSzr8IVZpVKk,3438
155
155
  vastdb/errors.py,sha256=jER5RQYsBRlQsjym1ItQYRukggMypATOo_sKvsJtMbo,4278
156
156
  vastdb/schema.py,sha256=yaueil92MSMYJf6bWseov_8fXTdW5zaKLXNjP5uuyzI,5963
157
157
  vastdb/session.py,sha256=3YHhG7IamFOKuy-Fkq_IVtPNriSfI6IN_4z4arBFbDU,3349
158
- vastdb/table.py,sha256=TGBiIp0pB7vHd-92q4_sDRDjd4klHDLFOeEgdn1ACQI,32880
158
+ vastdb/table.py,sha256=C0kgV8CJVgoRxVx83SPTn75mgbTz9OWgYwK_RzLPZ5Q,32994
159
159
  vastdb/transaction.py,sha256=qu2rOlR7AS1ojMOzgWapQMpcorrutelZZLH1mLmTHxk,3186
160
- vastdb/util.py,sha256=2W5bBnlihIFvdV4im4HiDLArEhU8zjKMZB3Xw0lzgz0,5888
160
+ vastdb/util.py,sha256=KQ2CjML-ipWxrJjwiaBbj4bxpTCtL24Pr2Co9woyw3Y,5983
161
161
  vastdb/bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
162
  vastdb/bench/test_perf.py,sha256=yn5gE7t_nzmJHBl9bCs1hxQOgzhvFphuYElsWGko8ts,1084
163
+ vastdb/bench/test_sample.py,sha256=0qsKPj3i88J-YTrOrGvsP19xsyWGZy_-ptIt3oXBbSw,7181
163
164
  vastdb/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
+ vastdb/tests/metrics.py,sha256=f1oOPKDsu-BzBLin0IQvjG-ueRDHTY-Hzl357TuoxCQ,989
164
166
  vastdb/tests/test_duckdb.py,sha256=STw_1PwTQR8Naz6s0p6lQTV1ZTKKhe3LPBUbhqzTCu0,1880
165
167
  vastdb/tests/test_imports.py,sha256=xKub3-bisFjH0BsZM8COfiUWuMrtoOoQKprF6VQT9RI,5669
166
168
  vastdb/tests/test_nested.py,sha256=22NAxBTm7Aq-Vn6AIYbi5Cb1ET8W0XeLK3pp4D8BYWI,3448
@@ -172,8 +174,8 @@ vastdb/tests/test_util.py,sha256=Ok_sAEBJsRGF5Voa_v5eu3eAd52GWu8jMjjQbadwW-s,126
172
174
  vastdb/tests/util.py,sha256=dpRJYbboDnlqL4qIdvScpp8--5fxRUBIcIYitrfcj9o,555
173
175
  vastdb/vast_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
174
176
  vastdb/vast_tests/test_ha.py,sha256=744P4G6VJ09RIkHhMQL4wlipCBJWQVMhyvUrSc4k1HQ,975
175
- vastdb-0.1.9.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
176
- vastdb-0.1.9.dist-info/METADATA,sha256=YRP3W_JzaDywneqZ0cFWJzLbLE_NGf9QYja6-CFwQl4,1350
177
- vastdb-0.1.9.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
178
- vastdb-0.1.9.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
179
- vastdb-0.1.9.dist-info/RECORD,,
177
+ vastdb-0.1.10.dist-info/LICENSE,sha256=obffan7LYrq7hLHNrY7vHcn2pKUTBUYXMKu-VOAvDxU,11333
178
+ vastdb-0.1.10.dist-info/METADATA,sha256=Pjw1EZvwnKhfEjuRVVMR0DBOSkmVo5wcHftqddxqRNY,1351
179
+ vastdb-0.1.10.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
180
+ vastdb-0.1.10.dist-info/top_level.txt,sha256=Vsj2MKtlhPg0J4so64slQtnwjhgoPmJgcG-6YcVAwVc,20
181
+ vastdb-0.1.10.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.1.1)
2
+ Generator: setuptools (70.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5