vastdb 1.3.7__py3-none-any.whl → 1.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vastdb/__init__.py +2 -2
- vastdb/_internal.py +197 -83
- vastdb/bench/test_perf.py +2 -2
- vastdb/config.py +3 -0
- vastdb/errors.py +6 -0
- vastdb/features.py +9 -0
- vastdb/schema.py +5 -3
- vastdb/table.py +99 -17
- vastdb/tests/test_imports.py +70 -1
- vastdb/tests/test_tables.py +217 -0
- vastdb/tests/util.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Aggregate.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Call.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/CaseFragment.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Cast.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/ConditionalCase.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Filter.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Grouping.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Join.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/KeyValue.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Limit.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/ListLiteral.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Literal.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/LiteralColumn.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/LiteralRelation.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/MapKey.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/MapLiteral.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/OrderBy.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Plan.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Project.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/SetOperation.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/SimpleCase.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/SortKey.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Source.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/StructLiteral.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/WindowCall.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/DictionaryBatch.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/DictionaryEncoding.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Field.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Footer.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Message.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/RecordBatch.py +3 -3
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Schema.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/SparseMatrixIndexCSX.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/SparseTensor.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/SparseTensorIndexCOO.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/SparseTensorIndexCSF.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Tensor.py +2 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/CreateProjectionRequest.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/GetRowColumnSecurityResponse.py +4 -4
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/GetTableStatsResponse.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ImportDataRequest.py +34 -1
- vastdb/vast_flatbuf/tabular/KeyName.py +45 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ListProjectionsResponse.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ListSchemasResponse.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ListTablesResponse.py +1 -1
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ListViewsResponse.py +1 -1
- {vastdb-1.3.7.dist-info → vastdb-1.3.9.dist-info}/METADATA +1 -1
- vastdb-1.3.9.dist-info/RECORD +216 -0
- vastdb-1.3.9.dist-info/top_level.txt +1 -0
- vastdb-1.3.7.dist-info/RECORD +0 -215
- vastdb-1.3.7.dist-info/top_level.txt +0 -2
- {vast_flatbuf → vastdb/vast_flatbuf}/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/ArraySlice.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/ArraySubscript.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/BinaryLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/BooleanLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Bound.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/ConcreteBoundImpl.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/CurrentRow.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/DateLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/DecimalLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Deref.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/DurationLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Expression.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/ExpressionImpl.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/FieldIndex.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/FieldRef.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/FixedSizeBinaryLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Float16Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Float32Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Float64Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Following.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Frame.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Int16Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Int32Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Int64Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Int8Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/IntervalLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/IntervalLiteralDaysMilliseconds.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/IntervalLiteralImpl.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/IntervalLiteralMonths.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/JoinKind.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/LiteralImpl.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Ordering.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Preceding.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/RelId.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Relation.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/RelationImpl.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/SetOpKind.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/StringLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/StructField.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/TimeLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/TimestampLiteral.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/UInt16Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/UInt32Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/UInt64Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/UInt8Literal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/Unbounded.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/computeir/flatbuf/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Binary.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Block.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/BodyCompression.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/BodyCompressionMethod.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Bool.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Buffer.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/CompressionType.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Date.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/DateUnit.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Decimal.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/DictionaryKind.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Duration.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Endianness.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Feature.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/FieldNode.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/FixedSizeBinary.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/FixedSizeList.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/FloatingPoint.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Int.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Interval.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/IntervalUnit.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/KeyValue.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/LargeBinary.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/LargeList.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/LargeUtf8.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/List.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Map.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/MessageHeader.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/MetadataVersion.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Null.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Precision.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/SparseMatrixCompressedAxis.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/SparseTensorIndex.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Struct_.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/TensorDim.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Time.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/TimeUnit.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Timestamp.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Type.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Union.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/UnionMode.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/Utf8.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/org/apache/arrow/flatbuf/__init__.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/AlterColumnRequest.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/AlterProjectionTableRequest.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/AlterSchemaRequest.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/AlterTableRequest.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/Column.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ColumnDetails.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ColumnType.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/CreateSchemaRequest.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/CreateViewRequest.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/FilterString.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/GetProjectionTableStatsResponse.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/NameString.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/ObjectDetails.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/S3File.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/VipRange.py +0 -0
- {vast_flatbuf → vastdb/vast_flatbuf}/tabular/__init__.py +0 -0
- {vastdb-1.3.7.dist-info → vastdb-1.3.9.dist-info}/LICENSE +0 -0
- {vastdb-1.3.7.dist-info → vastdb-1.3.9.dist-info}/WHEEL +0 -0
vastdb/table.py
CHANGED
|
@@ -21,6 +21,7 @@ log = logging.getLogger(__name__)
|
|
|
21
21
|
|
|
22
22
|
INTERNAL_ROW_ID = "$row_id"
|
|
23
23
|
INTERNAL_ROW_ID_FIELD = pa.field(INTERNAL_ROW_ID, pa.uint64())
|
|
24
|
+
INTERNAL_ROW_ID_SORTED_FIELD = pa.field(INTERNAL_ROW_ID, pa.decimal128(38, 0)) # Sorted tables have longer row ids
|
|
24
25
|
|
|
25
26
|
MAX_ROWS_PER_BATCH = 512 * 1024
|
|
26
27
|
# for insert we need a smaller limit due to response amplification
|
|
@@ -28,6 +29,7 @@ MAX_ROWS_PER_BATCH = 512 * 1024
|
|
|
28
29
|
MAX_INSERT_ROWS_PER_PATCH = 512 * 1024
|
|
29
30
|
# in case insert has TooWideRow - need to insert in smaller batches - each cell could contain up to 128K, and our wire is limited to 5MB
|
|
30
31
|
MAX_COLUMN_IN_BATCH = int(5 * 1024 / 128)
|
|
32
|
+
SORTING_SCORE_BITS = 63
|
|
31
33
|
|
|
32
34
|
|
|
33
35
|
@dataclass
|
|
@@ -36,7 +38,12 @@ class TableStats:
|
|
|
36
38
|
|
|
37
39
|
num_rows: int
|
|
38
40
|
size_in_bytes: int
|
|
41
|
+
sorting_score: int
|
|
42
|
+
write_amplification: int
|
|
43
|
+
acummulative_row_inserition_count: int
|
|
39
44
|
is_external_rowid_alloc: bool = False
|
|
45
|
+
sorting_key_enabled: bool = False
|
|
46
|
+
sorting_done: bool = False
|
|
40
47
|
endpoints: Tuple[str, ...] = ()
|
|
41
48
|
|
|
42
49
|
|
|
@@ -115,6 +122,7 @@ class Table:
|
|
|
115
122
|
arrow_schema: pa.Schema = field(init=False, compare=False, repr=False)
|
|
116
123
|
_ibis_table: ibis.Schema = field(init=False, compare=False, repr=False)
|
|
117
124
|
_imports_table: bool
|
|
125
|
+
sorted_table: bool
|
|
118
126
|
|
|
119
127
|
def __post_init__(self):
|
|
120
128
|
"""Also, load columns' metadata."""
|
|
@@ -157,6 +165,29 @@ class Table:
|
|
|
157
165
|
self.arrow_schema = pa.schema(fields)
|
|
158
166
|
return self.arrow_schema
|
|
159
167
|
|
|
168
|
+
def sorted_columns(self) -> list:
|
|
169
|
+
"""Return sorted columns' metadata."""
|
|
170
|
+
fields = []
|
|
171
|
+
try:
|
|
172
|
+
self.tx._rpc.features.check_elysium()
|
|
173
|
+
next_key = 0
|
|
174
|
+
while True:
|
|
175
|
+
cur_columns, next_key, is_truncated, _count = self.tx._rpc.api.list_sorted_columns(
|
|
176
|
+
bucket=self.bucket.name, schema=self.schema.name, table=self.name, next_key=next_key, txid=self.tx.txid, list_imports_table=self._imports_table)
|
|
177
|
+
fields.extend(cur_columns)
|
|
178
|
+
if not is_truncated:
|
|
179
|
+
break
|
|
180
|
+
except errors.BadRequest:
|
|
181
|
+
pass
|
|
182
|
+
except errors.InternalServerError as ise:
|
|
183
|
+
log.warning("Failed to get the sorted columns Elysium might not be supported: %s", ise)
|
|
184
|
+
pass
|
|
185
|
+
except errors.NotSupportedVersion:
|
|
186
|
+
log.warning("Failed to get the sorted columns, Elysium not supported")
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
return fields
|
|
190
|
+
|
|
160
191
|
def projection(self, name: str) -> "Projection":
|
|
161
192
|
"""Get a specific semi-sorted projection of this table."""
|
|
162
193
|
if self._imports_table:
|
|
@@ -228,6 +259,10 @@ class Table:
|
|
|
228
259
|
endpoints = [self.tx._rpc.api.url for _ in range(config.import_concurrency)] # TODO: use valid endpoints...
|
|
229
260
|
files_queue = queue.Queue()
|
|
230
261
|
|
|
262
|
+
key_names = config.key_names or []
|
|
263
|
+
if key_names:
|
|
264
|
+
self.tx._rpc.features.check_zip_import()
|
|
265
|
+
|
|
231
266
|
for source_file in source_files.items():
|
|
232
267
|
files_queue.put(source_file)
|
|
233
268
|
|
|
@@ -248,9 +283,11 @@ class Table:
|
|
|
248
283
|
except queue.Empty:
|
|
249
284
|
pass
|
|
250
285
|
if files_batch:
|
|
251
|
-
log.
|
|
286
|
+
log.info("Starting import batch of %s files", len(files_batch))
|
|
287
|
+
log.debug(f"starting import of {files_batch}")
|
|
252
288
|
session.import_data(
|
|
253
|
-
self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid
|
|
289
|
+
self.bucket.name, self.schema.name, self.name, files_batch, txid=self.tx.txid,
|
|
290
|
+
key_names=key_names)
|
|
254
291
|
except (Exception, KeyboardInterrupt) as e:
|
|
255
292
|
stop_event.set()
|
|
256
293
|
log.error("Got exception inside import_worker. exception: %s", e)
|
|
@@ -277,6 +314,21 @@ class Table:
|
|
|
277
314
|
imports_table_stats=self._imports_table)
|
|
278
315
|
return TableStats(**stats_tuple._asdict())
|
|
279
316
|
|
|
317
|
+
def _get_row_estimate(self, columns: List[str], predicate: ibis.expr.types.BooleanColumn, arrow_schema: pa.Schema):
|
|
318
|
+
query_data_request = _internal.build_query_data_request(
|
|
319
|
+
schema=arrow_schema,
|
|
320
|
+
predicate=predicate,
|
|
321
|
+
field_names=columns)
|
|
322
|
+
response = self.tx._rpc.api.query_data(
|
|
323
|
+
bucket=self.bucket.name,
|
|
324
|
+
schema=self.schema.name,
|
|
325
|
+
table=self.name,
|
|
326
|
+
params=query_data_request.serialized,
|
|
327
|
+
split=(0xffffffff - 3, 1, 1),
|
|
328
|
+
txid=self.tx.txid)
|
|
329
|
+
batch = _internal.read_first_batch(response.raw)
|
|
330
|
+
return batch.num_rows * 2**16 if batch is not None else 0
|
|
331
|
+
|
|
280
332
|
def select(self, columns: Optional[List[str]] = None,
|
|
281
333
|
predicate: Union[ibis.expr.types.BooleanColumn, ibis.common.deferred.Deferred] = None,
|
|
282
334
|
config: Optional[QueryConfig] = None,
|
|
@@ -293,30 +345,22 @@ class Table:
|
|
|
293
345
|
if config is None:
|
|
294
346
|
config = QueryConfig()
|
|
295
347
|
|
|
348
|
+
stats = None
|
|
296
349
|
# Retrieve snapshots only if needed
|
|
297
|
-
if config.data_endpoints is None
|
|
350
|
+
if config.data_endpoints is None:
|
|
298
351
|
stats = self.get_stats()
|
|
299
352
|
log.debug("stats: %s", stats)
|
|
300
|
-
|
|
301
|
-
if config.data_endpoints is None:
|
|
302
353
|
endpoints = stats.endpoints
|
|
303
354
|
else:
|
|
304
355
|
endpoints = tuple(config.data_endpoints)
|
|
305
356
|
log.debug("endpoints: %s", endpoints)
|
|
306
357
|
|
|
307
|
-
if config.num_splits is None:
|
|
308
|
-
config.num_splits = max(1, stats.num_rows // config.rows_per_split)
|
|
309
|
-
log.debug("config: %s", config)
|
|
310
|
-
|
|
311
|
-
if config.semi_sorted_projection_name:
|
|
312
|
-
self.tx._rpc.features.check_enforce_semisorted_projection()
|
|
313
|
-
|
|
314
358
|
if columns is None:
|
|
315
359
|
columns = [f.name for f in self.arrow_schema]
|
|
316
360
|
|
|
317
361
|
query_schema = self.arrow_schema
|
|
318
362
|
if internal_row_id:
|
|
319
|
-
queried_fields = [INTERNAL_ROW_ID_FIELD]
|
|
363
|
+
queried_fields = [INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD]
|
|
320
364
|
queried_fields.extend(column for column in self.arrow_schema)
|
|
321
365
|
query_schema = pa.schema(queried_fields)
|
|
322
366
|
columns.append(INTERNAL_ROW_ID)
|
|
@@ -330,6 +374,22 @@ class Table:
|
|
|
330
374
|
if isinstance(predicate, ibis.common.deferred.Deferred):
|
|
331
375
|
predicate = predicate.resolve(self._ibis_table) # may raise if the predicate is invalid (e.g. wrong types / missing column)
|
|
332
376
|
|
|
377
|
+
if config.num_splits is None:
|
|
378
|
+
num_rows = 0
|
|
379
|
+
if self.sorted_table:
|
|
380
|
+
num_rows = self._get_row_estimate(columns, predicate, query_schema)
|
|
381
|
+
log.debug(f'sorted estimate: {num_rows}')
|
|
382
|
+
if num_rows == 0:
|
|
383
|
+
if stats is None:
|
|
384
|
+
stats = self.get_stats()
|
|
385
|
+
num_rows = stats.num_rows
|
|
386
|
+
|
|
387
|
+
config.num_splits = max(1, num_rows // config.rows_per_split)
|
|
388
|
+
log.debug("config: %s", config)
|
|
389
|
+
|
|
390
|
+
if config.semi_sorted_projection_name:
|
|
391
|
+
self.tx._rpc.features.check_enforce_semisorted_projection()
|
|
392
|
+
|
|
333
393
|
query_data_request = _internal.build_query_data_request(
|
|
334
394
|
schema=query_schema,
|
|
335
395
|
predicate=predicate,
|
|
@@ -485,7 +545,7 @@ class Table:
|
|
|
485
545
|
if columns is None:
|
|
486
546
|
columns = [name for name in rows.schema.names if name != INTERNAL_ROW_ID]
|
|
487
547
|
|
|
488
|
-
update_fields = [
|
|
548
|
+
update_fields = [INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD]
|
|
489
549
|
update_values = [_combine_chunks(rows_chunk)]
|
|
490
550
|
for col in columns:
|
|
491
551
|
update_fields.append(rows.field(col))
|
|
@@ -511,7 +571,7 @@ class Table:
|
|
|
511
571
|
rows_chunk = rows[INTERNAL_ROW_ID]
|
|
512
572
|
except KeyError:
|
|
513
573
|
raise errors.MissingRowIdColumn
|
|
514
|
-
delete_rows_rb = pa.record_batch(schema=pa.schema([
|
|
574
|
+
delete_rows_rb = pa.record_batch(schema=pa.schema([INTERNAL_ROW_ID_SORTED_FIELD if self.sorted_table else INTERNAL_ROW_ID_FIELD]),
|
|
515
575
|
data=[_combine_chunks(rows_chunk)])
|
|
516
576
|
|
|
517
577
|
delete_rows_rb = util.sort_record_batch_if_needed(delete_rows_rb, INTERNAL_ROW_ID)
|
|
@@ -535,6 +595,13 @@ class Table:
|
|
|
535
595
|
log.info("Renamed table from %s to %s ", self.name, new_name)
|
|
536
596
|
self.name = new_name
|
|
537
597
|
|
|
598
|
+
def add_sorting_key(self, sorting_key: list) -> None:
|
|
599
|
+
"""Ads a sorting key to a table that doesn't have any."""
|
|
600
|
+
self.tx._rpc.features.check_elysium()
|
|
601
|
+
self.tx._rpc.api.alter_table(
|
|
602
|
+
self.bucket.name, self.schema.name, self.name, txid=self.tx.txid, sorting_key=sorting_key)
|
|
603
|
+
log.info("Enabled Elysium for table %s with sorting key %s ", self.name, str(sorting_key))
|
|
604
|
+
|
|
538
605
|
def add_column(self, new_column: pa.Schema) -> None:
|
|
539
606
|
"""Add a new column."""
|
|
540
607
|
if self._imports_table:
|
|
@@ -583,7 +650,7 @@ class Table:
|
|
|
583
650
|
def imports_table(self) -> Optional["Table"]:
|
|
584
651
|
"""Get the imports table of this table."""
|
|
585
652
|
self.tx._rpc.features.check_imports_table()
|
|
586
|
-
return Table(name=self.name, schema=self.schema, handle=int(self.handle), _imports_table=True)
|
|
653
|
+
return Table(name=self.name, schema=self.schema, handle=int(self.handle), _imports_table=True, sorted_table=self.sorted_table)
|
|
587
654
|
|
|
588
655
|
def __getitem__(self, col_name: str):
|
|
589
656
|
"""Allow constructing ibis-like column expressions from this table.
|
|
@@ -592,6 +659,20 @@ class Table:
|
|
|
592
659
|
"""
|
|
593
660
|
return self._ibis_table[col_name]
|
|
594
661
|
|
|
662
|
+
def sorting_done(self) -> int:
|
|
663
|
+
"""Sorting done indicator for the table. Always False for unsorted tables."""
|
|
664
|
+
if not self.sorted_table:
|
|
665
|
+
return False
|
|
666
|
+
raw_sorting_score = self.tx._rpc.api.raw_sorting_score(self.schema.bucket.name, self.schema.name, self.schema.tx.txid, self.name)
|
|
667
|
+
return bool(raw_sorting_score >> SORTING_SCORE_BITS)
|
|
668
|
+
|
|
669
|
+
def sorting_score(self) -> int:
|
|
670
|
+
"""Sorting score for the table. Always 0 for unsorted tables."""
|
|
671
|
+
if not self.sorted_table:
|
|
672
|
+
return 0
|
|
673
|
+
raw_sorting_score = self.tx._rpc.api.raw_sorting_score(self.schema.bucket.name, self.schema.name, self.schema.tx.txid, self.name)
|
|
674
|
+
return raw_sorting_score & ((1 << SORTING_SCORE_BITS) - 1)
|
|
675
|
+
|
|
595
676
|
|
|
596
677
|
@dataclass
|
|
597
678
|
class Projection:
|
|
@@ -649,7 +730,8 @@ class Projection:
|
|
|
649
730
|
|
|
650
731
|
def _parse_projection_info(projection_info, table: "Table"):
|
|
651
732
|
log.info("Projection info %s", str(projection_info))
|
|
652
|
-
stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes
|
|
733
|
+
stats = TableStats(num_rows=projection_info.num_rows, size_in_bytes=projection_info.size_in_bytes,
|
|
734
|
+
sorting_score=0, write_amplification=0, acummulative_row_inserition_count=0)
|
|
653
735
|
return Projection(name=projection_info.name, table=table, stats=stats, handle=int(projection_info.handle))
|
|
654
736
|
|
|
655
737
|
|
vastdb/tests/test_imports.py
CHANGED
|
@@ -6,11 +6,27 @@ import pyarrow.parquet as pq
|
|
|
6
6
|
import pytest
|
|
7
7
|
|
|
8
8
|
from vastdb import util
|
|
9
|
-
from vastdb.
|
|
9
|
+
from vastdb.config import ImportConfig
|
|
10
|
+
from vastdb.errors import (
|
|
11
|
+
ImportFilesError,
|
|
12
|
+
InternalServerError,
|
|
13
|
+
InvalidArgument,
|
|
14
|
+
NotSupportedVersion,
|
|
15
|
+
)
|
|
10
16
|
|
|
11
17
|
log = logging.getLogger(__name__)
|
|
12
18
|
|
|
13
19
|
|
|
20
|
+
@pytest.fixture
|
|
21
|
+
def zip_import_session(session):
|
|
22
|
+
with session.transaction() as tx:
|
|
23
|
+
try:
|
|
24
|
+
tx._rpc.features.check_zip_import()
|
|
25
|
+
return session
|
|
26
|
+
except NotSupportedVersion:
|
|
27
|
+
pytest.skip("Skipped because this test requires version 5.3.1")
|
|
28
|
+
|
|
29
|
+
|
|
14
30
|
def test_parallel_imports(session, clean_bucket_name, s3):
|
|
15
31
|
num_rows = 1000
|
|
16
32
|
num_files = 53
|
|
@@ -54,6 +70,59 @@ def test_parallel_imports(session, clean_bucket_name, s3):
|
|
|
54
70
|
assert len(object_names) == len(objects_name['ObjectName'])
|
|
55
71
|
|
|
56
72
|
|
|
73
|
+
def test_zip_imports(zip_import_session, clean_bucket_name, s3):
|
|
74
|
+
num_rows = 10
|
|
75
|
+
num_files = 5
|
|
76
|
+
files = []
|
|
77
|
+
ids = [i for i in range(num_rows)]
|
|
78
|
+
symbols = [chr(c) for c in range(ord('a'), ord('a') + num_rows)]
|
|
79
|
+
for i in range(num_files):
|
|
80
|
+
ds = {'id': ids,
|
|
81
|
+
'symbol': symbols,
|
|
82
|
+
f'feature{i}': [i * 10 + k for k in range(num_rows)]}
|
|
83
|
+
table = pa.Table.from_pydict(ds)
|
|
84
|
+
with NamedTemporaryFile() as f:
|
|
85
|
+
pq.write_table(table, f.name)
|
|
86
|
+
pname = f'prq{i}'
|
|
87
|
+
s3.put_object(Bucket=clean_bucket_name, Key=pname, Body=f)
|
|
88
|
+
files.append(f'/{clean_bucket_name}/{pname}')
|
|
89
|
+
|
|
90
|
+
with zip_import_session.transaction() as tx:
|
|
91
|
+
b = tx.bucket(clean_bucket_name)
|
|
92
|
+
s = b.create_schema('s1')
|
|
93
|
+
t = s.create_table('t1', pa.schema([('vastdb_rowid', pa.int64()), ('id', pa.int64()), ('symbol', pa.string())]))
|
|
94
|
+
columns = pa.schema([
|
|
95
|
+
('vastdb_rowid', pa.int64()),
|
|
96
|
+
('id', pa.int64()),
|
|
97
|
+
('symbol', pa.string()),
|
|
98
|
+
])
|
|
99
|
+
ext_row_ids = [10 + i for i in range(num_rows)]
|
|
100
|
+
arrow_table = pa.table(schema=columns, data=[
|
|
101
|
+
ext_row_ids,
|
|
102
|
+
ids,
|
|
103
|
+
symbols,
|
|
104
|
+
])
|
|
105
|
+
row_ids_array = t.insert(arrow_table)
|
|
106
|
+
row_ids = row_ids_array.to_pylist()
|
|
107
|
+
assert row_ids == ext_row_ids
|
|
108
|
+
|
|
109
|
+
with zip_import_session.transaction() as tx:
|
|
110
|
+
s = tx.bucket(clean_bucket_name).schema('s1')
|
|
111
|
+
t = s.table('t1')
|
|
112
|
+
log.info("Starting import of %d files", num_files)
|
|
113
|
+
config = ImportConfig()
|
|
114
|
+
config.key_names = ['id', 'symbol']
|
|
115
|
+
t.import_files(files, config=config)
|
|
116
|
+
|
|
117
|
+
with zip_import_session.transaction() as tx:
|
|
118
|
+
s = tx.bucket(clean_bucket_name).schema('s1')
|
|
119
|
+
t = s.table('t1')
|
|
120
|
+
arrow_table = t.select(columns=['feature0']).read_all()
|
|
121
|
+
assert arrow_table.num_rows == num_rows
|
|
122
|
+
log.debug(f"table schema={t.arrow_schema}")
|
|
123
|
+
assert len(t.arrow_schema) == 8
|
|
124
|
+
|
|
125
|
+
|
|
57
126
|
def test_create_table_from_files(session, clean_bucket_name, s3):
|
|
58
127
|
datasets = [
|
|
59
128
|
{'num': [0],
|
vastdb/tests/test_tables.py
CHANGED
|
@@ -3,6 +3,7 @@ import decimal
|
|
|
3
3
|
import logging
|
|
4
4
|
import random
|
|
5
5
|
import threading
|
|
6
|
+
import time
|
|
6
7
|
from contextlib import closing
|
|
7
8
|
from tempfile import NamedTemporaryFile
|
|
8
9
|
|
|
@@ -13,6 +14,8 @@ import pyarrow.parquet as pq
|
|
|
13
14
|
import pytest
|
|
14
15
|
from requests.exceptions import HTTPError
|
|
15
16
|
|
|
17
|
+
from vastdb.errors import BadRequest
|
|
18
|
+
|
|
16
19
|
from .. import errors
|
|
17
20
|
from ..table import INTERNAL_ROW_ID, QueryConfig
|
|
18
21
|
from .util import prepare_data
|
|
@@ -20,6 +23,16 @@ from .util import prepare_data
|
|
|
20
23
|
log = logging.getLogger(__name__)
|
|
21
24
|
|
|
22
25
|
|
|
26
|
+
@pytest.fixture
|
|
27
|
+
def elysium_session(session):
|
|
28
|
+
with session.transaction() as tx:
|
|
29
|
+
try:
|
|
30
|
+
tx._rpc.features.check_elysium()
|
|
31
|
+
return session
|
|
32
|
+
except errors.NotSupportedVersion:
|
|
33
|
+
pytest.skip("Skipped because this test requires version 5.3.5 with Elysium")
|
|
34
|
+
|
|
35
|
+
|
|
23
36
|
def test_tables(session, clean_bucket_name):
|
|
24
37
|
columns = pa.schema([
|
|
25
38
|
('a', pa.int64()),
|
|
@@ -990,3 +1003,207 @@ def test_multiple_contains_clauses(session, clean_bucket_name):
|
|
|
990
1003
|
for pred in failed_preds:
|
|
991
1004
|
with pytest.raises(NotImplementedError):
|
|
992
1005
|
t.select(predicate=pred(t)).read_all()
|
|
1006
|
+
|
|
1007
|
+
|
|
1008
|
+
def test_tables_elysium(elysium_session, clean_bucket_name):
    """Create a table with a sorting key and check the reported sorted columns.

    The table is created through ``prepare_data`` with ``sorting_key=[2, 1]``;
    ``sorted_columns()`` is then expected to list ``'c'`` followed by ``'b'``.
    """
    schema = pa.schema([
        ('a', pa.int8()),
        ('b', pa.int32()),
        ('c', pa.int16()),
    ])
    column_values = [
        [1, 2, 3],
        [111111, 222222, 333333],
        [111, 222, 333],
    ]
    expected = pa.table(schema=schema, data=column_values)
    sorting = [2, 1]
    with prepare_data(elysium_session, clean_bucket_name, 's', 't', expected, sorting_key=sorting) as t:
        reported = t.sorted_columns()
        assert reported[0].name == 'c'
        assert reported[1].name == 'b'
|
|
1024
|
+
|
|
1025
|
+
|
|
1026
|
+
# Fails because of a known issue: ORION-240102
|
|
1027
|
+
# def test_enable_elysium(session, clean_bucket_name):
|
|
1028
|
+
# columns = pa.schema([
|
|
1029
|
+
# ('a', pa.int8()),
|
|
1030
|
+
# ('b', pa.int32()),
|
|
1031
|
+
# ('c', pa.int16()),
|
|
1032
|
+
# ])
|
|
1033
|
+
# expected = pa.table(schema=columns, data=[
|
|
1034
|
+
# [1,2,3],
|
|
1035
|
+
# [111111,222222,333333],
|
|
1036
|
+
# [111, 222, 333],
|
|
1037
|
+
# ])
|
|
1038
|
+
# sorting = [2, 1]
|
|
1039
|
+
# with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
|
|
1040
|
+
# sorted_columns = t.sorted_columns()
|
|
1041
|
+
# assert len(sorted_columns) == 0
|
|
1042
|
+
# t.add_sorting_key(sorting)
|
|
1043
|
+
# time.sleep(10)
|
|
1044
|
+
# sorted_columns = t.sorted_columns()
|
|
1045
|
+
# assert len(sorted_columns) == 2
|
|
1046
|
+
# assert sorted_columns[0].name == 'c'
|
|
1047
|
+
# assert sorted_columns[1].name == 'b'
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def test_elysium_tx(elysium_session, clean_bucket_name):
    """Add a sorting key in one transaction and read it back in another.

    The first transaction creates the schema and table without a sorting
    key, inserts three rows, checks that no sorted columns are reported and
    then calls ``add_sorting_key([2, 1])``. The second transaction re-opens
    the table, expects sorted columns ``'c'`` then ``'b'``, and drops the
    table and schema.
    """
    schema_name = 's'
    table_name = 't'
    sorting = [2, 1]
    fields = pa.schema([
        ('a', pa.int8()),
        ('b', pa.int32()),
        ('c', pa.int16()),
    ])
    arrow_table = pa.table(schema=fields, data=[
        [1, 2, 3],
        [111111, 222222, 333333],
        [111, 222, 333],
    ])

    # Transaction 1: create, populate, then enable sorting on an unsorted table.
    with elysium_session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
        t = s.create_table(table_name, arrow_table.schema)
        inserted_ids = t.insert(arrow_table).to_pylist()
        assert inserted_ids == list(range(arrow_table.num_rows))
        before = t.sorted_columns()
        assert len(before) == 0
        t.add_sorting_key(sorting)

    # Transaction 2: the sorting key must be visible from a fresh transaction.
    with elysium_session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema(schema_name)
        t = s.table(table_name)
        after = t.sorted_columns()
        assert len(after) == 2
        assert after[0].name == 'c'
        assert after[1].name == 'b'
        t.drop()
        s.drop()
|
|
1083
|
+
|
|
1084
|
+
|
|
1085
|
+
def test_elysium_double_enable(elysium_session, clean_bucket_name):
    """Expect ``BadRequest`` when a sorting key is added to an already-sorted table.

    The table is created with ``sorting_key=[2, 1]`` through ``prepare_data``;
    after checking the reported sorted columns, ``add_sorting_key`` is called
    again with the same key.
    """
    fields = pa.schema([
        ('a', pa.int8()),
        ('b', pa.int32()),
        ('c', pa.int16()),
    ])
    column_values = [
        [1, 2, 3],
        [111111, 222222, 333333],
        [111, 222, 333],
    ]
    expected = pa.table(schema=fields, data=column_values)
    sorting = [2, 1]
    # NOTE(review): pytest.raises wraps the whole setup, so a BadRequest raised
    # anywhere inside prepare_data would also satisfy this test - confirm that
    # is intended before narrowing it to the add_sorting_key call.
    with pytest.raises(BadRequest):
        with prepare_data(elysium_session, clean_bucket_name, 's', 't', expected, sorting_key=sorting) as t:
            reported = t.sorted_columns()
            assert reported[0].name == 'c'
            assert reported[1].name == 'b'
            t.add_sorting_key(sorting)
|
|
1103
|
+
|
|
1104
|
+
|
|
1105
|
+
def test_elysium_update_table_tx(elysium_session, clean_bucket_name):
    """Exercise update and delete on a table created with a sorting key.

    The first transaction creates the table with ``sorting_key=[2, 1]``,
    inserts three rows and checks the sorted columns (``'s'`` then ``'b'``).
    The second transaction re-opens the table, re-checks the sorted columns,
    then updates column ``'a'`` twice (once from a table, once from a record
    batch) and finally deletes the rows selected by ``a < 222``, comparing
    the table contents after each step.
    """
    schema_name = 's'
    table_name = 't'
    sorting = [2, 1]
    fields = pa.schema([
        ('a', pa.int64()),
        ('b', pa.float32()),
        ('s', pa.utf8()),
    ])
    arrow_table = pa.table(schema=fields, data=[
        [111, 222, 333],
        [0.5, 1.5, 2.5],
        ['a', 'bb', 'ccc'],
    ])

    def _with_new_a(table, values):
        # Return `table` with column 'a' replaced by `values`, keeping its field.
        position = table.column_names.index('a')
        return table.set_column(position, table.field(position), values)

    with elysium_session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
        t = s.create_table(table_name, arrow_table.schema, sorting_key=sorting)
        inserted_ids = t.insert(arrow_table).to_pylist()
        assert inserted_ids == list(range(arrow_table.num_rows))
        reported = t.sorted_columns()
        assert reported[0].name == 's'
        assert reported[1].name == 'b'

    with elysium_session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema(schema_name)
        t = s.table(table_name)
        reported = t.sorted_columns()
        assert reported[0].name == 's'
        assert reported[1].name == 'b'

        # Update from a pyarrow Table: add 2000 to the single row where a == 222.
        actual = t.select(columns=['a', 'b'], predicate=(t['a'] == 222), internal_row_id=True).read_all()
        update_table = _with_new_a(actual, pc.add(actual.column('a'), 2000))

        t.update(update_table, columns=['a'])
        actual = t.select(columns=['a', 'b']).read_all()
        assert actual.to_pydict() == {
            'a': [111, 2222, 333],
            'b': [0.5, 1.5, 2.5]
        }

        # Update from a RecordBatch: divide every other row's 'a' by 10.
        actual = t.select(columns=['a', 'b'], predicate=(t['a'] != 2222), internal_row_id=True).read_all()
        update_table = _with_new_a(actual, pc.divide(actual.column('a'), 10))

        t.update(update_table.to_batches()[0], columns=['a'])
        actual = t.select(columns=['a', 'b']).read_all()
        assert actual.to_pydict() == {
            'a': [11, 2222, 33],
            'b': [0.5, 1.5, 2.5]
        }

        # Delete the rows selected by a < 222; only the a == 2222 row should remain.
        actual = t.select(columns=['a', 'b'], predicate=(t['a'] < 222), internal_row_id=True).read_all()
        delete_rows = _with_new_a(actual, pc.divide(actual.column('a'), 10))

        t.delete(delete_rows)
        actual = t.select(columns=['a', 'b']).read_all()
        assert actual.to_pydict() == {
            'a': [2222],
            'b': [1.5]
        }
|
|
1174
|
+
|
|
1175
|
+
|
|
1176
|
+
def test_elysium_splits(elysium_session, clean_bucket_name):
    """Query a sorted single-column table using a small ``rows_per_split``.

    100000 rows (the values 1..10 repeated 10000 times) are inserted into a
    table sorted on column ``'a'``. After a fixed wait, a second transaction
    selects ``a == 1`` with ``rows_per_split = 1000`` and expects 10000 rows.
    """
    repeats = 10000
    schema_name = 's'
    table_name = 't'
    sorting = [0]

    fields = pa.schema([
        ('a', pa.int32())
    ])
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * repeats
    arrow_table = pa.table(schema=fields, data=[values])

    config = QueryConfig()
    config.rows_per_split = 1000

    with elysium_session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).create_schema(schema_name)
        t = s.create_table(table_name, arrow_table.schema, sorting_key=sorting)
        inserted_ids = t.insert(arrow_table).to_pylist()
        assert inserted_ids == list(range(arrow_table.num_rows))
        reported = t.sorted_columns()
        assert reported[0].name == 'a'

    # NOTE(review): presumably gives the server time to finish background
    # sorting before the query - confirm the reason and the 300 s duration.
    time.sleep(300)
    with elysium_session.transaction() as tx:
        s = tx.bucket(clean_bucket_name).schema(schema_name)
        t = s.table(table_name)
        reported = t.sorted_columns()
        assert reported[0].name == 'a'

        actual = t.select(columns=['a'], predicate=(t['a'] == 1), config=config).read_all()
        assert len(actual) == repeats
|
vastdb/tests/util.py
CHANGED
|
@@ -5,10 +5,10 @@ log = logging.getLogger(__name__)
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
@contextmanager
|
|
8
|
-
def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table):
|
|
8
|
+
def prepare_data(session, clean_bucket_name, schema_name, table_name, arrow_table, sorting_key=[]):
|
|
9
9
|
with session.transaction() as tx:
|
|
10
10
|
s = tx.bucket(clean_bucket_name).create_schema(schema_name)
|
|
11
|
-
t = s.create_table(table_name, arrow_table.schema)
|
|
11
|
+
t = s.create_table(table_name, arrow_table.schema, sorting_key=sorting_key)
|
|
12
12
|
row_ids_array = t.insert(arrow_table)
|
|
13
13
|
row_ids = row_ids_array.to_pylist()
|
|
14
14
|
assert row_ids == list(range(arrow_table.num_rows))
|
|
@@ -32,7 +32,7 @@ class Aggregate(object):
|
|
|
32
32
|
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
|
|
33
33
|
if o != 0:
|
|
34
34
|
x = self._tab.Indirect(o + self._tab.Pos)
|
|
35
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelId import RelId
|
|
35
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.RelId import RelId
|
|
36
36
|
obj = RelId()
|
|
37
37
|
obj.Init(self._tab.Bytes, x)
|
|
38
38
|
return obj
|
|
@@ -44,7 +44,7 @@ class Aggregate(object):
|
|
|
44
44
|
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
|
|
45
45
|
if o != 0:
|
|
46
46
|
x = self._tab.Indirect(o + self._tab.Pos)
|
|
47
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation import Relation
|
|
47
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Relation import Relation
|
|
48
48
|
obj = Relation()
|
|
49
49
|
obj.Init(self._tab.Bytes, x)
|
|
50
50
|
return obj
|
|
@@ -59,7 +59,7 @@ class Aggregate(object):
|
|
|
59
59
|
x = self._tab.Vector(o)
|
|
60
60
|
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
|
|
61
61
|
x = self._tab.Indirect(x)
|
|
62
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
62
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
63
63
|
obj = Expression()
|
|
64
64
|
obj.Init(self._tab.Bytes, x)
|
|
65
65
|
return obj
|
|
@@ -98,7 +98,7 @@ class Aggregate(object):
|
|
|
98
98
|
x = self._tab.Vector(o)
|
|
99
99
|
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
|
|
100
100
|
x = self._tab.Indirect(x)
|
|
101
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Grouping import Grouping
|
|
101
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Grouping import Grouping
|
|
102
102
|
obj = Grouping()
|
|
103
103
|
obj.Init(self._tab.Bytes, x)
|
|
104
104
|
return obj
|
|
@@ -41,7 +41,7 @@ class Call(object):
|
|
|
41
41
|
x = self._tab.Vector(o)
|
|
42
42
|
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
|
|
43
43
|
x = self._tab.Indirect(x)
|
|
44
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
44
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
45
45
|
obj = Expression()
|
|
46
46
|
obj.Init(self._tab.Bytes, x)
|
|
47
47
|
return obj
|
|
@@ -69,7 +69,7 @@ class Call(object):
|
|
|
69
69
|
x = self._tab.Vector(o)
|
|
70
70
|
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
|
|
71
71
|
x = self._tab.Indirect(x)
|
|
72
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.SortKey import SortKey
|
|
72
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.SortKey import SortKey
|
|
73
73
|
obj = SortKey()
|
|
74
74
|
obj.Init(self._tab.Bytes, x)
|
|
75
75
|
return obj
|
|
@@ -30,7 +30,7 @@ class CaseFragment(object):
|
|
|
30
30
|
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
|
|
31
31
|
if o != 0:
|
|
32
32
|
x = self._tab.Indirect(o + self._tab.Pos)
|
|
33
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
33
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
34
34
|
obj = Expression()
|
|
35
35
|
obj.Init(self._tab.Bytes, x)
|
|
36
36
|
return obj
|
|
@@ -41,7 +41,7 @@ class CaseFragment(object):
|
|
|
41
41
|
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
|
|
42
42
|
if o != 0:
|
|
43
43
|
x = self._tab.Indirect(o + self._tab.Pos)
|
|
44
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
44
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
45
45
|
obj = Expression()
|
|
46
46
|
obj.Init(self._tab.Bytes, x)
|
|
47
47
|
return obj
|
|
@@ -31,7 +31,7 @@ class Cast(object):
|
|
|
31
31
|
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
|
|
32
32
|
if o != 0:
|
|
33
33
|
x = self._tab.Indirect(o + self._tab.Pos)
|
|
34
|
-
from vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
34
|
+
from vastdb.vast_flatbuf.org.apache.arrow.computeir.flatbuf.Expression import Expression
|
|
35
35
|
obj = Expression()
|
|
36
36
|
obj.Init(self._tab.Bytes, x)
|
|
37
37
|
return obj
|
|
@@ -47,7 +47,7 @@ class Cast(object):
|
|
|
47
47
|
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
|
|
48
48
|
if o != 0:
|
|
49
49
|
x = self._tab.Indirect(o + self._tab.Pos)
|
|
50
|
-
from vast_flatbuf.org.apache.arrow.flatbuf.Field import Field
|
|
50
|
+
from vastdb.vast_flatbuf.org.apache.arrow.flatbuf.Field import Field
|
|
51
51
|
obj = Field()
|
|
52
52
|
obj.Init(self._tab.Bytes, x)
|
|
53
53
|
return obj
|