vastdb 1.3.9__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- vastdb/_internal.py +50 -34
- vastdb/bench/test_perf.py +68 -9
- vastdb/conftest.py +9 -2
- vastdb/errors.py +57 -3
- vastdb/features.py +5 -1
- vastdb/schema.py +7 -6
- vastdb/table.py +51 -15
- vastdb/tests/test_fixed_list.py +294 -0
- vastdb/tests/test_imports.py +405 -53
- vastdb/tests/test_nested.py +13 -8
- vastdb/tests/test_tables.py +88 -4
- vastdb/tests/util.py +21 -0
- {vastdb-1.3.9.dist-info → vastdb-1.3.11.dist-info}/METADATA +1 -1
- {vastdb-1.3.9.dist-info → vastdb-1.3.11.dist-info}/RECORD +17 -16
- {vastdb-1.3.9.dist-info → vastdb-1.3.11.dist-info}/LICENSE +0 -0
- {vastdb-1.3.9.dist-info → vastdb-1.3.11.dist-info}/WHEEL +0 -0
- {vastdb-1.3.9.dist-info → vastdb-1.3.11.dist-info}/top_level.txt +0 -0
vastdb/tests/test_imports.py (CHANGED)

@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
@@ -70,59 +71,6 @@ def test_parallel_imports(session, clean_bucket_name, s3):
     assert len(object_names) == len(objects_name['ObjectName'])
 
 
-def test_zip_imports(zip_import_session, clean_bucket_name, s3):
-    num_rows = 10
-    num_files = 5
-    files = []
-    ids = [i for i in range(num_rows)]
-    symbols = [chr(c) for c in range(ord('a'), ord('a') + num_rows)]
-    for i in range(num_files):
-        ds = {'id': ids,
-              'symbol': symbols,
-              f'feature{i}': [i * 10 + k for k in range(num_rows)]}
-        table = pa.Table.from_pydict(ds)
-        with NamedTemporaryFile() as f:
-            pq.write_table(table, f.name)
-            pname = f'prq{i}'
-            s3.put_object(Bucket=clean_bucket_name, Key=pname, Body=f)
-            files.append(f'/{clean_bucket_name}/{pname}')
-
-    with zip_import_session.transaction() as tx:
-        b = tx.bucket(clean_bucket_name)
-        s = b.create_schema('s1')
-        t = s.create_table('t1', pa.schema([('vastdb_rowid', pa.int64()), ('id', pa.int64()), ('symbol', pa.string())]))
-        columns = pa.schema([
-            ('vastdb_rowid', pa.int64()),
-            ('id', pa.int64()),
-            ('symbol', pa.string()),
-        ])
-        ext_row_ids = [10 + i for i in range(num_rows)]
-        arrow_table = pa.table(schema=columns, data=[
-            ext_row_ids,
-            ids,
-            symbols,
-        ])
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        assert row_ids == ext_row_ids
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        log.info("Starting import of %d files", num_files)
-        config = ImportConfig()
-        config.key_names = ['id', 'symbol']
-        t.import_files(files, config=config)
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        arrow_table = t.select(columns=['feature0']).read_all()
-        assert arrow_table.num_rows == num_rows
-        log.debug(f"table schema={t.arrow_schema}")
-        assert len(t.arrow_schema) == 8
-
-
 def test_create_table_from_files(session, clean_bucket_name, s3):
     datasets = [
         {'num': [0],
@@ -202,3 +150,407 @@ def test_import_type_mismatch_error(session, clean_bucket_name, s3):
     assert exc.value.error_dict['object_name'] == prq_name
     assert exc.value.error_dict['res'] == 'TabularMismatchColumnType'
     assert 'num_type_mismatch' in exc.value.error_dict['err_msg']
+
+
+def create_parquet_file(s3, bucket_name, file_key, data):
+    """Creates a Parquet file and uploads it to S3."""
+    parquet_table = pa.Table.from_pydict(data)
+    with NamedTemporaryFile(delete=False) as f:
+        pq.write_table(parquet_table, f.name)
+        with open(f.name, 'rb') as file_data:
+            s3.put_object(Bucket=bucket_name, Key=file_key, Body=file_data)
+    return f'/{bucket_name}/{file_key}'
+
+
+def create_table_with_data(session, bucket_name, schema_name, table_name, schema, data=None):
+    """Creates a table with the specified schema and optional initial data."""
+    with session.transaction() as tx:
+        b = tx.bucket(bucket_name)
+        s = b.create_schema(schema_name)
+        t = s.create_table(table_name, schema)
+        if data:
+            arrow_table = pa.table(schema=schema, data=data)
+            t.insert(arrow_table)
+    return t
+
+
+def attempt_import(session, bucket_name, schema_name, table_name, files, key_names, expected_error=None):
+    """Attempts to import files into a table and handles expected errors."""
+    with session.transaction() as tx:
+        t = tx.bucket(bucket_name).schema(schema_name).table(table_name)
+        config = ImportConfig()
+        config.key_names = key_names
+
+        if expected_error:
+            try:
+                t.import_files(files, config=config)
+            except Exception as e:
+                log.info(f"Caught expected error: {e}")
+                assert expected_error in str(e)
+        else:
+            t.import_files(files, config=config)
+
+
+def test_zip_imports(zip_import_session, clean_bucket_name, s3):
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+    num_files = 5
+
+    # Step 1: Generate and upload Parquet files
+    files = []
+    for i in range(num_files):
+        data = {
+            'id': [k for k in range(num_rows)],
+            'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+            f'feature{i}': [i * 10 + k for k in range(num_rows)],
+        }
+        file_key = f'prq{i}'
+        files.append(create_parquet_file(s3, clean_bucket_name, file_key, data))
+
+    # Step 2: Create table and insert initial data
+    data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, data)
+
+    # Step 3: Import files into the table
+    attempt_import(zip_import_session, clean_bucket_name, 's1', 't1', files, key_names=['id', 'symbol'])
+
+    # Step 4: Construct expected rows
+    expected_rows = []
+    for i in range(num_rows):
+        row = {
+            'vastdb_rowid': 10 + i,  # Initial vastdb_rowid values (10-19)
+            'id': i,  # ID values (0-9)
+            'symbol': chr(ord('a') + i),  # Symbol values ('a' to 'j')
+            'feature0': 0 * 10 + i,  # Values from file 1 (0-9)
+            'feature1': 1 * 10 + i,  # Values from file 2 (10-19)
+            'feature2': 2 * 10 + i,  # Values from file 3 (20-29)
+            'feature3': 3 * 10 + i,  # Values from file 4 (30-39)
+            'feature4': 4 * 10 + i,  # Values from file 5 (40-49)
+        }
+        expected_rows.append(row)
+
+    # Step 5: Query the actual data from the table
+    with zip_import_session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).schema('s1').table('t1')
+        arrow_table = t.select().read_all()
+        actual_data = arrow_table.to_pydict()
+
+    # Step 6: Compare expected and actual data
+    num_actual_rows = len(next(iter(actual_data.values()), []))
+    assert num_actual_rows == len(expected_rows), f"Expected {len(expected_rows)} rows but got {num_actual_rows}"
+
+    # Convert expected_rows to a comparable format (pydict format)
+    expected_data = {k: [] for k in expected_rows[0].keys()}
+    for row in expected_rows:
+        for k, v in row.items():
+            expected_data[k].append(v)
+
+    # Check that all expected columns exist in actual data
+    for col in expected_data:
+        assert col in actual_data, f"Expected column {col} not found in actual data"
+
+    # Compare column values
+    for col in expected_data:
+        assert actual_data[col] == expected_data[col], f"Values in column {col} don't match expected values"
+
+
+def test_zip_imports_scale(zip_import_session, clean_bucket_name, s3):
+    """Verify that many key names, and large amounts of data of different kind work as expected."""
+    # Step 1: Create and upload Parquet data
+    log.info("Step 1: Creating and uploading Parquet data")
+    num_rows = 1_000_000
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],  # Extra column not in the initial table
+        'col_0': [datetime.now() for _ in range(num_rows)],
+        'col_1': [1 for _ in range(num_rows)],
+        'col_2': [2 for _ in range(num_rows)],
+        'col_3': [3 for _ in range(num_rows)],
+        'col_4': [4 for _ in range(num_rows)],
+        'col_5': [5 for _ in range(num_rows)],  # Extra column not in the initial table
+    }
+    file_key = 'large_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table and insert initial data
+    log.info("Step 2: Creating table and inserting initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': data['id'],
+        'symbol': data['symbol'],
+        'col_0': data['col_0'],
+        'col_1': data['col_1'],
+        'col_2': data['col_2'],
+        'col_3': data['col_3'],
+        'col_4': data['col_4'],
+    }
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+        ('col_0', pa.timestamp('s')),
+        ('col_1', pa.int64()),
+        ('col_2', pa.int64()),
+        ('col_3', pa.int64()),
+        ('col_4', pa.int64()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Import the Parquet file into the table
+    log.info("Step 3: Importing Parquet file into the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol', 'col_0', 'col_1', 'col_2', 'col_3', 'col_4']
+    )
+
+    # Step 4: Verify schema and row count
+    log.info("Step 4: Verifying schema and row count")
+    with (zip_import_session.transaction() as tx):
+        table = tx.bucket(clean_bucket_name).schema('s1').table('t1')
+        updated_schema = table.arrow_schema
+        updated_data = table.select().read_all()
+
+    # Verify schema
+    expected_schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+        ('col_0', pa.timestamp('s')),
+        ('col_1', pa.int64()),
+        ('col_2', pa.int64()),
+        ('col_3', pa.int64()),
+        ('col_4', pa.int64()),
+        ('feature', pa.int64()),  # Added during import
+        ('col_5', pa.int64()),  # Added during import
+    ])
+    assert updated_schema == expected_schema, \
+        "The table schema does not match the expected schema."
+
+    assert updated_data.num_rows == num_rows, \
+        f"Expected {num_rows} rows, but got {updated_data.num_rows}."
+
+    assert len(updated_schema.names) == 10, \
+        "The table should have exactly 10 columns"
+
+
+def test_zip_imports_missing_columns(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with missing columns fails as expected."""
+    # Step 1: Create and upload Parquet data missing key columns
+    log.info("Step 1: Creating and uploading Parquet data without key columns")
+    data = {
+        'feature': [i * 10 for i in range(10)],  # Only feature column, no 'id' or 'symbol'
+    }
+    file_key = 'missing_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table with key columns
+    log.info("Step 2: Creating table with key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema)
+
+    # Step 3: Attempt to import Parquet data missing key columns
+    log.info("Step 3: Attempting to import data without key columns")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_missing_key_values(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with extra key values fails as expected
+    and that importing a subset of key values fails as expected."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 5
+
+    # Step 1: Create Parquet data with keys 0-4
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],
+    }
+    file_key = 'missing_key_values.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create a table with non-overlapping keys 3-7
+    table_data = {
+        'vastdb_rowid': [i + 3 for i in range(num_rows)],
+        'id': [i + 3 for i in range(num_rows)],
+        'symbol': [chr(((i + 3) % 26) + ord('k')) for i in range(num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Attempt to import Parquet data with mismatched keys
+    log.info("Step 3: Attempting to import Parquet data with keys that do not match the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to get row_ids to update on table"
+    )
+
+    # Step 4: Create and upload Parquet data with fewer rows but all key values present in the table
+    log.info("Step 4: Creating and uploading Parquet data with fewer rows, but matching all table keys")
+    smaller_data = {
+        'id': [3, 4],  # Subset of the table keys
+        'symbol': ['k', 'l'],  # Matching symbols for keys 3 and 4
+        'feature': [300, 400],  # Example new feature data
+    }
+    smaller_file_key = 'subset_matching_keys.parquet'
+    smaller_file_path = create_parquet_file(s3, clean_bucket_name, smaller_file_key, smaller_data)
+
+    # Step 5: Attempt to import the Parquet data with fewer rows but all key values present
+    log.info("Step 5: Attempting to import smaller Parquet data with all table keys")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [smaller_file_path],
+        key_names=['id', 'symbol'],
+        expected_error='Failed to get row_ids to update on table'
+    )
+
+
+def test_zip_imports_nested_keys(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with nested key columns fails as expected."""
+    # Step 1: Creating Parquet data with nested key columns
+    log.info("Step 1: Creating Parquet data with nested key columns")
+    num_rows = 10
+    nested_keys = [{'id': i, 'symbol': chr(ord('a') + i)} for i in range(num_rows)]
+    feature_column = [i * 10 for i in range(num_rows)]
+
+    ds = {
+        'nested_key': nested_keys,
+        'feature': feature_column,
+    }
+
+    # Use create_parquet_file helper
+    file_key = 'nested_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, ds)
+
+    # Step 2: Creating table with flat key columns
+    log.info("Step 2: Creating table with flat key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Use create_table_with_data helper
+    create_table_with_data(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        schema
+    )
+
+    # Step 3: Attempt to import Parquet data with nested key columns
+    log.info("Step 3: Attempting to import data with nested key columns")
+
+    # Use attempt_import helper with expected error
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        ['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_type_mismatch(zip_import_session, clean_bucket_name, s3):
+    """Verify behavior when key column data types in the Parquet file do not match the table schema."""
+    # Step 1: Define table schema with id as string
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.string()),  # Expecting strings here
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+
+    # Step 2: Generate and upload a single Parquet file with mismatched id type (integers)
+    log.info("Step 2: Creating a Parquet file with mismatched key column data types")
+    data = {
+        'id': [k for k in range(num_rows)],  # Integers, causing the type mismatch
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+        'feature': [k * 10 for k in range(num_rows)],
+    }
+    file_key = 'mismatched_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 3: Create table with string id column and insert valid initial data
+    log.info("Step 3: Creating table with string key column and valid initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [str(i) for i in range(num_rows)],  # Strings to match schema
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 4: Attempt to import the file into the table
+    log.info("Step 4: Attempting to import the Parquet file with mismatched key column data types")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="TabularMismatchColumnType"
+    )
+
+
+def test_zip_imports_duplicate_key_values(zip_import_session, clean_bucket_name):
+    """Verify that creating a table with duplicate key values fails as expected,
+    also show that it has to be in same order."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Data with duplicate keys
+    table_data = {
+        'vastdb_rowid': [1, 2, 2, 4, 5],
+        'id': [1, 2, 2, 4, 5],
+        'symbol': ['a', 'b', 'b', 'd', 'e'],
+    }
+
+    try:
+        # Attempt to create the table
+        create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+        assert False, "Expected an error due to duplicate keys, but the table was created successfully."
+    except Exception as e:
+        # Verify the exception is due to duplicate row IDs
+        assert "Found duplicate row ids or not in ascending order" in str(e), f"Unexpected error: {e}"
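Taken together, the new helpers exercise the key-based ("zip") import path: rows in the imported Parquet files are matched to existing table rows via `ImportConfig.key_names`, and extra Parquet columns are added to the table (as the `feature*` and `col_5` assertions above check). The sketch below condenses that flow under the same assumptions as the tests (a vastdb session object, a boto3-style `s3` client, an existing bucket); the `ImportConfig` import path is an assumption, since the test module's import lines fall outside the hunks shown.

```python
import pyarrow as pa
import pyarrow.parquet as pq
from tempfile import NamedTemporaryFile

from vastdb.table import ImportConfig  # assumed import path; the tests' own import is not shown in this diff


def zip_import_sketch(session, s3, bucket):
    """Minimal sketch of the key-based import flow exercised above; names ('s1', 't1', 'prq0') are illustrative."""
    # 1. Upload a Parquet file whose 'feature0' column should be zipped into the table by key.
    parquet_table = pa.Table.from_pydict({'id': [0, 1], 'symbol': ['a', 'b'], 'feature0': [100, 200]})
    with NamedTemporaryFile() as f:
        pq.write_table(parquet_table, f.name)
        with open(f.name, 'rb') as body:
            s3.put_object(Bucket=bucket, Key='prq0', Body=body)

    # 2. Create the target table with explicit row ids plus the key columns, and seed it.
    schema = pa.schema([('vastdb_rowid', pa.int64()), ('id', pa.int64()), ('symbol', pa.string())])
    with session.transaction() as tx:
        t = tx.bucket(bucket).create_schema('s1').create_table('t1', schema)
        t.insert(pa.table(schema=schema, data=[[10, 11], [0, 1], ['a', 'b']]))

    # 3. Import by key: rows are matched on ('id', 'symbol'); 'feature0' is added as a new column.
    with session.transaction() as tx:
        t = tx.bucket(bucket).schema('s1').table('t1')
        config = ImportConfig()
        config.key_names = ['id', 'symbol']
        t.import_files([f'/{bucket}/prq0'], config=config)
```

In the tests the same three stages are factored into `create_parquet_file`, `create_table_with_data`, and `attempt_import`.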
vastdb/tests/test_nested.py (CHANGED)

@@ -12,11 +12,15 @@ from .util import prepare_data
 def test_nested_select(session, clean_bucket_name):
     columns = pa.schema([
         ('l', pa.list_(pa.int8())),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
+        ('lfl', pa.list_(pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2))),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
     ])
     expected = pa.table(schema=columns, data=[
         [[1], [], [2, 3], None],
+        [[1, 2], None, [3, 4], None],
+        [[[1, 2], [3, 4], [4, 5]], None, [[5, 6], [7, 8]], [None, None]],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
     ])
@@ -36,6 +40,7 @@ def test_nested_filter(session, clean_bucket_name):
     columns = pa.schema([
         ('x', pa.int64()),
         ('l', pa.list_(pa.int8())),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('y', pa.int64()),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('z', pa.int64()),
@@ -45,6 +50,7 @@ def test_nested_filter(session, clean_bucket_name):
     expected = pa.table(schema=columns, data=[
         [1, 2, 3, None],
         [[1], [], [2, 3], None],
+        [[1, 2], None, [3, 4], None],
         [1, 2, None, 3],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [1, None, 2, 3],
@@ -72,22 +78,16 @@ def test_nested_filter(session, clean_bucket_name):
 
 
 def test_nested_unsupported_filter(session, clean_bucket_name):
     columns = pa.schema([
-        ('x', pa.int64()),
         ('l', pa.list_(pa.int8())),
-        ('y', pa.int64()),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('m', pa.map_(pa.utf8(), pa.float64())),
-        ('z', pa.int64()),
         ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
-        ('w', pa.int64()),
     ])
     expected = pa.table(schema=columns, data=[
-        [1, 2, 3, None],
         [[1], [], [2, 3], None],
-        [1, 2, None, 3],
+        [[1, 2], None, [3, 4], None],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
-        [1, None, 2, 3],
         [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
-        [None, 1, 2, 3],
     ])
 
     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
@@ -95,6 +95,9 @@ def test_nested_unsupported_filter(session, clean_bucket_name):
     with pytest.raises(NotImplementedError):
         list(t.select(predicate=(t['l'].isnull())))
 
+    with pytest.raises(NotImplementedError):
+        list(t.select(predicate=(t['fl'].isnull())))
+
     with pytest.raises(NotImplementedError):
         list(t.select(predicate=(t['m'].isnull())))
 
@@ -106,6 +109,7 @@ def test_nested_subfields_predicate_pushdown(session, clean_bucket_name):
     columns = pa.schema([
         ('x', pa.int64()),
         ('l', pa.list_(pa.int8())),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('y', pa.int64()),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('z', pa.int64()),
@@ -122,6 +126,7 @@ def test_nested_subfields_predicate_pushdown(session, clean_bucket_name):
     expected = pa.table(schema=columns, data=[
         [1, 2, 3, None],
         [[1], [], [2, 3], None],
+        [[1, 2], None, [3, 4], None],
         [1, 2, None, 3],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [1, None, 2, 3],