vastdb 1.3.9__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
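The bulk of the changes below are new tests for keyed Parquet imports driven by ImportConfig.key_names together with an explicit vastdb_rowid column. As rough orientation only, here is a minimal sketch of the flow those tests exercise; session, bucket_name, and the staged Parquet path are placeholder assumptions, and ImportConfig is imported by the test module itself (its import statement is not part of this diff):

import pyarrow as pa

# Placeholder assumptions: `session` is a connected vastdb session, `bucket_name` is an existing
# bucket, and a Parquet file has already been uploaded under the key 'my_data.parquet'.
# ImportConfig is assumed to be imported the same way the test module imports it.
with session.transaction() as tx:
    schema = tx.bucket(bucket_name).create_schema('s1')
    schema.create_table('t1', pa.schema([
        ('vastdb_rowid', pa.int64()),  # explicit row ids, as in the tests below
        ('id', pa.int64()),
        ('symbol', pa.string()),
    ]))

with session.transaction() as tx:
    table = tx.bucket(bucket_name).schema('s1').table('t1')
    config = ImportConfig()
    config.key_names = ['id', 'symbol']  # imported rows are matched to existing rows on these columns
    table.import_files([f'/{bucket_name}/my_data.parquet'], config=config)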
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from tempfile import NamedTemporaryFile
 
 import pyarrow as pa
@@ -70,59 +71,6 @@ def test_parallel_imports(session, clean_bucket_name, s3):
     assert len(object_names) == len(objects_name['ObjectName'])
 
 
-def test_zip_imports(zip_import_session, clean_bucket_name, s3):
-    num_rows = 10
-    num_files = 5
-    files = []
-    ids = [i for i in range(num_rows)]
-    symbols = [chr(c) for c in range(ord('a'), ord('a') + num_rows)]
-    for i in range(num_files):
-        ds = {'id': ids,
-              'symbol': symbols,
-              f'feature{i}': [i * 10 + k for k in range(num_rows)]}
-        table = pa.Table.from_pydict(ds)
-        with NamedTemporaryFile() as f:
-            pq.write_table(table, f.name)
-            pname = f'prq{i}'
-            s3.put_object(Bucket=clean_bucket_name, Key=pname, Body=f)
-            files.append(f'/{clean_bucket_name}/{pname}')
-
-    with zip_import_session.transaction() as tx:
-        b = tx.bucket(clean_bucket_name)
-        s = b.create_schema('s1')
-        t = s.create_table('t1', pa.schema([('vastdb_rowid', pa.int64()), ('id', pa.int64()), ('symbol', pa.string())]))
-        columns = pa.schema([
-            ('vastdb_rowid', pa.int64()),
-            ('id', pa.int64()),
-            ('symbol', pa.string()),
-        ])
-        ext_row_ids = [10 + i for i in range(num_rows)]
-        arrow_table = pa.table(schema=columns, data=[
-            ext_row_ids,
-            ids,
-            symbols,
-        ])
-        row_ids_array = t.insert(arrow_table)
-        row_ids = row_ids_array.to_pylist()
-        assert row_ids == ext_row_ids
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        log.info("Starting import of %d files", num_files)
-        config = ImportConfig()
-        config.key_names = ['id', 'symbol']
-        t.import_files(files, config=config)
-
-    with zip_import_session.transaction() as tx:
-        s = tx.bucket(clean_bucket_name).schema('s1')
-        t = s.table('t1')
-        arrow_table = t.select(columns=['feature0']).read_all()
-        assert arrow_table.num_rows == num_rows
-        log.debug(f"table schema={t.arrow_schema}")
-        assert len(t.arrow_schema) == 8
-
-
 def test_create_table_from_files(session, clean_bucket_name, s3):
     datasets = [
         {'num': [0],
@@ -202,3 +150,407 @@ def test_import_type_mismatch_error(session, clean_bucket_name, s3):
     assert exc.value.error_dict['object_name'] == prq_name
     assert exc.value.error_dict['res'] == 'TabularMismatchColumnType'
     assert 'num_type_mismatch' in exc.value.error_dict['err_msg']
+
+
+def create_parquet_file(s3, bucket_name, file_key, data):
+    """Creates a Parquet file and uploads it to S3."""
+    parquet_table = pa.Table.from_pydict(data)
+    with NamedTemporaryFile(delete=False) as f:
+        pq.write_table(parquet_table, f.name)
+        with open(f.name, 'rb') as file_data:
+            s3.put_object(Bucket=bucket_name, Key=file_key, Body=file_data)
+    return f'/{bucket_name}/{file_key}'
+
+
+def create_table_with_data(session, bucket_name, schema_name, table_name, schema, data=None):
+    """Creates a table with the specified schema and optional initial data."""
+    with session.transaction() as tx:
+        b = tx.bucket(bucket_name)
+        s = b.create_schema(schema_name)
+        t = s.create_table(table_name, schema)
+        if data:
+            arrow_table = pa.table(schema=schema, data=data)
+            t.insert(arrow_table)
+        return t
+
+
+def attempt_import(session, bucket_name, schema_name, table_name, files, key_names, expected_error=None):
+    """Attempts to import files into a table and handles expected errors."""
+    with session.transaction() as tx:
+        t = tx.bucket(bucket_name).schema(schema_name).table(table_name)
+        config = ImportConfig()
+        config.key_names = key_names
+
+        if expected_error:
+            try:
+                t.import_files(files, config=config)
+            except Exception as e:
+                log.info(f"Caught expected error: {e}")
+                assert expected_error in str(e)
+        else:
+            t.import_files(files, config=config)
+
+
+def test_zip_imports(zip_import_session, clean_bucket_name, s3):
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+    num_files = 5
+
+    # Step 1: Generate and upload Parquet files
+    files = []
+    for i in range(num_files):
+        data = {
+            'id': [k for k in range(num_rows)],
+            'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+            f'feature{i}': [i * 10 + k for k in range(num_rows)],
+        }
+        file_key = f'prq{i}'
+        files.append(create_parquet_file(s3, clean_bucket_name, file_key, data))
+
+    # Step 2: Create table and insert initial data
+    data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, data)
+
+    # Step 3: Import files into the table
+    attempt_import(zip_import_session, clean_bucket_name, 's1', 't1', files, key_names=['id', 'symbol'])
+
+    # Step 4: Construct expected rows
+    expected_rows = []
+    for i in range(num_rows):
+        row = {
+            'vastdb_rowid': 10 + i,  # Initial vastdb_rowid values (10-19)
+            'id': i,  # ID values (0-9)
+            'symbol': chr(ord('a') + i),  # Symbol values ('a' to 'j')
+            'feature0': 0 * 10 + i,  # Values from file 1 (0-9)
+            'feature1': 1 * 10 + i,  # Values from file 2 (10-19)
+            'feature2': 2 * 10 + i,  # Values from file 3 (20-29)
+            'feature3': 3 * 10 + i,  # Values from file 4 (30-39)
+            'feature4': 4 * 10 + i,  # Values from file 5 (40-49)
+        }
+        expected_rows.append(row)
+
+    # Step 5: Query the actual data from the table
+    with zip_import_session.transaction() as tx:
+        t = tx.bucket(clean_bucket_name).schema('s1').table('t1')
+        arrow_table = t.select().read_all()
+        actual_data = arrow_table.to_pydict()
+
+    # Step 6: Compare expected and actual data
+    num_actual_rows = len(next(iter(actual_data.values()), []))
+    assert num_actual_rows == len(expected_rows), f"Expected {len(expected_rows)} rows but got {num_actual_rows}"
+
+    # Convert expected_rows to a comparable format (pydict format)
+    expected_data = {k: [] for k in expected_rows[0].keys()}
+    for row in expected_rows:
+        for k, v in row.items():
+            expected_data[k].append(v)
+
+    # Check that all expected columns exist in actual data
+    for col in expected_data:
+        assert col in actual_data, f"Expected column {col} not found in actual data"
+
+    # Compare column values
+    for col in expected_data:
+        assert actual_data[col] == expected_data[col], f"Values in column {col} don't match expected values"
+
+
+def test_zip_imports_scale(zip_import_session, clean_bucket_name, s3):
+    """Verify that many key names, and large amounts of data of different kind work as expected."""
+    # Step 1: Create and upload Parquet data
+    log.info("Step 1: Creating and uploading Parquet data")
+    num_rows = 1_000_000
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],  # Extra column not in the initial table
+        'col_0': [datetime.now() for _ in range(num_rows)],
+        'col_1': [1 for _ in range(num_rows)],
+        'col_2': [2 for _ in range(num_rows)],
+        'col_3': [3 for _ in range(num_rows)],
+        'col_4': [4 for _ in range(num_rows)],
+        'col_5': [5 for _ in range(num_rows)],  # Extra column not in the initial table
+    }
+    file_key = 'large_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table and insert initial data
+    log.info("Step 2: Creating table and inserting initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': data['id'],
+        'symbol': data['symbol'],
+        'col_0': data['col_0'],
+        'col_1': data['col_1'],
+        'col_2': data['col_2'],
+        'col_3': data['col_3'],
+        'col_4': data['col_4'],
+    }
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+        ('col_0', pa.timestamp('s')),
+        ('col_1', pa.int64()),
+        ('col_2', pa.int64()),
+        ('col_3', pa.int64()),
+        ('col_4', pa.int64()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Import the Parquet file into the table
+    log.info("Step 3: Importing Parquet file into the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol', 'col_0', 'col_1', 'col_2', 'col_3', 'col_4']
+    )
+
+    # Step 4: Verify schema and row count
+    log.info("Step 4: Verifying schema and row count")
+    with (zip_import_session.transaction() as tx):
+        table = tx.bucket(clean_bucket_name).schema('s1').table('t1')
+        updated_schema = table.arrow_schema
+        updated_data = table.select().read_all()
+
+        # Verify schema
+        expected_schema = pa.schema([
+            ('vastdb_rowid', pa.int64()),
+            ('id', pa.int64()),
+            ('symbol', pa.string()),
+            ('col_0', pa.timestamp('s')),
+            ('col_1', pa.int64()),
+            ('col_2', pa.int64()),
+            ('col_3', pa.int64()),
+            ('col_4', pa.int64()),
+            ('feature', pa.int64()),  # Added during import
+            ('col_5', pa.int64()),  # Added during import
+        ])
+        assert updated_schema == expected_schema, \
+            "The table schema does not match the expected schema."
+
+        assert updated_data.num_rows == num_rows, \
+            f"Expected {num_rows} rows, but got {updated_data.num_rows}."
+
+        assert len(updated_schema.names) == 10, \
+            "The table should have exactly 10 columns"
+
+
+def test_zip_imports_missing_columns(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with missing columns fails as expected."""
+    # Step 1: Create and upload Parquet data missing key columns
+    log.info("Step 1: Creating and uploading Parquet data without key columns")
+    data = {
+        'feature': [i * 10 for i in range(10)],  # Only feature column, no 'id' or 'symbol'
+    }
+    file_key = 'missing_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create table with key columns
+    log.info("Step 2: Creating table with key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema)
+
+    # Step 3: Attempt to import Parquet data missing key columns
+    log.info("Step 3: Attempting to import data without key columns")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_missing_key_values(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with extra key values fails as expected
+    and that importing a subset of key values fails as expected."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+    num_rows = 5
+
+    # Step 1: Create Parquet data with keys 0-4
+    data = {
+        'id': [i for i in range(num_rows)],
+        'symbol': [chr((i % 26) + ord('a')) for i in range(num_rows)],
+        'feature': [i * 10 for i in range(num_rows)],
+    }
+    file_key = 'missing_key_values.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 2: Create a table with non-overlapping keys 3-7
+    table_data = {
+        'vastdb_rowid': [i + 3 for i in range(num_rows)],
+        'id': [i + 3 for i in range(num_rows)],
+        'symbol': [chr(((i + 3) % 26) + ord('k')) for i in range(num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 3: Attempt to import Parquet data with mismatched keys
+    log.info("Step 3: Attempting to import Parquet data with keys that do not match the table")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="Failed to get row_ids to update on table"
+    )
+
+    # Step 4: Create and upload Parquet data with fewer rows but all key values present in the table
+    log.info("Step 4: Creating and uploading Parquet data with fewer rows, but matching all table keys")
+    smaller_data = {
+        'id': [3, 4],  # Subset of the table keys
+        'symbol': ['k', 'l'],  # Matching symbols for keys 3 and 4
+        'feature': [300, 400],  # Example new feature data
+    }
+    smaller_file_key = 'subset_matching_keys.parquet'
+    smaller_file_path = create_parquet_file(s3, clean_bucket_name, smaller_file_key, smaller_data)
+
+    # Step 5: Attempt to import the Parquet data with fewer rows but all key values present
+    log.info("Step 5: Attempting to import smaller Parquet data with all table keys")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [smaller_file_path],
+        key_names=['id', 'symbol'],
+        expected_error='Failed to get row_ids to update on table'
+    )
+
+
+def test_zip_imports_nested_keys(zip_import_session, clean_bucket_name, s3):
+    """Verify that importing Parquet data with nested key columns fails as expected."""
+    # Step 1: Creating Parquet data with nested key columns
+    log.info("Step 1: Creating Parquet data with nested key columns")
+    num_rows = 10
+    nested_keys = [{'id': i, 'symbol': chr(ord('a') + i)} for i in range(num_rows)]
+    feature_column = [i * 10 for i in range(num_rows)]
+
+    ds = {
+        'nested_key': nested_keys,
+        'feature': feature_column,
+    }
+
+    # Use create_parquet_file helper
+    file_key = 'nested_keys.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, ds)
+
+    # Step 2: Creating table with flat key columns
+    log.info("Step 2: Creating table with flat key columns")
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Use create_table_with_data helper
+    create_table_with_data(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        schema
+    )
+
+    # Step 3: Attempt to import Parquet data with nested key columns
+    log.info("Step 3: Attempting to import data with nested key columns")
+
+    # Use attempt_import helper with expected error
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        ['id', 'symbol'],
+        expected_error="Failed to verify import keys"
+    )
+
+
+def test_zip_imports_type_mismatch(zip_import_session, clean_bucket_name, s3):
+    """Verify behavior when key column data types in the Parquet file do not match the table schema."""
+    # Step 1: Define table schema with id as string
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.string()),  # Expecting strings here
+        ('symbol', pa.string()),
+    ])
+    num_rows = 10
+
+    # Step 2: Generate and upload a single Parquet file with mismatched id type (integers)
+    log.info("Step 2: Creating a Parquet file with mismatched key column data types")
+    data = {
+        'id': [k for k in range(num_rows)],  # Integers, causing the type mismatch
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+        'feature': [k * 10 for k in range(num_rows)],
+    }
+    file_key = 'mismatched_data.parquet'
+    file_path = create_parquet_file(s3, clean_bucket_name, file_key, data)
+
+    # Step 3: Create table with string id column and insert valid initial data
+    log.info("Step 3: Creating table with string key column and valid initial data")
+    table_data = {
+        'vastdb_rowid': [10 + i for i in range(num_rows)],
+        'id': [str(i) for i in range(num_rows)],  # Strings to match schema
+        'symbol': [chr(c) for c in range(ord('a'), ord('a') + num_rows)],
+    }
+    create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+
+    # Step 4: Attempt to import the file into the table
+    log.info("Step 4: Attempting to import the Parquet file with mismatched key column data types")
+    attempt_import(
+        zip_import_session,
+        clean_bucket_name,
+        's1',
+        't1',
+        [file_path],
+        key_names=['id', 'symbol'],
+        expected_error="TabularMismatchColumnType"
+    )
+
+
+def test_zip_imports_duplicate_key_values(zip_import_session, clean_bucket_name):
+    """Verify that creating a table with duplicate key values fails as expected,
+    also show that it has to be in same order."""
+    schema = pa.schema([
+        ('vastdb_rowid', pa.int64()),
+        ('id', pa.int64()),
+        ('symbol', pa.string()),
+    ])
+
+    # Data with duplicate keys
+    table_data = {
+        'vastdb_rowid': [1, 2, 2, 4, 5],
+        'id': [1, 2, 2, 4, 5],
+        'symbol': ['a', 'b', 'b', 'd', 'e'],
+    }
+
+    try:
+        # Attempt to create the table
+        create_table_with_data(zip_import_session, clean_bucket_name, 's1', 't1', schema, table_data)
+        assert False, "Expected an error due to duplicate keys, but the table was created successfully."
+    except Exception as e:
+        # Verify the exception is due to duplicate row IDs
+        assert "Found duplicate row ids or not in ascending order" in str(e), f"Unexpected error: {e}"
@@ -12,11 +12,15 @@ from .util import prepare_data
 def test_nested_select(session, clean_bucket_name):
     columns = pa.schema([
         ('l', pa.list_(pa.int8())),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
+        ('lfl', pa.list_(pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2))),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
     ])
     expected = pa.table(schema=columns, data=[
         [[1], [], [2, 3], None],
+        [[1, 2], None, [3, 4], None],
+        [[[1, 2], [3, 4], [4, 5]], None, [[5, 6], [7, 8]], [None, None]],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
     ])
@@ -36,6 +40,7 @@ def test_nested_filter(session, clean_bucket_name):
     columns = pa.schema([
         ('x', pa.int64()),
         ('l', pa.list_(pa.int8())),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('y', pa.int64()),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('z', pa.int64()),
@@ -45,6 +50,7 @@ def test_nested_filter(session, clean_bucket_name):
     expected = pa.table(schema=columns, data=[
         [1, 2, 3, None],
         [[1], [], [2, 3], None],
+        [[1, 2], None, [3, 4], None],
         [1, 2, None, 3],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [1, None, 2, 3],
@@ -72,22 +78,16 @@ def test_nested_filter(session, clean_bucket_name):
 
 def test_nested_unsupported_filter(session, clean_bucket_name):
     columns = pa.schema([
-        ('x', pa.int64()),
         ('l', pa.list_(pa.int8())),
-        ('y', pa.int64()),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('m', pa.map_(pa.utf8(), pa.float64())),
-        ('z', pa.int64()),
         ('s', pa.struct([('x', pa.int16()), ('y', pa.int32())])),
-        ('w', pa.int64()),
     ])
     expected = pa.table(schema=columns, data=[
-        [1, 2, 3, None],
         [[1], [], [2, 3], None],
-        [1, 2, None, 3],
+        [[1, 2], None, [3, 4], None],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
-        [1, None, 2, 3],
         [{'x': 1, 'y': None}, None, {'x': 2, 'y': 3}, {'x': None, 'y': 4}],
-        [None, 1, 2, 3],
     ])
 
     with prepare_data(session, clean_bucket_name, 's', 't', expected) as t:
@@ -95,6 +95,9 @@ def test_nested_unsupported_filter(session, clean_bucket_name):
         with pytest.raises(NotImplementedError):
             list(t.select(predicate=(t['l'].isnull())))
 
+        with pytest.raises(NotImplementedError):
+            list(t.select(predicate=(t['fl'].isnull())))
+
         with pytest.raises(NotImplementedError):
             list(t.select(predicate=(t['m'].isnull())))
 
@@ -106,6 +109,7 @@ def test_nested_subfields_predicate_pushdown(session, clean_bucket_name):
     columns = pa.schema([
         ('x', pa.int64()),
         ('l', pa.list_(pa.int8())),
+        ('fl', pa.list_(pa.field(name='item', type=pa.int64(), nullable=False), 2)),
         ('y', pa.int64()),
         ('m', pa.map_(pa.utf8(), pa.float64())),
         ('z', pa.int64()),
@@ -122,6 +126,7 @@ def test_nested_subfields_predicate_pushdown(session, clean_bucket_name):
     expected = pa.table(schema=columns, data=[
         [1, 2, 3, None],
         [[1], [], [2, 3], None],
+        [[1, 2], None, [3, 4], None],
         [1, 2, None, 3],
         [None, {'a': 2.5}, {'b': 0.25, 'c': 0.025}, {}],
         [1, None, 2, 3],