deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0

deltacat/tests/compute/converter/test_convert_session.py
@@ -12,14 +12,13 @@ from pyiceberg.types import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import IdentityTransform
 import pyarrow as pa
+import daft
 
 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
-from collections import defaultdict
 from deltacat.compute.converter.utils.converter_session_utils import (
     group_all_files_to_each_bucket,
 )
@@ -244,11 +243,14 @@ def test_converter_drop_duplicates_success(
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
             identifier_fields=["primary_key"],
-            compact_small_files=False,
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
             s3_file_system=s3_file_system,
+            s3_client_kwargs={},
         )
 
     number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
@@ -272,38 +274,31 @@ def test_converter_drop_duplicates_success(
         [number_partitioned_array_3, primary_key_array_3], names=names
     )
 
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
     download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
     )
-    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
 
     convert_ref = convert.remote(convert_input)
 
     to_be_deleted_files_list = []
-    to_be_added_files_dict_list = []
-    convert_result = ray.get(convert_ref)
-
-    partition_value = convert_input.convert_input_files.partition_value
 
-    if convert_result[0]:
-        to_be_deleted_files_list.extend(convert_result[0].values())
-
-    file_location = convert_result[1][partition_value][0]
-    to_be_added_files = f"s3://{file_location}"
+    convert_result = ray.get(convert_ref)
 
-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = [to_be_added_files]
-    to_be_added_files_dict_list.append(to_be_added_files_dict)
+    to_be_added_files_list = []
+    # Check if there're files to delete
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
 
-    # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
     commit_append_snapshot(
         iceberg_table=tbl,
-        new_position_delete_files=new_position_delete_files,
+        new_position_delete_files=to_be_added_files_list,
    )
     tbl.refresh()
 
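
The main mechanical change in the hunk above is the mocked download path: the converter now reads parquet through deltacat.compute.converter.utils.io.daft_read_parquet, so the tests patch that function and feed it Daft DataFrames instead of patching download_parquet_with_daft_hash_applied with raw pyarrow Tables. A minimal, illustrative sketch of how such a mock payload is built (daft.from_arrow is the only API assumed beyond what the hunk already uses):

    import daft
    import pyarrow as pa

    # Build the same kind of in-memory table the tests construct...
    data_table = pa.Table.from_arrays(
        [pa.array([0, 0, 0], type=pa.int32()), pa.array(["pk1", "pk2", "pk3"])],
        names=["number_partitioned", "primary_key"],
    )
    # ...and wrap it as the Daft DataFrame that download_data_mock.side_effect
    # must now yield, since the converter consumes Daft DataFrames directly.
    daft_df = daft.from_arrow(data_table)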
@@ -413,11 +408,14 @@ def test_converter_pos_delete_read_by_spark_success(
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
             identifier_fields=["primary_key"],
-            compact_small_files=False,
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
             s3_file_system=s3_file_system,
+            s3_client_kwargs={},
         )
 
     primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
@@ -432,39 +430,30 @@ def test_converter_pos_delete_read_by_spark_success(
     names = ["primary_key"]
     data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)
 
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
     download_data_mock = mocker.patch(
-        "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
     )
-    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
 
     convert_ref = convert.remote(convert_input)
 
     to_be_deleted_files_list = []
-    to_be_added_files_dict_list = []
+    to_be_added_files_list = []
     convert_result = ray.get(convert_ref)
 
-    partition_value = convert_input.convert_input_files.partition_value
-
-    if convert_result[0]:
-        to_be_deleted_files_list.extend(convert_result[0].values())
-
-    file_location = convert_result[1][partition_value][0]
-    to_be_added_files = f"s3://{file_location}"
-
-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = [to_be_added_files]
-    to_be_added_files_dict_list.append(to_be_added_files_dict)
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
 
     # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
-
     commit_append_snapshot(
         iceberg_table=tbl,
-        new_position_delete_files=new_position_delete_files,
+        new_position_delete_files=to_be_added_files_list,
     )
     tbl.refresh()
 
@@ -476,3 +465,177 @@ def test_converter_pos_delete_read_by_spark_success(
     ]
     all_pk_sorted = sorted(all_pk)
     assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
+
+
+@pytest.mark.integration
+def test_converter_pos_delete_multiple_identifier_fields_success(
+    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+) -> None:
+    """
+    Test for convert compute remote function happy case. Download file results are mocked.
+    """
+
+    # 1. Create Iceberg table
+    namespace = "default"
+    table_name = "table_converter_ray_pos_delete_multiple_identifier_fields"
+
+    identifier = f"{namespace}.{table_name}"
+
+    schema = Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    partition_spec = PartitionSpec(partition_field_identity)
+
+    properties = dict()
+    properties["write.format.default"] = "parquet"
+    properties["write.delete.mode"] = "merge-on-read"
+    properties["write.update.mode"] = "merge-on-read"
+    properties["write.merge.mode"] = "merge-on-read"
+    properties["format-version"] = "2"
+
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+
+    # 2. Use Spark to generate initial data files
+    tbl = session_catalog.load_table(identifier)
+
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
+            """
+        ],
+    )
+    run_spark_commands(
+        spark,
+        [
+            f"""
+            INSERT INTO {identifier} VALUES (0, "pk4", 1), (0, "pk2", 3), (0, "pk3", 4)
+            """
+        ],
+    )
+    tbl.refresh()
+
+    # 3. Use convert.remote() function to compute position deletes
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=["primary_key1", "primary_key2"],
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            s3_file_system=s3_file_system,
+            s3_client_kwargs={},
+        )
+
+    names = ["primary_key1", "primary_key2"]
+
+    primary_key1_array_1 = pa.array(["pk1", "pk2", "pk3"])
+    primary_key2_array_1 = pa.array([1, 2, 3])
+    data_table_1 = pa.Table.from_arrays(
+        [primary_key1_array_1, primary_key2_array_1], names=names
+    )
+
+    primary_key1_array_2 = pa.array(["pk1", "pk2", "pk3"])
+    primary_key2_array_2 = pa.array([1, 2, 3])
+    data_table_2 = pa.Table.from_arrays(
+        [primary_key1_array_2, primary_key2_array_2], names=names
+    )
+
+    primary_key1_array_3 = pa.array(["pk4", "pk2", "pk3"])
+    primary_key2_array_3 = pa.array([1, 3, 4])
+    data_table_3 = pa.Table.from_arrays(
+        [primary_key1_array_3, primary_key2_array_3], names=names
+    )
+
+    daft_df_1 = daft.from_arrow(data_table_1)
+    daft_df_2 = daft.from_arrow(data_table_2)
+    daft_df_3 = daft.from_arrow(data_table_3)
+
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
+    )
+    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+
+    convert_ref = convert.remote(convert_input)
+
+    to_be_deleted_files_list = []
+    to_be_added_files_list = []
+    convert_result = ray.get(convert_ref)
+
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
+
+    # 4. Commit position delete, delete equality deletes from table
+
+    commit_append_snapshot(
+        iceberg_table=tbl,
+        new_position_delete_files=to_be_added_files_list,
+    )
+    tbl.refresh()
+
+    # 5. Result assertion: Expected unique primary keys to be kept
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+    expected_result_tuple_list = [
+        ("pk1", 1),
+        ("pk2", 2),
+        ("pk2", 3),
+        ("pk3", 3),
+        ("pk3", 4),
+        ("pk4", 1),
+    ]
+    pk_combined_res = []
+    for pk1, pk2 in zip(
+        pyiceberg_scan_table_rows["primary_key1"],
+        pyiceberg_scan_table_rows["primary_key2"],
+    ):
+        pk_combined_res.append((pk1, pk2))
+
+    # Assert elements are same disregard ordering in list
+    assert sorted(pk_combined_res) == sorted(expected_result_tuple_list)
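
Across all three converter tests, the driver code follows the same updated flow: ConvertInput.of now takes table_io, table_metadata, compact_previous_position_delete_files, and s3_client_kwargs, and the Ray convert task returns a result object rather than a positional tuple. Its to_be_added_files are passed straight to commit_append_snapshot, so the old parquet_files_dict_to_iceberg_data_files conversion step disappears. A condensed sketch of that consumption pattern, limited to the attributes these tests actually access (anything else on the result object is not shown in this diff):

    # Condensed from the tests above; convert, ray, and commit_append_snapshot
    # are the same names the test module already imports.
    convert_result = ray.get(convert.remote(convert_input))

    to_be_deleted_files_list = []
    to_be_added_files_list = []
    if convert_result.to_be_deleted_files:
        # a mapping whose values are the files to delete; only .values() is used here
        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
    if convert_result.to_be_added_files:
        # apparently already in the form commit_append_snapshot accepts
        to_be_added_files_list.extend(convert_result.to_be_added_files)

    commit_append_snapshot(
        iceberg_table=tbl,
        new_position_delete_files=to_be_added_files_list,
    )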

deltacat/tests/local_deltacat_storage/__init__.py
@@ -7,6 +7,7 @@ import sqlite3
 from sqlite3 import Cursor, Connection
 import uuid
 import ray
+
 import io
 
 from deltacat.tests.test_utils.storage import create_empty_delta

deltacat/tests/storage/model/test_expression.py (new file)
@@ -0,0 +1,327 @@
+import pytest
+import pyarrow as pa
+
+from deltacat.storage.model.expression import (
+    Reference,
+    Literal,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    LessThan,
+    GreaterThanEqual,
+    LessThanEqual,
+    And,
+    Or,
+    Not,
+    In,
+    Between,
+    Like,
+    IsNull,
+)
+from deltacat.storage.model.expression.visitor import DisplayVisitor, ExpressionVisitor
+
+
+@pytest.fixture
+def field_ref():
+    return Reference("field1")
+
+
+@pytest.fixture
+def field_ref2():
+    return Reference("field2")
+
+
+@pytest.fixture
+def literal_int():
+    return Literal(pa.scalar(42))
+
+
+@pytest.fixture
+def literal_str():
+    return Literal(pa.scalar("test"))
+
+
+@pytest.fixture
+def display_visitor():
+    return DisplayVisitor()
+
+
+class TestExpressionLibrary:
+    """Test suite for the Deltacat expression library."""
+
+    def test_reference_creation(self):
+        ref = Reference("field1")
+        assert ref.field == "field1"
+        assert ref.index is None
+
+    def test_reference_with_index(self):
+        ref = Reference("field1", 0)
+        assert ref.field == "field1"
+        assert ref.index == 0
+
+    def test_literal_creation(self):
+        lit = Literal(pa.scalar(42))
+        assert lit.value.as_py() == 42
+
+    # Test the factory methods (.of)
+    def test_factory_methods(self):
+        # Reference.of
+        ref = Reference.of("field1")
+        assert ref.field == "field1"
+
+        # Literal.of
+        lit = Literal.of(42)
+        assert lit.value.as_py() == 42
+
+        # Equal.of with mixed types
+        eq = Equal.of("field1", 42)
+        assert isinstance(eq.left, Literal)
+        assert isinstance(eq.right, Literal)
+        assert eq.left.value.as_py() == "field1"
+        assert eq.right.value.as_py() == 42
+
+        # Not.of
+        not_expr = Not.of(Equal.of("field1", 42))
+        assert isinstance(not_expr.operand, Equal)
+
+        # In.of
+        in_expr = In.of("field1", [1, 2, 3])
+        assert isinstance(in_expr.value, Literal)
+        assert len(in_expr.values) == 3
+        assert all(isinstance(v, Literal) for v in in_expr.values)
+
+        # Between.of
+        between_expr = Between.of("field1", 10, 20)
+        assert isinstance(between_expr.value, Literal)
+        assert between_expr.lower.value.as_py() == 10
+        assert between_expr.upper.value.as_py() == 20
+
+        # Like.of
+        like_expr = Like.of("field1", "%test%")
+        assert isinstance(like_expr.value, Literal)
+        assert like_expr.pattern.value.as_py() == "%test%"
+
+    # Test reference comparison helper methods
+    def test_reference_comparison_helpers(self, field_ref):
+        # Test eq, ne, gt, lt, ge, le methods
+        eq_expr = field_ref.eq(42)
+        assert isinstance(eq_expr, Equal)
+        assert eq_expr.left == field_ref
+        assert eq_expr.right.value.as_py() == 42
+
+        ne_expr = field_ref.ne(42)
+        assert isinstance(ne_expr, NotEqual)
+
+        gt_expr = field_ref.gt(42)
+        assert isinstance(gt_expr, GreaterThan)
+
+        lt_expr = field_ref.lt(42)
+        assert isinstance(lt_expr, LessThan)
+
+        ge_expr = field_ref.ge(42)
+        assert isinstance(ge_expr, GreaterThanEqual)
+
+        le_expr = field_ref.le(42)
+        assert isinstance(le_expr, LessThanEqual)
+
+    # Test reference special operation helpers
+    def test_reference_special_helpers(self, field_ref):
+        # Test is_null, in_, between, like methods
+        is_null_expr = field_ref.is_null()
+        assert isinstance(is_null_expr, IsNull)
+        assert is_null_expr.operand == field_ref
+
+        in_expr = field_ref.in_([1, 2, 3])
+        assert isinstance(in_expr, In)
+        assert in_expr.value == field_ref
+        assert len(in_expr.values) == 3
+        assert in_expr.values[0].value.as_py() == 1
+
+        between_expr = field_ref.between(10, 20)
+        assert isinstance(between_expr, Between)
+        assert between_expr.value == field_ref
+        assert between_expr.lower.value.as_py() == 10
+        assert between_expr.upper.value.as_py() == 20
+
+        like_expr = field_ref.like("%test%")
+        assert isinstance(like_expr, Like)
+        assert like_expr.value == field_ref
+        assert like_expr.pattern.value.as_py() == "%test%"
+
+    # Test boolean expression helper methods
+    def test_boolean_expression_helpers(self, field_ref):
+        # Test and_, or_, not_ methods
+        expr1 = field_ref.eq(42)
+        expr2 = field_ref.gt(10)
+
+        and_expr = expr1.and_(expr2)
+        assert isinstance(and_expr, And)
+        assert and_expr.left == expr1
+        assert and_expr.right == expr2
+
+        or_expr = expr1.or_(expr2)
+        assert isinstance(or_expr, Or)
+        assert or_expr.left == expr1
+        assert or_expr.right == expr2
+
+        not_expr = expr1.not_()
+        assert isinstance(not_expr, Not)
+        assert not_expr.operand == expr1
+
+    # Test building complex expressions
+    def test_complex_expression_building(self, field_ref, field_ref2):
+        # Test building more complex expressions using method chaining
+        expr = field_ref.eq(42).and_(field_ref2.gt(10)).or_(field_ref.is_null()).not_()
+
+        assert isinstance(expr, Not)
+        assert isinstance(expr.operand, Or)
+        assert isinstance(expr.operand.left, And)
+        assert isinstance(expr.operand.right, IsNull)
+
+    # Test DisplayVisitor for different expression types
+    def test_reference_display(self, field_ref, display_visitor):
+        assert display_visitor.visit(field_ref) == "field1"
+
+    def test_literal_display(self, literal_int, literal_str, display_visitor):
+        assert display_visitor.visit(literal_int) == "42"
+        assert display_visitor.visit(literal_str) == "test"
+
+    def test_comparison_display(self, field_ref, literal_int, display_visitor):
+        assert display_visitor.visit(Equal(field_ref, literal_int)) == "field1 = 42"
+        assert display_visitor.visit(NotEqual(field_ref, literal_int)) == "field1 <> 42"
+        assert (
+            display_visitor.visit(GreaterThan(field_ref, literal_int)) == "field1 > 42"
+        )
+        assert display_visitor.visit(LessThan(field_ref, literal_int)) == "field1 < 42"
+        assert (
+            display_visitor.visit(GreaterThanEqual(field_ref, literal_int))
+            == "field1 >= 42"
+        )
+        assert (
+            display_visitor.visit(LessThanEqual(field_ref, literal_int))
+            == "field1 <= 42"
+        )
+
+    def test_logical_operator_display(self, field_ref, literal_int, display_visitor):
+        eq_expr = Equal(field_ref, literal_int)
+        gt_expr = GreaterThan(field_ref, literal_int)
+
+        assert (
+            display_visitor.visit(And(eq_expr, gt_expr))
+            == "(field1 = 42 AND field1 > 42)"
+        )
+        assert (
+            display_visitor.visit(Or(eq_expr, gt_expr))
+            == "(field1 = 42 OR field1 > 42)"
+        )
+        assert display_visitor.visit(Not(eq_expr)) == "NOT (field1 = 42)"
+
+    def test_special_operator_display(self, field_ref, display_visitor):
+        assert display_visitor.visit(IsNull(field_ref)) == "(field1) IS NULL"
+
+        values = [Literal(pa.scalar(1)), Literal(pa.scalar(2)), Literal(pa.scalar(3))]
+        assert display_visitor.visit(In(field_ref, values)) == "field1 IN (1, 2, 3)"
+
+        lower = Literal(pa.scalar(10))
+        upper = Literal(pa.scalar(20))
+        assert (
+            display_visitor.visit(Between(field_ref, lower, upper))
+            == "field1 BETWEEN 10 AND 20"
+        )
+
+        pattern = Literal(pa.scalar("%test%"))
+        assert display_visitor.visit(Like(field_ref, pattern)) == "field1 LIKE %test%"
+
+    def test_complex_expression_display(self, field_ref, field_ref2, display_visitor):
+        expr = field_ref.eq(42).and_(field_ref2.gt(10)).or_(field_ref.is_null()).not_()
+
+        # Check that the DisplayVisitor correctly formats the complex expression
+        assert (
+            display_visitor.visit(expr)
+            == "NOT (((field1 = 42 AND field2 > 10) OR (field1) IS NULL))"
+        )
+
+    # Test BinaryExpression with_ methods
+    def test_binary_expression_with_methods(self, field_ref, field_ref2, literal_int):
+        eq_expr = Equal(field_ref, literal_int)
+
+        # Test with_left
+        new_expr = eq_expr.with_left(field_ref2)
+        assert isinstance(new_expr, Equal)
+        assert new_expr.left == field_ref2
+        assert new_expr.right == literal_int
+
+        # Test with_right
+        new_lit = Literal(pa.scalar(100))
+        new_expr = eq_expr.with_right(new_lit)
+        assert new_expr.left == field_ref
+        assert new_expr.right == new_lit
+
+    # Test __str__ method which uses DisplayVisitor
+    def test_expression_str_method(self, field_ref, literal_int):
+        eq_expr = Equal(field_ref, literal_int)
+        assert str(eq_expr) == "field1 = 42"
+
+    # Test proper parenthesization in complex expressions
+    def test_nested_parentheses(self, field_ref, field_ref2, display_visitor):
+        # Create a complex expression: (field1 = 1 AND field2 = 2) OR field2 = 3
+        expr1 = Equal(field_ref, Literal(pa.scalar(1)))
+        expr2 = Equal(field_ref2, Literal(pa.scalar(2)))
+        expr3 = Equal(field_ref2, Literal(pa.scalar(3)))
+
+        and_expr = And(expr1, expr2)
+        or_expr = Or(and_expr, expr3)
+
+        assert (
+            display_visitor.visit(or_expr)
+            == "((field1 = 1 AND field2 = 2) OR field2 = 3)"
+        )
+
+    # Test Literal comparison methods
+    def test_literal_comparison_methods(self, literal_int):
+        eq_expr = literal_int.eq("test")
+        assert isinstance(eq_expr, Equal)
+        assert eq_expr.left == literal_int
+        assert eq_expr.right.value.as_py() == "test"
+
+        ne_expr = literal_int.ne("test")
+        assert isinstance(ne_expr, NotEqual)
+        assert ne_expr.left == literal_int
+        assert ne_expr.right.value.as_py() == "test"
+
+    # Test a custom ExpressionVisitor implementation
+    def test_custom_visitor(self, field_ref, literal_int):
+        class CountingVisitor(ExpressionVisitor[None, int]):
+            """Simple visitor that counts expression nodes"""
+
+            def visit_reference(self, expr, context=None):
+                return 1
+
+            def visit_literal(self, expr, context=None):
+                return 1
+
+            def visit_binary_expression(self, expr, left, right, context=None):
+                return left + right + 1
+
+            def visit_unary_expression(self, expr, operand, context=None):
+                return operand + 1
+
+            def visit_in(self, expr, context=None):
+                return 1 + len(expr.values) + 1  # value + all values + In operator
+
+            def visit_between(self, expr, context=None):
+                return 3  # value + lower + upper
+
+            def visit_like(self, expr, context=None):
+                return 2  # value + pattern
+
+        visitor = CountingVisitor()
+
+        # Count nodes in simple expressions
+        assert visitor.visit(field_ref) == 1
+        assert visitor.visit(literal_int) == 1
+        assert visitor.visit(Equal(field_ref, literal_int)) == 3  # left + right + Equal
+
+        # Count nodes in a more complex expression
+        expr = field_ref.eq(42).and_(field_ref.gt(10))
+        assert visitor.visit(expr) == 7  # (1+1+1) + (1+1+1) + 1
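
This new test suite is the only view of the deltacat.storage.model.expression package that appears in this diff. A small usage sketch derived strictly from the behaviors the tests assert (chaining helpers on Reference, rendering via DisplayVisitor):

    from deltacat.storage.model.expression import Reference
    from deltacat.storage.model.expression.visitor import DisplayVisitor

    # Build a predicate with the chaining helpers exercised in the tests.
    pred = Reference("field1").eq(42).and_(Reference("field2").gt(10))

    # DisplayVisitor renders it exactly as the display tests assert.
    assert DisplayVisitor().visit(pred) == "(field1 = 42 AND field2 > 10)"
    # str(pred) uses the same DisplayVisitor rendering (see test_expression_str_method).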

deltacat/tests/storage/rivulet/fs/test_file_location_provider.py
@@ -2,7 +2,8 @@ import pytest
 
 import pyarrow as pa
 import pyarrow.parquet as pq
-from deltacat import Datatype, Dataset
+from deltacat.storage.rivulet.schema.datatype import Datatype
+from deltacat.storage.rivulet.dataset import Dataset
 from deltacat.storage.rivulet import Schema, Field
 from deltacat.utils.metafile_locator import _find_partition_path
 

deltacat/tests/storage/rivulet/test_dataset.py
@@ -57,7 +57,7 @@ def test_dataset_creation_metadata_structure(tmp_path):
     dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
 
     assert dataset._metadata_folder.startswith(".riv-meta")
-    assert dataset._namespace == "DEFAULT"
+    assert dataset._namespace == "default"
     assert dataset.dataset_name == "test_dataset"
     assert dataset._metadata_path == str(tmp_path / ".riv-meta-test_dataset")
 

deltacat/tests/storage/rivulet/test_manifest.py
@@ -2,7 +2,7 @@ import os
 
 import pytest
 
-from deltacat import Dataset
+from deltacat.storage.rivulet.dataset import Dataset
 from deltacat.storage.rivulet.fs.file_store import FileStore
 from deltacat.storage.rivulet.schema.datatype import Datatype
 from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
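
Two of the rivulet test updates above switch to importing Dataset and Datatype from their defining modules rather than the top-level deltacat package (the remaining rivulet change simply expects the default namespace to be lower-cased "default"). A minimal before/after of the import style used by the 2.0.0b10 tests; whether the old top-level aliases still exist elsewhere is not shown in this diff:

    # 2.0.0b9 test imports:
    #   from deltacat import Dataset, Datatype
    # 2.0.0b10 test imports:
    from deltacat.storage.rivulet.dataset import Dataset
    from deltacat.storage.rivulet.schema.datatype import Datatype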