deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (128)
  1. deltacat/__init__.py +19 -15
  2. deltacat/benchmarking/benchmark_engine.py +4 -2
  3. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  4. deltacat/catalog/__init__.py +62 -5
  5. deltacat/catalog/main/impl.py +18 -8
  6. deltacat/catalog/model/catalog.py +111 -73
  7. deltacat/catalog/model/properties.py +25 -22
  8. deltacat/compute/jobs/client.py +7 -5
  9. deltacat/constants.py +1 -2
  10. deltacat/env.py +10 -0
  11. deltacat/examples/basic_logging.py +1 -3
  12. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  13. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  14. deltacat/examples/indexer/indexer.py +2 -2
  15. deltacat/examples/indexer/job_runner.py +1 -2
  16. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  17. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  18. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  19. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  20. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  21. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  22. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  23. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  24. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  25. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  26. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  27. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  28. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  29. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  30. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  31. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  32. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  33. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  34. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  35. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  36. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  37. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  38. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  39. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  40. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  41. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  42. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  43. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  44. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  45. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  46. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  47. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  49. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  50. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  51. deltacat/io/reader/deltacat_read_api.py +1 -1
  52. deltacat/storage/model/shard.py +6 -2
  53. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  54. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  55. deltacat/tests/catalog/model/__init__.py +0 -0
  56. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  57. deltacat/tests/catalog/test_catalogs.py +52 -98
  58. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  59. deltacat/tests/daft/__init__.py +0 -0
  60. deltacat/tests/daft/test_model.py +97 -0
  61. deltacat/tests/experimental/__init__.py +0 -0
  62. deltacat/tests/experimental/catalog/__init__.py +0 -0
  63. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  64. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  65. deltacat/tests/experimental/daft/__init__.py +0 -0
  66. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  67. deltacat/tests/experimental/storage/__init__.py +0 -0
  68. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  69. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  70. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  71. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  72. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  73. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  74. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  75. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  76. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  77. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  78. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  79. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  80. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  81. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  82. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  83. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  84. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  85. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  86. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  87. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  88. deltacat/tests/storage/model/test_shard.py +3 -1
  89. deltacat/types/media.py +3 -3
  90. deltacat/utils/daft.py +530 -4
  91. deltacat/utils/export.py +3 -1
  92. deltacat/utils/url.py +1 -1
  93. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
  94. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
  95. deltacat/catalog/iceberg/__init__.py +0 -4
  96. deltacat/daft/daft_scan.py +0 -115
  97. deltacat/daft/model.py +0 -258
  98. deltacat/daft/translator.py +0 -126
  99. deltacat/examples/common/fixtures.py +0 -15
  100. deltacat/storage/rivulet/__init__.py +0 -11
  101. deltacat/storage/rivulet/feather/__init__.py +0 -5
  102. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  103. /deltacat/{daft → examples/experimental}/__init__.py +0 -0
  104. /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
  105. /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
  106. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  107. /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
  108. /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
  109. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  110. /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  111. /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
  112. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  113. /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  114. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  115. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  116. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  117. /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  118. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  119. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  120. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  121. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
  122. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
  123. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  124. /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
  125. /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
  126. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  127. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  128. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
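
Taken together, the renames above move the Iceberg, Rivulet, and Daft integration modules under deltacat.experimental, which changes downstream import paths. A minimal sketch of the resulting migration: the QueryExpression path is taken from the deltacat/utils/export.py hunk below, while the Dataset symbol is an assumption based on the renamed dataset.py module, not something shown in this diff.

    # 2.0.0b10 import paths (removed by this release):
    # from deltacat.storage.rivulet.dataset import Dataset
    # from deltacat.storage.rivulet.reader.query_expression import QueryExpression

    # 2.0.0b11 import paths, per the renames above:
    from deltacat.experimental.storage.rivulet.dataset import Dataset
    from deltacat.experimental.storage.rivulet.reader.query_expression import (
        QueryExpression,
    )
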
deltacat/utils/daft.py CHANGED
@@ -1,13 +1,42 @@
  import logging
- from typing import Optional, List, Any, Dict, Callable
+ from typing import Optional, List, Any, Dict, Callable, Iterator
+
+ from daft.daft import (
+     StorageConfig,
+     PartitionField,
+     Pushdowns as DaftRustPushdowns,
+     ScanTask,
+     FileFormatConfig,
+     ParquetSourceConfig,
+     PartitionTransform as DaftTransform,
+     PartitionField as DaftPartitionField,
+ )
+ from daft.expressions import Expression as DaftExpression
+ from daft.expressions.visitor import PredicateVisitor
+ from pyarrow import Field as PaField
+
  import daft
  import ray
- from daft import TimeUnit, DataFrame
+ from daft import (
+     TimeUnit,
+     DataFrame,
+     Schema as DaftSchema,
+     DataType,
+ )
+ from daft.logical.schema import Field as DaftField
  from daft.recordbatch import read_parquet_into_pyarrow
- from daft.io import IOConfig, S3Config
+ from daft.io import (
+     IOConfig,
+     S3Config,
+ )
+ from daft.io.scan import (
+     ScanOperator,
+     make_partition_field,
+ )
  import pyarrow as pa

  from deltacat import logs
+ from deltacat.catalog.model.table_definition import TableDefinition
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.schema import coerce_pyarrow_table_to_schema
  from deltacat.types.media import ContentType, ContentEncoding
@@ -22,11 +51,180 @@ from deltacat.utils.performance import timed_invocation
  from deltacat.types.partial_download import (
      PartialFileDownloadParams,
  )
-
+ from deltacat.storage import (
+     Transform,
+     IdentityTransform,
+     HourTransform,
+     DayTransform,
+     MonthTransform,
+     YearTransform,
+     BucketTransform,
+     BucketingStrategy,
+     TruncateTransform,
+     PartitionKey,
+     Schema,
+ )
+ from deltacat.storage.model.interop import ModelMapper
+ from deltacat.storage.model.expression import (
+     Expression,
+     Reference,
+     Literal,
+     Equal,
+     NotEqual,
+     GreaterThan,
+     LessThan,
+     GreaterThanEqual,
+     LessThanEqual,
+     And,
+     Or,
+     Not,
+     IsNull,
+ )
+ from deltacat.storage.model.scan.push_down import (
+     PartitionFilter,
+     Pushdown as DeltaCatPushdown,
+ )

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+ def translate_pushdown(pushdown: DaftRustPushdowns) -> DeltaCatPushdown:
+     """
+     Helper method to translate a Daft Pushdowns object into a DeltaCAT Pushdown.
+
+     Args:
+         pushdown: Daft Pushdowns object
+
+     Returns:
+         Pushdown: DeltaCAT Pushdown object with translated filters
+     """
+     translator = DaftToDeltacatVisitor()
+
+     partition_filters = None
+     if pushdown.partition_filters is not None:
+         daft_expr = DaftExpression._from_pyexpr(pushdown.partition_filters)
+         partition_filters = PartitionFilter.of(translator.visit(daft_expr))
+
+     filters = None
+     if pushdown.filters is not None:
+         daft_expr = DaftExpression._from_pyexpr(pushdown.filters)
+         # TODO: support deltacat row filters
+         # filters = RowFilter.of(translator.visit(daft_expr))
+
+     columns = None
+     limit = None
+
+     return DeltaCatPushdown.of(
+         partition_filter=partition_filters,
+         column_filter=columns,
+         row_filter=filters,
+         limit=limit,
+     )
+
+
+ class DaftToDeltacatVisitor(PredicateVisitor[Expression]):
+     """PredicateVisitor implementation to translate Daft Expressions into DeltaCAT Expressions"""
+
+     def visit_col(self, name: str) -> Expression:
+         return Reference.of(name)
+
+     def visit_lit(self, value: Any) -> Expression:
+         return Literal.of(value)
+
+     def visit_cast(self, expr: DaftExpression, dtype: DataType) -> Expression:
+         # DeltaCAT expressions do not support explicit casting;
+         # pyarrow should handle any type casting
+         return self.visit(expr)
+
+     def visit_alias(self, expr: DaftExpression, alias: str) -> Expression:
+         return self.visit(expr)
+
+     def visit_function(self, name: str, args: List[DaftExpression]) -> Expression:
+         # TODO: Add DeltaCAT expression function support
+         raise ValueError("Function not supported")
+
+     def visit_and(self, left: DaftExpression, right: DaftExpression) -> Expression:
+         """Visit an 'and' expression."""
+         return And.of(self.visit(left), self.visit(right))
+
+     def visit_or(self, left: DaftExpression, right: DaftExpression) -> Expression:
+         """Visit an 'or' expression."""
+         return Or.of(self.visit(left), self.visit(right))
+
+     def visit_not(self, expr: DaftExpression) -> Expression:
+         """Visit a 'not' expression."""
+         return Not.of(self.visit(expr))
+
+     def visit_equal(self, left: DaftExpression, right: DaftExpression) -> Expression:
+         """Visit an 'equals' comparison predicate."""
+         return Equal.of(self.visit(left), self.visit(right))
+
+     def visit_not_equal(
+         self, left: DaftExpression, right: DaftExpression
+     ) -> Expression:
+         """Visit a 'not equals' comparison predicate."""
+         return NotEqual.of(self.visit(left), self.visit(right))
+
+     def visit_less_than(
+         self, left: DaftExpression, right: DaftExpression
+     ) -> Expression:
+         """Visit a 'less than' comparison predicate."""
+         return LessThan.of(self.visit(left), self.visit(right))
+
+     def visit_less_than_or_equal(
+         self, left: DaftExpression, right: DaftExpression
+     ) -> Expression:
+         """Visit a 'less than or equal' comparison predicate."""
+         return LessThanEqual.of(self.visit(left), self.visit(right))
+
+     def visit_greater_than(
+         self, left: DaftExpression, right: DaftExpression
+     ) -> Expression:
+         """Visit a 'greater than' comparison predicate."""
+         return GreaterThan.of(self.visit(left), self.visit(right))
+
+     def visit_greater_than_or_equal(
+         self, left: DaftExpression, right: DaftExpression
+     ) -> Expression:
+         """Visit a 'greater than or equal' comparison predicate."""
+         return GreaterThanEqual.of(self.visit(left), self.visit(right))
+
+     def visit_between(
+         self, expr: DaftExpression, lower: DaftExpression, upper: DaftExpression
+     ) -> Expression:
+         """Visit a 'between' predicate."""
+         # Implement BETWEEN as lower <= expr <= upper
+         lower_bound = LessThanEqual.of(self.visit(lower), self.visit(expr))
+         upper_bound = LessThanEqual.of(self.visit(expr), self.visit(upper))
+         return And.of(lower_bound, upper_bound)
+
+     def visit_is_in(
+         self, expr: DaftExpression, items: list[DaftExpression]
+     ) -> Expression:
+         """Visit an 'is_in' predicate."""
+         # For an empty list, return a false literal
+         if not items:
+             return Literal(pa.scalar(False))
+
+         # Implement IN as a series of equality checks combined with OR
+         visited_expr = self.visit(expr)
+         equals_exprs = [Equal.of(visited_expr, self.visit(item)) for item in items]
+
+         # Combine with OR
+         result = equals_exprs[0]
+         for eq_expr in equals_exprs[1:]:
+             result = Or.of(result, eq_expr)
+
+         return result
+
+     def visit_is_null(self, expr: DaftExpression) -> Expression:
+         """Visit an 'is_null' predicate."""
+         return IsNull.of(self.visit(expr))
+
+     def visit_not_null(self, expr: DaftExpression) -> Expression:
+         """Visit a 'not_null' predicate."""
+         # NOT NULL is implemented as NOT(IS NULL)
+         return Not.of(IsNull.of(self.visit(expr)))
+
+
  def s3_files_to_dataframe(
      uris: List[str],
      content_type: str,
@@ -167,3 +365,331 @@ def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
              read_timeout_ms=10_000,  # Timeout for first byte from server
          )
      )
+
+
+ class DeltaCatScanOperator(ScanOperator):
+     def __init__(self, table: TableDefinition, storage_config: StorageConfig) -> None:
+         super().__init__()
+         self.table = table
+         self._schema = self._infer_schema()
+         self.partition_keys = self._infer_partition_keys()
+         self.storage_config = storage_config
+
+     def schema(self) -> DaftSchema:
+         return self._schema
+
+     def name(self) -> str:
+         return "DeltaCatScanOperator"
+
+     def display_name(self) -> str:
+         return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
+
+     def partitioning_keys(self) -> list[PartitionField]:
+         return self.partition_keys
+
+     def multiline_display(self) -> list[str]:
+         return [
+             self.display_name(),
+             f"Schema = {self._schema}",
+             f"Partitioning keys = {self.partitioning_keys}",
+             f"Storage config = {self.storage_config}",
+         ]
+
+     def to_scan_tasks(self, pushdowns: DaftRustPushdowns) -> Iterator[ScanTask]:
+         dc_pushdown = translate_pushdown(pushdowns)
+         dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
+         scan_tasks = []
+         file_format_config = FileFormatConfig.from_parquet_config(
+             # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+             ParquetSourceConfig()
+         )
+         for dc_scan_task in dc_scan_plan.scan_tasks:
+             for data_file in dc_scan_task.data_files():
+                 st = ScanTask.catalog_scan_task(
+                     file=data_file.file_path,
+                     file_format=file_format_config,
+                     schema=self._schema._schema,
+                     storage_config=self.storage_config,
+                     pushdowns=pushdowns,
+                 )
+                 scan_tasks.append(st)
+         return iter(scan_tasks)
+
+     def can_absorb_filter(self) -> bool:
+         return False
+
+     def can_absorb_limit(self) -> bool:
+         return False
+
+     def can_absorb_select(self) -> bool:
+         return True
+
+     def _infer_schema(self) -> DaftSchema:
+         if not (
+             self.table and self.table.table_version and self.table.table_version.schema
+         ):
+             raise RuntimeError(
+                 f"Failed to infer schema for DeltaCAT Table "
+                 f"{self.table.table.namespace}.{self.table.table.table_name}"
+             )
+
+         return DaftSchema.from_pyarrow_schema(self.table.table_version.schema.arrow)
+
+     def _infer_partition_keys(self) -> list[PartitionField]:
+         if not (
+             self.table
+             and self.table.table_version
+             and self.table.table_version.partition_scheme
+             and self.table.table_version.schema
+         ):
+             raise RuntimeError(
+                 f"Failed to infer partition keys for DeltaCAT Table "
+                 f"{self.table.table.namespace}.{self.table.table.table_name}"
+             )
+
+         schema = self.table.table_version.schema
+         partition_keys = self.table.table_version.partition_scheme.keys
+         if not partition_keys:
+             return []
+
+         partition_fields = []
+         for key in partition_keys:
+             field = DaftPartitionKeyMapper.unmap(key, schema)
+             # Assert that the returned value is not None.
+             assert field is not None, f"Unmapping failed for key {key}"
+             partition_fields.append(field)
+
+         return partition_fields
+
+
+ class DaftFieldMapper(ModelMapper[DaftField, PaField]):
+     @staticmethod
+     def map(
+         obj: Optional[DaftField],
+         **kwargs,
+     ) -> Optional[PaField]:
+         """Convert Daft Field to PyArrow Field.
+
+         Args:
+             obj: The Daft Field to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted PyArrow Field object
+         """
+         if obj is None:
+             return None
+
+         return pa.field(
+             name=obj.name,
+             type=obj.dtype.to_arrow_dtype(),
+         )
+
+     @staticmethod
+     def unmap(
+         obj: Optional[PaField],
+         **kwargs,
+     ) -> Optional[DaftField]:
+         """Convert PyArrow Field to Daft Field.
+
+         Args:
+             obj: The PyArrow Field to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted Daft Field object
+         """
+         if obj is None:
+             return None
+
+         return DaftField.create(
+             name=obj.name,
+             dtype=DataType.from_arrow_type(obj.type),  # type: ignore
+         )
+
+
+ class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
+     @staticmethod
+     def map(
+         obj: Optional[DaftTransform],
+         **kwargs,
+     ) -> Optional[Transform]:
+         """Convert DaftTransform to DeltaCAT Transform.
+
+         Args:
+             obj: The DaftTransform to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted Transform object
+         """
+         # daft.PartitionTransform doesn't have a Python interface for accessing
+         # its attributes, thus conversion is not possible.
+         # TODO: request Daft to expose a Python-friendly interface for
+         # daft.PartitionTransform
+         raise NotImplementedError(
+             "Converting transform from Daft to DeltaCAT is not supported"
+         )
+
+     @staticmethod
+     def unmap(
+         obj: Optional[Transform],
+         **kwargs,
+     ) -> Optional[DaftTransform]:
+         """Convert DeltaCAT Transform to DaftTransform.
+
+         Args:
+             obj: The Transform to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted DaftTransform object
+         """
+         if obj is None:
+             return None
+
+         # Map DeltaCAT transforms to Daft transforms using isinstance
+         if isinstance(obj, IdentityTransform):
+             return DaftTransform.identity()
+         elif isinstance(obj, HourTransform):
+             return DaftTransform.hour()
+         elif isinstance(obj, DayTransform):
+             return DaftTransform.day()
+         elif isinstance(obj, MonthTransform):
+             return DaftTransform.month()
+         elif isinstance(obj, YearTransform):
+             return DaftTransform.year()
+         elif isinstance(obj, BucketTransform):
+             if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
+                 return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
+             else:
+                 raise ValueError(
+                     f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
+                 )
+         elif isinstance(obj, TruncateTransform):
+             return DaftTransform.iceberg_truncate(obj.parameters.width)
+
+         raise ValueError(f"Unsupported Transform: {obj}")
+
+
+ class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
+     @staticmethod
+     def map(
+         obj: Optional[DaftPartitionField],
+         schema: Optional[DaftSchema] = None,
+         **kwargs,
+     ) -> Optional[PartitionKey]:
+         """Convert DaftPartitionField to PartitionKey.
+
+         Args:
+             obj: The DaftPartitionField to convert
+             schema: The Daft schema containing field information
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted PartitionKey object
+         """
+         # Daft PartitionField only exposes 1 attribute `field`, which is not
+         # enough to convert to a DeltaCAT PartitionKey.
+         # TODO: request Daft to expose a more Python-friendly interface for
+         # PartitionField
+         raise NotImplementedError(
+             "Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
+         )
+
+     @staticmethod
+     def unmap(
+         obj: Optional[PartitionKey],
+         schema: Optional[Schema] = None,
+         **kwargs,
+     ) -> Optional[DaftPartitionField]:
+         """Convert PartitionKey to DaftPartitionField.
+
+         Args:
+             obj: The DeltaCAT PartitionKey to convert
+             schema: The Schema containing field information
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted DaftPartitionField object
+         """
+         if obj is None:
+             return None
+         if obj.name is None:
+             raise ValueError("Name is required for PartitionKey conversion")
+         if not schema:
+             raise ValueError("Schema is required for PartitionKey conversion")
+         if len(obj.key) < 1:
+             raise ValueError(
+                 f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
+             )
+
+         # Get the source field from the schema - the FieldLocator in
+         # PartitionKey.key points to the source field of the partition field
+         dc_source_field = schema.field(obj.key[0]).arrow
+         daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
+         # Convert transform if present
+         daft_transform = DaftTransformMapper.unmap(obj.transform)
+         daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
+             partition_field_name=obj.name,
+             daft_source_field=daft_source_field,
+             dc_transform=obj.transform,
+         )
+
+         # Create DaftPartitionField
+         return make_partition_field(
+             field=daft_partition_field,
+             source_field=daft_source_field,
+             transform=daft_transform,
+         )
+
+     @staticmethod
+     def get_daft_partition_field(
+         partition_field_name: str,
+         daft_source_field: Optional[DaftField],
+         # TODO: replace the DeltaCAT transform with a Daft Transform for
+         # uniformity. We cannot use a Daft Transform here because Daft
+         # Transform doesn't have a Python interface for us to access its
+         # attributes.
+         # TODO: request Daft to provide a more Python-friendly interface for
+         # Daft Transform
+         dc_transform: Optional[Transform],
+     ) -> DaftField:
+         """Generate a Daft Partition Field given a partition field name, source
+         field, and transform. The partition field type is inferred using the
+         source field type and transform.
+
+         Args:
+             partition_field_name (str): the specified result field name
+             daft_source_field (DaftField): the source field of the partition field
+             dc_transform (Transform): transform applied on the source field to create the partition field
+
+         Returns:
+             DaftField: Daft Field representing the partition field
+         """
+         if daft_source_field is None:
+             raise ValueError("Source field is required for PartitionField conversion")
+         if dc_transform is None:
+             raise ValueError("Transform is required for PartitionField conversion")
+
+         result_type = None
+         # The type conversion logic below references Daft - Iceberg conversion logic:
+         # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
+         if isinstance(dc_transform, IdentityTransform):
+             result_type = daft_source_field.dtype
+         elif isinstance(dc_transform, YearTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, MonthTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, DayTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, HourTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, BucketTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, TruncateTransform):
+             result_type = daft_source_field.dtype
+         else:
+             raise ValueError(f"Unsupported transform: {dc_transform}")
+
+         return DaftField.create(
+             name=partition_field_name,
+             dtype=result_type,
+         )
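
The new DaftToDeltacatVisitor walks a Daft predicate tree and rebuilds it from DeltaCAT expression nodes; translate_pushdown then wraps the result in a PartitionFilter for DeltaCatScanOperator.to_scan_tasks. A minimal sketch of the translation on a hand-built predicate, assuming (as translate_pushdown does) that the visitor's visit() accepts any boolean daft Expression; the printed repr is illustrative, not the exact deltacat.storage.model.expression output:

    import daft
    from deltacat.utils.daft import DaftToDeltacatVisitor

    # Build a Daft predicate with public Daft APIs, then translate it
    # into a DeltaCAT expression tree.
    predicate = (daft.col("year") >= 2020) & daft.col("region").is_null()
    dc_expr = DaftToDeltacatVisitor().visit(predicate)
    # Roughly: And(GreaterThanEqual(Reference("year"), Literal(2020)),
    #              IsNull(Reference("region")))
    print(dc_expr)
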
deltacat/utils/export.py CHANGED
@@ -5,7 +5,9 @@ import pyarrow.parquet
  import pyarrow.feather
  from typing import Callable, Dict

- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
+     QueryExpression,
+ )
  from deltacat import logs

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
deltacat/utils/url.py CHANGED
@@ -632,7 +632,7 @@ class DeltaCatUrl:
      Note that, for reads, each of the above URLs typically resolves directly
      to the equivalent :class:`deltacat.types.media.DatasetType` reader. For
      example, if Ray Data is the dataset type then the equivalent
-     ray.data.read_{} API is used. For example, a read referencing a URL of the
+     ray.data.read_{} API is used. In this case, a read referencing a URL of the
      form "audio+file:///my/audio.mp4" would resolve to a call to
      ray.data.read_audio("/my/audio.mp4").
      """
{deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 2.0.0b10
+ Version: 2.0.0b11
  Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
@@ -19,6 +19,7 @@ Requires-Dist: aws-embedded-metrics==3.2.0
  Requires-Dist: boto3~=1.34
  Requires-Dist: google-cloud-storage
  Requires-Dist: gcsfs==2025.3.2
+ Requires-Dist: daft==0.4.15
  Requires-Dist: intervaltree==3.1.0
  Requires-Dist: numpy==1.22.4
  Requires-Dist: pandas==2.2.3
@@ -26,7 +27,7 @@ Requires-Dist: polars==1.28.1
  Requires-Dist: pyarrow==16.0.0
  Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3
  Requires-Dist: pymemcache==4.0.0
- Requires-Dist: ray[default]==2.43.0
+ Requires-Dist: ray[default]==2.46.0
  Requires-Dist: tenacity==8.2.3
  Requires-Dist: typing-extensions==4.6.1
  Requires-Dist: redis==4.6.0
@@ -61,8 +62,6 @@ DeltaCAT provides four high-level components:

  DeltaCAT is rapidly evolving. Usage instructions will be posted here soon!

- For now, feel free to peruse some of our examples:
- * https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/rivulet
- * https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/iceberg
+ For now, feel free to peruse some of our [examples](https://github.com/ray-project/deltacat/tree/2.0/deltacat/examples/).
