deltacat 2.0.0b6__py3-none-any.whl → 2.0.0b9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -67,7 +67,7 @@ if importlib.util.find_spec("pyiceberg") is not None:

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "2.0.0b6"
+ __version__ = "2.0.0b9"


  __all__ = [
deltacat/daft/__init__.py ADDED
File without changes
deltacat/daft/daft_scan.py ADDED
@@ -0,0 +1,111 @@
+ from typing import Iterator
+
+ from daft import Schema
+ from daft.daft import (
+     StorageConfig,
+     PartitionField,
+     Pushdowns,
+     ScanTask,
+     FileFormatConfig,
+     ParquetSourceConfig,
+ )
+ from daft.io.scan import ScanOperator
+
+ from deltacat.catalog.model.table_definition import TableDefinition
+ from deltacat.daft.model import DaftPartitionKeyMapper
+
+
+ class DeltaCatScanOperator(ScanOperator):
+     def __init__(self, table: TableDefinition, storage_config: StorageConfig) -> None:
+         super().__init__()
+         self.table = table
+         self._schema = self._infer_schema()
+         self.partition_keys = self._infer_partition_keys()
+         self.storage_config = storage_config
+
+     def schema(self) -> Schema:
+         return self._schema
+
+     def name(self) -> str:
+         return "DeltaCatScanOperator"
+
+     def display_name(self) -> str:
+         return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
+
+     def partitioning_keys(self) -> list[PartitionField]:
+         return self.partition_keys
+
+     def multiline_display(self) -> list[str]:
+         return [
+             self.display_name(),
+             f"Schema = {self._schema}",
+             f"Partitioning keys = {self.partitioning_keys}",
+             f"Storage config = {self.storage_config}",
+         ]
+
+     def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]:
+         # TODO: implement pushdown predicate on DeltaCAT
+         dc_scan_plan = self.table.create_scan_plan()
+         scan_tasks = []
+         file_format_config = FileFormatConfig.from_parquet_config(
+             # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+             ParquetSourceConfig()
+         )
+         for dc_scan_task in dc_scan_plan.scan_tasks:
+             for data_file in dc_scan_task.data_files():
+                 st = ScanTask.catalog_scan_task(
+                     file=data_file.file_path,
+                     file_format=file_format_config,
+                     schema=self._schema._schema,
+                     storage_config=self.storage_config,
+                     pushdowns=pushdowns,
+                 )
+                 scan_tasks.append(st)
+         return iter(scan_tasks)
+
+     def can_absorb_filter(self) -> bool:
+         return False
+
+     def can_absorb_limit(self) -> bool:
+         return False
+
+     def can_absorb_select(self) -> bool:
+         return True
+
+     def _infer_schema(self) -> Schema:
+
+         if not (
+             self.table and self.table.table_version and self.table.table_version.schema
+         ):
+             raise RuntimeError(
+                 f"Failed to infer schema for DeltaCAT Table "
+                 f"{self.table.table.namespace}.{self.table.table.table_name}"
+             )
+
+         return Schema.from_pyarrow_schema(self.table.table_version.schema.arrow)
+
+     def _infer_partition_keys(self) -> list[PartitionField]:
+         if not (
+             self.table
+             and self.table.table_version
+             and self.table.table_version.partition_scheme
+             and self.table.table_version.schema
+         ):
+             raise RuntimeError(
+                 f"Failed to infer partition keys for DeltaCAT Table "
+                 f"{self.table.table.namespace}.{self.table.table.table_name}"
+             )
+
+         schema = self.table.table_version.schema
+         partition_keys = self.table.table_version.partition_scheme.keys
+         if not partition_keys:
+             return []
+
+         partition_fields = []
+         for key in partition_keys:
+             field = DaftPartitionKeyMapper.unmap(key, schema)
+             # Assert that the returned value is not None.
+             assert field is not None, f"Unmapping failed for key {key}"
+             partition_fields.append(field)
+
+         return partition_fields
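For orientation, the sketch below shows one way the new DeltaCatScanOperator might be exercised directly. It is a hypothetical usage sketch, not part of this release: the table_definition and storage_config values are assumed to come from the caller (for example via DeltaCAT catalog APIs), and only methods defined in the file above are invoked.

    # Hypothetical usage sketch; `table_definition` and `storage_config` are
    # assumed to be obtained elsewhere (e.g. from DeltaCAT catalog APIs).
    from deltacat.daft.daft_scan import DeltaCatScanOperator

    def describe_scan(table_definition, storage_config) -> None:
        op = DeltaCatScanOperator(table_definition, storage_config)
        # Schema and partitioning keys are inferred from the table version.
        print(op.display_name())
        print(op.schema())
        print(op.partitioning_keys())
        # Pushdown support flags reported to Daft's planner.
        print(op.can_absorb_filter(), op.can_absorb_limit(), op.can_absorb_select())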
deltacat/daft/model.py ADDED
@@ -0,0 +1,258 @@
+ from typing import Optional
+
+ import pyarrow as pa
+ from pyarrow import Field as PaField
+ from daft import Schema as DaftSchema, DataType
+ from daft.daft import (
+     PartitionField as DaftPartitionField,
+     PartitionTransform as DaftTransform,
+ )
+ from daft.logical.schema import Field as DaftField
+ from daft.io.scan import make_partition_field
+
+ from deltacat.storage.model.schema import Schema
+ from deltacat.storage.model.interop import ModelMapper
+ from deltacat.storage.model.partition import PartitionKey
+ from deltacat.storage.model.transform import (
+     BucketingStrategy,
+     Transform,
+     BucketTransform,
+     HourTransform,
+     DayTransform,
+     MonthTransform,
+     YearTransform,
+     IdentityTransform,
+     TruncateTransform,
+ )
+
+
+ class DaftFieldMapper(ModelMapper[DaftField, PaField]):
+     @staticmethod
+     def map(
+         obj: Optional[DaftField],
+         **kwargs,
+     ) -> Optional[PaField]:
+         """Convert Daft Field to PyArrow Field.
+
+         Args:
+             obj: The Daft Field to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted PyArrow Field object
+         """
+         if obj is None:
+             return None
+
+         return pa.field(
+             name=obj.name,
+             type=obj.dtype.to_arrow_dtype(),
+         )
+
+     @staticmethod
+     def unmap(
+         obj: Optional[PaField],
+         **kwargs,
+     ) -> Optional[DaftField]:
+         """Convert PyArrow Field to Daft Field.
+
+         Args:
+             obj: The PyArrow Field to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted Daft Field object
+         """
+         if obj is None:
+             return None
+
+         return DaftField.create(
+             name=obj.name,
+             dtype=DataType.from_arrow_type(obj.type),  # type: ignore
+         )
+
+
+ class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
+     @staticmethod
+     def map(
+         obj: Optional[DaftTransform],
+         **kwargs,
+     ) -> Optional[Transform]:
+         """Convert DaftTransform to DeltaCAT Transform.
+
+         Args:
+             obj: The DaftTransform to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted Transform object
+         """
+
+         # daft.PartitionTransform doesn't have a Python interface for accessing its attributes,
+         # thus conversion is not possible.
+         # TODO: request Daft to expose a Python-friendly interface for daft.PartitionTransform
+         raise NotImplementedError(
+             "Converting transform from Daft to DeltaCAT is not supported"
+         )
+
+     @staticmethod
+     def unmap(
+         obj: Optional[Transform],
+         **kwargs,
+     ) -> Optional[DaftTransform]:
+         """Convert DeltaCAT Transform to DaftTransform.
+
+         Args:
+             obj: The Transform to convert
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted DaftTransform object
+         """
+         if obj is None:
+             return None
+
+         # Map DeltaCAT transforms to Daft transforms using isinstance
+
+         if isinstance(obj, IdentityTransform):
+             return DaftTransform.identity()
+         elif isinstance(obj, HourTransform):
+             return DaftTransform.hour()
+         elif isinstance(obj, DayTransform):
+             return DaftTransform.day()
+         elif isinstance(obj, MonthTransform):
+             return DaftTransform.month()
+         elif isinstance(obj, YearTransform):
+             return DaftTransform.year()
+         elif isinstance(obj, BucketTransform):
+             if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
+                 return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
+             else:
+                 raise ValueError(
+                     f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
+                 )
+         elif isinstance(obj, TruncateTransform):
+             return DaftTransform.iceberg_truncate(obj.parameters.width)
+
+         raise ValueError(f"Unsupported Transform: {obj}")
+
+
+ class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
+     @staticmethod
+     def map(
+         obj: Optional[DaftPartitionField],
+         schema: Optional[DaftSchema] = None,
+         **kwargs,
+     ) -> Optional[PartitionKey]:
+         """Convert DaftPartitionField to PartitionKey.
+
+         Args:
+             obj: The DaftPartitionField to convert
+             schema: The Daft schema containing field information
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted PartitionKey object
+         """
+         # Daft PartitionField only exposes 1 attribute `field`, which is not enough
+         # to convert to a DeltaCAT PartitionKey.
+         # TODO: request Daft to expose a more Python-friendly interface for PartitionField
+         raise NotImplementedError(
+             "Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
+         )
+
+     @staticmethod
+     def unmap(
+         obj: Optional[PartitionKey],
+         schema: Optional[Schema] = None,
+         **kwargs,
+     ) -> Optional[DaftPartitionField]:
+         """Convert PartitionKey to DaftPartitionField.
+
+         Args:
+             obj: The DeltaCAT PartitionKey to convert
+             schema: The Schema containing field information
+             **kwargs: Additional arguments
+
+         Returns:
+             Converted DaftPartitionField object
+         """
+         if obj is None:
+             return None
+         if obj.name is None:
+             raise ValueError("Name is required for PartitionKey conversion")
+         if not schema:
+             raise ValueError("Schema is required for PartitionKey conversion")
+         if len(obj.key) < 1:
+             raise ValueError(
+                 f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
+             )
+
+         # Get the source field from the schema - the FieldLocator in PartitionKey.key points to the source field of the partition field
+         dc_source_field = schema.field(obj.key[0]).arrow
+         daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
+         # Convert transform if present
+         daft_transform = DaftTransformMapper.unmap(obj.transform)
+         daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
+             partition_field_name=obj.name,
+             daft_source_field=daft_source_field,
+             dc_transform=obj.transform,
+         )
+
+         # Create DaftPartitionField
+         return make_partition_field(
+             field=daft_partition_field,
+             source_field=daft_source_field,
+             transform=daft_transform,
+         )
+
+     @staticmethod
+     def get_daft_partition_field(
+         partition_field_name: str,
+         daft_source_field: Optional[DaftField],
+         # TODO: replace the DeltaCAT transform with a Daft Transform for uniformity.
+         # We cannot use a Daft Transform here because Daft Transform doesn't have a Python interface for us to
+         # access its attributes.
+         # TODO: request Daft to provide a more Python-friendly interface for Daft Transform
+         dc_transform: Optional[Transform],
+     ) -> DaftField:
+         """Generate a Daft partition field given the partition field name, source field, and transform.
+         The partition field type is inferred from the source field type and the transform.
+
+         Args:
+             partition_field_name (str): the specified result field name
+             daft_source_field (DaftField): the source field of the partition field
+             dc_transform (Transform): the transform applied to the source field to create the partition field
+
+         Returns:
+             DaftField: Daft Field representing the partition field
+         """
+         if daft_source_field is None:
+             raise ValueError("Source field is required for PartitionField conversion")
+         if dc_transform is None:
+             raise ValueError("Transform is required for PartitionField conversion")
+
+         result_type = None
+         # The type conversion logic below references Daft's Iceberg conversion logic:
+         # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
+         if isinstance(dc_transform, IdentityTransform):
+             result_type = daft_source_field.dtype
+         elif isinstance(dc_transform, YearTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, MonthTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, DayTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, HourTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, BucketTransform):
+             result_type = DataType.int32()
+         elif isinstance(dc_transform, TruncateTransform):
+             result_type = daft_source_field.dtype
+         else:
+             raise ValueError(f"Unsupported transform: {dc_transform}")
+
+         return DaftField.create(
+             name=partition_field_name,
+             dtype=result_type,
+         )
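To make the field mapping above concrete, here is a short hedged round-trip sketch. It uses only calls that appear in the file above (pa.field, DaftFieldMapper.map/unmap); the column name and type are illustrative assumptions, not values from this release.

    # Hedged round-trip sketch: PyArrow field -> Daft field -> PyArrow field.
    import pyarrow as pa
    from deltacat.daft.model import DaftFieldMapper

    pa_field = pa.field("event_ts", pa.timestamp("us"))   # illustrative column
    daft_field = DaftFieldMapper.unmap(pa_field)           # PyArrow -> Daft
    pa_field_back = DaftFieldMapper.map(daft_field)        # Daft -> PyArrow
    assert pa_field_back is not None and pa_field_back.name == pa_field.name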
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 2.0.0b6
+ Version: 2.0.0b9
  Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
@@ -17,7 +17,7 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: aws-embedded-metrics==3.2.0
  Requires-Dist: boto3~=1.34
- Requires-Dist: getdaft>=0.4.11
+ Requires-Dist: daft>=0.4.13
  Requires-Dist: intervaltree==3.1.0
  Requires-Dist: numpy==1.21.5
  Requires-Dist: pandas==1.3.5
@@ -1,4 +1,4 @@
- deltacat/__init__.py,sha256=gMoUv3PoGXHlkitSzr_bWcMYUfc5o1nn8-LdHDzLStU,2474
+ deltacat/__init__.py,sha256=NEIWMnOyGxSspHbfzf3Q4K6q9OLaN7qD6rkKyg1ZxJk,2474
  deltacat/annotations.py,sha256=9lBi34DpIV_RPjCCK2Aiz_6nMyd-e-_CfQ1XtdRQQlM,1196
  deltacat/api.py,sha256=fYKurVlM97VKb_fh7kJ1rDcl-VAAuSflxPeqrsUt1u8,5257
  deltacat/constants.py,sha256=_JfHTRktDTM70Nls-LMnSmLeCRG17UwQYCmY6gQSGBg,3482
@@ -130,6 +130,9 @@ deltacat/compute/stats/models/delta_stats.py,sha256=hBith8_hbF9TVr6HocLAt6RJ_kZZ
  deltacat/compute/stats/models/delta_stats_cache_result.py,sha256=mbJYxpZd5jaER_BWrCD2hROFy3p1nNdBrj66nUpc6io,1624
  deltacat/compute/stats/models/manifest_entry_stats.py,sha256=NCDAe2nPDEI4kOkuwNkRFgGPS-rqQaQqLuaLoKk20KQ,2419
  deltacat/compute/stats/models/stats_result.py,sha256=XQAlmzhUqRmg4jzEMUAOqcYn1HUOBTMryBH1CCVlet8,3820
+ deltacat/daft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/daft/daft_scan.py,sha256=u0RpSZTujF9ScuFkXBLkEXfG2eMkoww5ypG2Eri0HrU,3778
+ deltacat/daft/model.py,sha256=6NaKkp9R0ruE0K2x-moyARNrQisswUl6TjMeA6YHtBM,9078
  deltacat/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/examples/basic_logging.py,sha256=IwUa-MwQbmH5vuzRvmz5NtfYXU2qNUID_0zkO5HlUZo,2826
  deltacat/examples/hello_world.py,sha256=hXpMUvJINB2qWTpV3QFPlRNu0uE31BvEs2sLyQ3CWZk,530
@@ -346,8 +349,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
- deltacat-2.0.0b6.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deltacat-2.0.0b6.dist-info/METADATA,sha256=USf1wawH_OzPK19QEAqDWvrHDYhymxpP6I4RHD97YsQ,2809
- deltacat-2.0.0b6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- deltacat-2.0.0b6.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
- deltacat-2.0.0b6.dist-info/RECORD,,
+ deltacat-2.0.0b9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-2.0.0b9.dist-info/METADATA,sha256=IYmkoVGKgSBUVj37A7br7kmJKno7yietFSwOSPAG8Og,2806
+ deltacat-2.0.0b9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ deltacat-2.0.0b9.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-2.0.0b9.dist-info/RECORD,,