deltacat 2.0.0b3__py3-none-any.whl → 2.0.0b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -67,7 +67,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "2.0.0b3"
+ __version__ = "2.0.0b7"
 
 
 __all__ = [
deltacat/catalog/iceberg/impl.py CHANGED
@@ -2,10 +2,13 @@ import logging
 
 from typing import Any, Dict, List, Optional, Union
 
- from daft import DataFrame
+ from daft import DataFrame, context
+ from daft.daft import ScanOperatorHandle, StorageConfig
+ from daft.logical.builder import LogicalPlanBuilder
 
 from deltacat import logs
 from deltacat.catalog.model.table_definition import TableDefinition
+ from deltacat.daft.daft_scan import DeltaCatScanOperator
 from deltacat.exceptions import TableAlreadyExistsError
 from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
 from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
@@ -144,7 +147,17 @@ def read_table(
 table: str, *args, namespace: Optional[str] = None, **kwargs
 ) -> DistributedDataset:
 """Read a table into a distributed dataset."""
- raise NotImplementedError("read_table not implemented")
+ # TODO: more proper IO configuration
+ io_config = context.get_context().daft_planning_config.default_io_config
+ multithreaded_io = context.get_context().get_or_create_runner().name != "ray"
+
+ storage_config = StorageConfig(multithreaded_io, io_config)
+
+ dc_table = get_table(name=table, namespace=namespace, **kwargs)
+ dc_scan_operator = DeltaCatScanOperator(dc_table, storage_config)
+ handle = ScanOperatorHandle.from_python_scan_operator(dc_scan_operator)
+ builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)
+ return DataFrame(builder)
 
 
 def alter_table(
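For reference, the new `read_table` path can be exercised roughly as follows. This is a minimal sketch assuming a catalog has already been initialized via `dc.init(...)`; the namespace and table names are hypothetical placeholders.

```python
import deltacat as dc

# Minimal sketch: read an existing DeltaCAT table into a lazy Daft DataFrame.
# Assumes dc.init(...) has already registered a catalog; names are placeholders.
df = dc.read_table(table="my_table", namespace="my_namespace")
df.show()
```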
deltacat/catalog/iceberg/overrides.py CHANGED
@@ -5,12 +5,11 @@ from typing import Iterator, List
 from pyarrow.fs import FileSystem
 
 from pyiceberg.io.pyarrow import (
- fill_parquet_file_metadata,
+ data_file_statistics_from_parquet_metadata,
 compute_statistics_plan,
 parquet_path_to_id_mapping,
 )
- from pyiceberg.table import Table, _MergingSnapshotProducer
- from pyiceberg.table.snapshots import Operation
+ from pyiceberg.table import Table
 from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
 from pyiceberg.types import StructType, NestedField, IntegerType
 from pyiceberg.typedef import Record
@@ -24,11 +23,10 @@ def append(table: Table, paths: List[str]) -> None:
 # raise ValueError("Cannot write to tables with a sort-order")
 
 data_files = write_file(table, paths)
- merge = _MergingSnapshotProducer(operation=Operation.APPEND, table=table)
- for data_file in data_files:
- merge.append_data_file(data_file)
-
- merge.commit()
+ with table.transaction() as txn:
+ with txn.update_snapshot().fast_append() as snapshot_update:
+ for data_file in data_files:
+ snapshot_update.append_data_file(data_file)
 
 
 def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
@@ -41,6 +39,11 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
 fs_path = fs_tuple[1]
 with fs.open_input_file(fs_path) as native_file:
 parquet_metadata = pq.read_metadata(native_file)
+ statistics = data_file_statistics_from_parquet_metadata(
+ parquet_metadata=parquet_metadata,
+ stats_columns=compute_statistics_plan(table.schema(), table.properties),
+ parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
+ )
 data_file = DataFile(
 content=DataFileContent.DATA,
 file_path=file_path,
@@ -63,12 +66,7 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
 spec_id=table.spec().spec_id,
 equality_ids=None,
 key_metadata=None,
- )
- fill_parquet_file_metadata(
- data_file=data_file,
- parquet_metadata=parquet_metadata,
- stats_columns=compute_statistics_plan(table.schema(), table.properties),
- parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
+ **statistics.to_serialized_dict(),
 )
 data_files.append(data_file)
 return data_files
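As a usage sketch, `append` can be driven from a pyiceberg table handle roughly like this. The catalog and table identifiers are hypothetical, and the Parquet files are assumed to already exist and match the table schema.

```python
from pyiceberg.catalog import load_catalog

# Hedged sketch: commit already-written Parquet files to an Iceberg table via
# the fast-append transaction path shown above. Names are hypothetical.
catalog = load_catalog("my_catalog")
table = catalog.load_table("my_namespace.my_table")
append(table, ["s3://my-bucket/my_table/data/part-00000.parquet"])
```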
deltacat/catalog/main/impl.py CHANGED
@@ -709,7 +709,7 @@ def _get_deltas_from_partition_filter(
 
 def _get_storage(**kwargs):
 """
- Returns the implementation of `deltacat.storage.interface` to use with this catalog.
+ Returns the implementation of `deltacat.storage.interface` to use with this catalog
 
 This is configured in the `CatalogProperties` stored during initialization and passed through `delegate.py`
 """
deltacat/daft/__init__.py ADDED (empty file; no content to display)
deltacat/daft/daft_scan.py ADDED
@@ -0,0 +1,111 @@
+ from typing import Iterator
+
+ from daft import Schema
+ from daft.daft import (
+ StorageConfig,
+ PartitionField,
+ Pushdowns,
+ ScanTask,
+ FileFormatConfig,
+ ParquetSourceConfig,
+ )
+ from daft.io.scan import ScanOperator
+
+ from deltacat.catalog.model.table_definition import TableDefinition
+ from deltacat.daft.model import DaftPartitionKeyMapper
+
+
+ class DeltaCatScanOperator(ScanOperator):
+ def __init__(self, table: TableDefinition, storage_config: StorageConfig) -> None:
+ super().__init__()
+ self.table = table
+ self._schema = self._infer_schema()
+ self.partition_keys = self._infer_partition_keys()
+ self.storage_config = storage_config
+
+ def schema(self) -> Schema:
+ return self._schema
+
+ def name(self) -> str:
+ return "DeltaCatScanOperator"
+
+ def display_name(self) -> str:
+ return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
+
+ def partitioning_keys(self) -> list[PartitionField]:
+ return self.partition_keys
+
+ def multiline_display(self) -> list[str]:
+ return [
+ self.display_name(),
+ f"Schema = {self._schema}",
+ f"Partitioning keys = {self.partitioning_keys}",
+ f"Storage config = {self.storage_config}",
+ ]
+
+ def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]:
+ # TODO: implement pushdown predicate on DeltaCAT
+ dc_scan_plan = self.table.create_scan_plan()
+ scan_tasks = []
+ file_format_config = FileFormatConfig.from_parquet_config(
+ # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+ ParquetSourceConfig()
+ )
+ for dc_scan_task in dc_scan_plan.scan_tasks:
+ for data_file in dc_scan_task.data_files():
+ st = ScanTask.catalog_scan_task(
+ file=data_file.file_path,
+ file_format=file_format_config,
+ schema=self._schema._schema,
+ storage_config=self.storage_config,
+ pushdowns=pushdowns,
+ )
+ scan_tasks.append(st)
+ return iter(scan_tasks)
+
+ def can_absorb_filter(self) -> bool:
+ return False
+
+ def can_absorb_limit(self) -> bool:
+ return False
+
+ def can_absorb_select(self) -> bool:
+ return True
+
+ def _infer_schema(self) -> Schema:
+
+ if not (
+ self.table and self.table.table_version and self.table.table_version.schema
+ ):
+ raise RuntimeError(
+ f"Failed to infer schema for DeltaCAT Table "
+ f"{self.table.table.namespace}.{self.table.table.table_name}"
+ )
+
+ return Schema.from_pyarrow_schema(self.table.table_version.schema.arrow)
+
+ def _infer_partition_keys(self) -> list[PartitionField]:
+ if not (
+ self.table
+ and self.table.table_version
+ and self.table.table_version.partition_scheme
+ and self.table.table_version.schema
+ ):
+ raise RuntimeError(
+ f"Failed to infer partition keys for DeltaCAT Table "
+ f"{self.table.table.namespace}.{self.table.table.table_name}"
+ )
+
+ schema = self.table.table_version.schema
+ partition_keys = self.table.table_version.partition_scheme.keys
+ if not partition_keys:
+ return []
+
+ partition_fields = []
+ for key in partition_keys:
+ field = DaftPartitionKeyMapper.unmap(key, schema)
+ # Assert that the returned value is not None.
+ assert field is not None, f"Unmapping failed for key {key}"
+ partition_fields.append(field)
+
+ return partition_fields
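The scan operator is wired into Daft the same way the new `read_table` implementation above does it; a standalone sketch, assuming an existing `TableDefinition` bound to `dc_table`, looks roughly like this.

```python
from daft import DataFrame, context
from daft.daft import ScanOperatorHandle, StorageConfig
from daft.logical.builder import LogicalPlanBuilder

# Hedged sketch: turn a DeltaCAT TableDefinition (dc_table, assumed to exist)
# into a lazy Daft DataFrame via the scan operator above.
io_config = context.get_context().daft_planning_config.default_io_config
storage_config = StorageConfig(True, io_config)
scan_op = DeltaCatScanOperator(dc_table, storage_config)
handle = ScanOperatorHandle.from_python_scan_operator(scan_op)
df = DataFrame(LogicalPlanBuilder.from_tabular_scan(scan_operator=handle))
```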
deltacat/daft/model.py ADDED
@@ -0,0 +1,258 @@
+ from typing import Optional
+
+ import pyarrow as pa
+ from pyarrow import Field as PaField
+ from daft import Schema as DaftSchema, DataType
+ from daft.daft import (
+ PartitionField as DaftPartitionField,
+ PartitionTransform as DaftTransform,
+ )
+ from daft.logical.schema import Field as DaftField
+ from daft.io.scan import make_partition_field
+
+ from deltacat.storage.model.schema import Schema
+ from deltacat.storage.model.interop import ModelMapper
+ from deltacat.storage.model.partition import PartitionKey
+ from deltacat.storage.model.transform import (
+ BucketingStrategy,
+ Transform,
+ BucketTransform,
+ HourTransform,
+ DayTransform,
+ MonthTransform,
+ YearTransform,
+ IdentityTransform,
+ TruncateTransform,
+ )
+
+
+ class DaftFieldMapper(ModelMapper[DaftField, PaField]):
+ @staticmethod
+ def map(
+ obj: Optional[DaftField],
+ **kwargs,
+ ) -> Optional[PaField]:
+ """Convert Daft Field to PyArrow Field.
+
+ Args:
+ obj: The Daft Field to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted PyArrow Field object
+ """
+ if obj is None:
+ return None
+
+ return pa.field(
+ name=obj.name,
+ type=obj.dtype.to_arrow_dtype(),
+ )
+
+ @staticmethod
+ def unmap(
+ obj: Optional[PaField],
+ **kwargs,
+ ) -> Optional[DaftField]:
+ """Convert PyArrow Field to Daft Field.
+
+ Args:
+ obj: The PyArrow Field to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted Daft Field object
+ """
+ if obj is None:
+ return None
+
+ return DaftField.create(
+ name=obj.name,
+ dtype=DataType.from_arrow_type(obj.type),  # type: ignore
+ )
+
+
+ class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
+ @staticmethod
+ def map(
+ obj: Optional[DaftTransform],
+ **kwargs,
+ ) -> Optional[Transform]:
+ """Convert DaftTransform to DeltaCAT Transform.
+
+ Args:
+ obj: The DaftTransform to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted Transform object
+ """
+
+ # daft.PartitionTransform doesn't have a Python interface for accessing its attributes,
+ # thus conversion is not possible.
+ # TODO: request Daft to expose Python friendly interface for daft.PartitionTransform
+ raise NotImplementedError(
+ "Converting transform from Daft to DeltaCAT is not supported"
+ )
+
+ @staticmethod
+ def unmap(
+ obj: Optional[Transform],
+ **kwargs,
+ ) -> Optional[DaftTransform]:
+ """Convert DeltaCAT Transform to DaftTransform.
+
+ Args:
+ obj: The Transform to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted DaftTransform object
+ """
+ if obj is None:
+ return None
+
+ # Map DeltaCAT transforms to Daft transforms using isinstance
+
+ if isinstance(obj, IdentityTransform):
+ return DaftTransform.identity()
+ elif isinstance(obj, HourTransform):
+ return DaftTransform.hour()
+ elif isinstance(obj, DayTransform):
+ return DaftTransform.day()
+ elif isinstance(obj, MonthTransform):
+ return DaftTransform.month()
+ elif isinstance(obj, YearTransform):
+ return DaftTransform.year()
+ elif isinstance(obj, BucketTransform):
+ if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
+ return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
+ else:
+ raise ValueError(
+ f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
+ )
+ elif isinstance(obj, TruncateTransform):
+ return DaftTransform.iceberg_truncate(obj.parameters.width)
+
+ raise ValueError(f"Unsupported Transform: {obj}")
+
+
+ class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
+ @staticmethod
+ def map(
+ obj: Optional[DaftPartitionField],
+ schema: Optional[DaftSchema] = None,
+ **kwargs,
+ ) -> Optional[PartitionKey]:
+ """Convert DaftPartitionField to PartitionKey.
+
+ Args:
+ obj: The DaftPartitionField to convert
+ schema: The Daft schema containing field information
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted PartitionKey object
+ """
+ # Daft PartitionField only exposes 1 attribute `field` which is not enough
+ # to convert to DeltaCAT PartitionKey
+ # TODO: request Daft to expose more Python friendly interface for PartitionField
+ raise NotImplementedError(
+ f"Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
+ )
+
+ @staticmethod
+ def unmap(
+ obj: Optional[PartitionKey],
+ schema: Optional[Schema] = None,
+ **kwargs,
+ ) -> Optional[DaftPartitionField]:
+ """Convert PartitionKey to DaftPartitionField.
+
+ Args:
+ obj: The DeltaCAT PartitionKey to convert
+ schema: The Schema containing field information
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted DaftPartitionField object
+ """
+ if obj is None:
+ return None
+ if obj.name is None:
+ raise ValueError("Name is required for PartitionKey conversion")
+ if not schema:
+ raise ValueError("Schema is required for PartitionKey conversion")
+ if len(obj.key) < 1:
+ raise ValueError(
+ f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
+ )
+
+ # Get the source field from schema - FieldLocator in PartitionKey.key points to the source field of partition field
+ dc_source_field = schema.field(obj.key[0]).arrow
+ daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
+ # Convert transform if present
+ daft_transform = DaftTransformMapper.unmap(obj.transform)
+ daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
+ partition_field_name=obj.name,
+ daft_source_field=daft_source_field,
+ dc_transform=obj.transform,
+ )
+
+ # Create DaftPartitionField
+ return make_partition_field(
+ field=daft_partition_field,
+ source_field=daft_source_field,
+ transform=daft_transform,
+ )
+
+ @staticmethod
+ def get_daft_partition_field(
+ partition_field_name: str,
+ daft_source_field: Optional[DaftField],
+ # TODO: replace DeltaCAT transform with Daft Transform for uniformity
+ # We cannot use Daft Transform here because Daft Transform doesn't have a Python interface for us to
+ # access its attributes.
+ # TODO: request Daft to provide a more python friendly interface for Daft Transform
+ dc_transform: Optional[Transform],
+ ) -> DaftField:
+ """Generate Daft Partition Field given partition field name, source field and transform.
+ Partition field type is inferred using source field type and transform.
+
+ Args:
+ partition_field_name (str): the specified result field name
+ daft_source_field (DaftField): the source field of the partition field
+ daft_transform (DaftTransform): transform applied on the source field to create partition field
+
+ Returns:
+ DaftField: Daft Field representing the partition field
+ """
+ if daft_source_field is None:
+ raise ValueError("Source field is required for PartitionField conversion")
+ if dc_transform is None:
+ raise ValueError("Transform is required for PartitionField conversion")
+
+ result_type = None
+ # Below type conversion logic references Daft - Iceberg conversion logic:
+ # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
+ if isinstance(dc_transform, IdentityTransform):
+ result_type = daft_source_field.dtype
+ elif isinstance(dc_transform, YearTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, MonthTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, DayTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, HourTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, BucketTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, TruncateTransform):
+ result_type = daft_source_field.dtype
+ else:
+ raise ValueError(f"Unsupported transform: {dc_transform}")
+
+ return DaftField.create(
+ name=partition_field_name,
+ dtype=result_type,
+ )
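A small illustration of the simplest of the three mappers: this round-trips a field between PyArrow and Daft with `DaftFieldMapper` (a minimal sketch; the field name and type are arbitrary).

```python
import pyarrow as pa

# Hedged sketch: round-trip a field between PyArrow and Daft.
pa_field = pa.field("bid", pa.float64())
daft_field = DaftFieldMapper.unmap(pa_field)      # PyArrow -> Daft
pa_field_again = DaftFieldMapper.map(daft_field)  # Daft -> PyArrow
assert pa_field_again.name == "bid"
```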
deltacat/examples/iceberg/iceberg_bucket_writer.py CHANGED
@@ -1,11 +1,15 @@
 import os
 import logging
 
+ import uuid
 import daft
+ from pyiceberg.catalog import CatalogType
+
 import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
+ from deltacat.catalog.iceberg import IcebergCatalogConfig
 from deltacat.examples.common.fixtures import (
 store_cli_args_in_os_environ,
 )
@@ -30,6 +34,24 @@ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
 
 
 def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+ """
+ This is an e2e example that
+ 1. creates a DeltaCAT Table (backed by an Iceberg Table) in Glue
+ 2. writes data into the DeltaCAT Table
+ 3. reads data from the DeltaCAT Table using Daft
+
+ To run the script:
+ 1. prepare an AWS Account
+ 1. prepare a S3 location where the data will be written to, which will be used in Step 3.
+ 2. prepare an IAM Role that has access to the S3 location and Glue
+ 2. retrieve the IAM Role AWS Credential and cache locally in ~/.aws/credentials
+ 3. run below command to execute the example
+ ```
+ make venv && source venv/bin/activate
+ python -m deltacat.examples.iceberg.iceberg_bucket_writer --warehouse=s3://<YOUR_S3_LOCATION>
+ ```
+
+ """
 # create any runtime environment required to run the example
 runtime_env = create_ray_runtime_environment()
 
@@ -38,6 +60,7 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
 # Only the `iceberg` data catalog is provided so it will become the default.
 # If initializing multiple catalogs, use the `default_catalog_name` param
 # to specify which catalog should be the default.
+
 dc.init(
 catalogs={
 # the name of the DeltaCAT catalog is "iceberg"
@@ -49,11 +72,13 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
 name="example-iceberg-catalog",
 # for additional properties see:
 # https://py.iceberg.apache.org/configuration/
- properties={
- "type": "glue",
- "region_name": "us-east-1",
- "warehouse": warehouse,
- },
+ config=IcebergCatalogConfig(
+ type=CatalogType.GLUE,
+ properties={
+ "warehouse": warehouse,
+ "region_name": "us-east-1",
+ },
+ ),
 )
 },
 # pass the runtime environment into ray.init()
@@ -89,10 +114,10 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
 }
 )
 
- # write to a table named `test_namespace.test_table_bucketed`
+ # write to a table named `test_namespace.test_table_bucketed-<SUFFIX>`
 # we don't need to specify which catalog to create this table in since
 # only the "iceberg" catalog is available
- table_name = "test_table_bucketed"
+ table_name = f"test_table_bucketed-{uuid.uuid4().hex[:8]}"
 namespace = "test_namespace"
 print(f"Creating Glue Table: {namespace}.{table_name}")
 dc.write_to_table(
@@ -106,9 +131,40 @@ def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
 )
 
 print(f"Getting Glue Table: {namespace}.{table_name}")
- table_definition = dc.get_table(table_name, namespace)
+ table_definition = dc.get_table(name=table_name, namespace=namespace)
 print(f"Retrieved Glue Table: {table_definition}")
 
+ # Read Data from DeltaCAT Table (backed by Iceberg) using Daft
+ daft_dataframe = dc.read_table(table=table_name, namespace=namespace)
+
+ daft_dataframe.where(df["bid"] > 200.0).show()
+ # Expected result:
+ # ╭────────┬─────────┬─────────╮
+ # │ symbol ┆ bid ┆ ask │
+ # │ --- ┆ --- ┆ --- │
+ # │ Utf8 ┆ Float64 ┆ Float64 │
+ # ╞════════╪═════════╪═════════╡
+ # │ meta ┆ 392.03 ┆ 392.09 │
+ # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # │ msft ┆ 403.25 ┆ 403.27 │
+ # ╰────────┴─────────┴─────────╯
+
+ daft_dataframe.select("symbol").show()
+ # Expected result:
+ # ╭────────╮
+ # │ symbol │
+ # │ --- │
+ # │ Utf8 │
+ # ╞════════╡
+ # │ meta │
+ # ├╌╌╌╌╌╌╌╌┤
+ # │ amzn │
+ # ├╌╌╌╌╌╌╌╌┤
+ # │ goog │
+ # ├╌╌╌╌╌╌╌╌┤
+ # │ msft │
+ # ╰────────╯
+
 
 if __name__ == "__main__":
 example_script_args = [
@@ -121,15 +177,6 @@ if __name__ == "__main__":
 "type": str,
 },
 ),
- (
- [
- "--STAGE",
- ],
- {
- "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
- "type": str,
- },
- ),
 ]
 
 # store any CLI args in the runtime environment
deltacat/experimental/__init__.py ADDED (empty file; no content to display)
deltacat/experimental/daft/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """Daft integration package for DeltaCAT.
+
+ This package provides integration between DeltaCAT and Daft.
+ """
deltacat/experimental/daft/daft_catalog.py ADDED
@@ -0,0 +1,229 @@
+ from __future__ import annotations
+
+ from typing import Tuple, Optional
+
+ from deltacat.catalog.model.catalog import Catalog as DCCatalog
+ from deltacat.catalog.model.table_definition import TableDefinition
+
+ from daft.catalog import Catalog, Identifier, Table
+ from daft.dataframe import DataFrame
+ from daft.logical.schema import Schema
+ from deltacat.constants import DEFAULT_NAMESPACE
+
+
+ class DaftCatalog(Catalog):
+ """
+ Wrapper class to create a Daft catalog from a DeltaCAT catalog.
+
+ The initialization of DeltaCAT and Daft catalogs is managed in `deltacat.catalog.catalog.py`. The user
+ is just expected to initialize catalogs through the DeltaCAT public interface (init / put_catalog).
+
+ TODO (mccember) in follow up PR we need to consider how to keep the DeltaCAT Catalogs class and Daft session in sync,
+ and the user-facing entrypoint to get a Daft catalog
+
+ This class itself expects a `Catalog` and will invoke the underlying implementation
+ similar to `deltacat.catalog.delegate.py`, like:
+ catalog.impl.create_namespace(namespace, inner=catalog.inner)
+
+ We cannot route calls through the higher level catalog registry / delegate since this wrapper class is at a lower
+ layer and does not manage registering catalogs.
+ """
+
+ def __init__(self, catalog: DCCatalog, name: str):
+ """
+ Initialize given DeltaCAT catalog. This catalog is also registered with DeltaCAT (via deltacat.put_catalog) given the provided Name
+
+ :param catalog: DeltaCAT Catalog object. If None, the catalog will be fetched from `deltacat.Catalogs`
+ given the catalog name.
+
+ :param name: Name of DeltaCAT catalog. If the name is not yet registered with `deltacat.Catalogs`,
+ it will be registered upon creation to ensure that the DeltaCAT and Daft catalogs keep in sync.
+
+ :param kwargs: Additional keyword arguments passed to deltacat.get_catalog or deltacat.put_catalog,
+ such as 'namespace' for tests.
+ """
+ self.dc_catalog = catalog
+ self._name = name
+
+ @property
+ def name(self) -> str:
+ return self._name
+
+ ###
+ # create_*
+ ###
+ def create_namespace(self, identifier: Identifier | str):
+ """Create a new namespace in the catalog."""
+ if isinstance(identifier, Identifier):
+ identifier = str(identifier)
+ self.dc_catalog.impl.create_namespace(identifier, inner=self.dc_catalog.inner)
+
+ def create_table(
+ self, identifier: Identifier | str, source: Schema | DataFrame, **kwargs
+ ) -> Table:
+ """
+ Create a DeltaCAT table via Daft catalog API
+
+ End users calling create_table through the daft table API may provide kwargs which will be plumbed through
+ to deltacat create_table. See deltacat create_table for the full list of accepted keyword arguments.
+
+ Note: as of 4/22, Daft create_table does not yet support kwargs. Tracked at: https://github.com/Eventual-Inc/Daft/issues/4195
+
+ :param identifier: Daft table identifier. Sequence of strings of the format (namespace) or (namespace, table)
+ or (namespace, table, table version). If this is a string, it is a dot delimited string of the same format.
+ Identifiers can be created either like Identifier("namespace", "table", "version") OR
+ Identifier.from_str("namespace.table.version")
+
+ :param source: a TableSource, either a Daft DataFrame, Daft Schema, or str (filesystem path)
+ """
+ if isinstance(source, DataFrame):
+ return self._create_table_from_df(identifier, source)
+ elif isinstance(source, Schema):
+ return self._create_table_from_schema(identifier, source)
+ else:
+ raise Exception(
+ f"Expected table source to be Schema or DataFrame. Found: {type(source)}"
+ )
+
+ def _create_table_from_df(
+ self, ident: Identifier | str, source: DataFrame, **kwargs
+ ) -> Table:
+ """
+ Create a table from a DataFrame.
+ """
+ t = self._create_table_from_schema(ident, source.schema(), **kwargs)
+ # TODO (mccember) append data upon creation
+ return t
+
+ def _create_table_from_schema(
+ self, ident: Identifier | str, source: Schema, **kwargs
+ ) -> Table:
+ """
+ Create a table from a schema.
+ """
+ namespace, name, version = self._extract_namespace_name_version(ident)
+
+ # Convert the Daft schema to a DeltaCAT schema
+ # This is a simplified version, would need to be enhanced for production
+ deltacat_schema = self._convert_schema_to_deltacat(source)
+
+ # Create the table in DeltaCAT
+ table_def = self.dc_catalog.impl.create_table(
+ name,
+ namespace=namespace,
+ version=version,
+ schema=deltacat_schema,
+ inner=self.dc_catalog.inner,
+ **kwargs,
+ )
+
+ return DaftTable._from_obj(table_def)
+
+ ###
+ # drop_*
+ ###
+
+ def drop_namespace(self, identifier: Identifier | str):
+ raise NotImplementedError()
+
+ def drop_table(self, identifier: Identifier | str):
+ raise NotImplementedError()
+
+ ###
+ # get_*
+ ###
+
+ def get_table(self, identifier: Identifier | str, **kwargs) -> Table:
+ namespace, table, version = self._extract_namespace_name_version(identifier)
+
+ table_def = self.dc_catalog.impl.get_table(
+ table,
+ namespace=namespace,
+ table_version=version,
+ inner=self.dc_catalog.inner,
+ **kwargs,
+ )
+
+ if not table_def:
+ raise ValueError(f"Table {identifier} not found")
+
+ return DaftTable._from_obj(table_def)
+
+ ###
+ # list_*
+ ###
+
+ def list_namespaces(self, pattern: str | None = None) -> list[Identifier]:
+ raise NotImplementedError("Not implemented")
+
+ def list_tables(self, pattern: str | None = None) -> list[str]:
+ raise NotImplementedError("Not implemented")
+
+ def _extract_namespace_name_version(
+ self, ident: Identifier | str
+ ) -> Tuple[str, str, Optional[str]]:
+ """
+ Extract namespace, name, version from identifier
+
+ Returns a 3-tuple. If no namespace is provided, uses DeltaCAT default namespace
+ """
+ default_namespace = DEFAULT_NAMESPACE
+
+ if isinstance(ident, str):
+ ident = Identifier.from_str(ident)
+
+ if isinstance(ident, Identifier):
+ if len(ident) == 1:
+ return (default_namespace, ident[0], None)
+ elif len(ident) == 2:
+ return (ident[0], ident[1], None)
+ elif len(ident) == 3:
+ return (ident[0], ident[1], ident[2])
+ else:
+ raise ValueError(
+ f"Expected table identifier to be in format (table) or (namespace, table)"
+ f"or (namespace, table, version). Found: {ident}"
+ )
+
+ def _convert_schema_to_deltacat(self, schema: Schema):
+ """Convert Daft schema to DeltaCAT schema.
+ For now, just use PyArrow schema as intermediary
+ TODO look into how enhancements on schema can be propagated between Daft<=>DeltaCAT
+ """
+ from deltacat.storage.model.schema import Schema as DeltaCATSchema
+
+ return DeltaCATSchema.of(schema=schema.to_pyarrow_schema())
+
+
+ class DaftTable(Table):
+ """
+ Wrapper class to create a Daft table from a DeltaCAT table
+ """
+
+ _inner: TableDefinition
+
+ _read_options = set()
+ _write_options = set()
+
+ def __init__(self, inner: TableDefinition):
+ self._inner = inner
+
+ @property
+ def name(self) -> str:
+ """Return the table name."""
+ return self._inner.table_version.table_name
+
+ @staticmethod
+ def _from_obj(obj: object) -> DaftTable:
+ """Returns a DeltaCATTable if the given object can be adapted so."""
+ if isinstance(obj, TableDefinition):
+ t = DaftTable.__new__(DaftTable)
+ t._inner = obj
+ return t
+ raise ValueError(f"Unsupported DeltaCAT table type: {type(obj)}")
+
+ def read(self, **options) -> DataFrame:
+ raise NotImplementedError("Not implemented")
+
+ def write(self, df: DataFrame | object, mode: str = "append", **options):
+ raise NotImplementedError("Not implemented")
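A rough usage sketch for the wrapper: the catalog, namespace, and table names below are hypothetical, and it assumes the DeltaCAT catalog was already registered through the DeltaCAT public interface (init / put_catalog).

```python
import deltacat as dc

# Hedged sketch: wrap an already-registered DeltaCAT catalog for use from Daft.
# "my_catalog", "my_namespace", and "my_table" are hypothetical placeholders.
dc_catalog = dc.get_catalog("my_catalog")
daft_catalog = DaftCatalog(dc_catalog, "my_catalog")
table = daft_catalog.get_table("my_namespace.my_table")
print(table.name)
```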
deltacat/storage/model/manifest.py CHANGED
@@ -11,6 +11,8 @@ from deltacat import logs
 
 from deltacat.storage.model.schema import FieldLocator
 
+ import json
+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
@@ -192,6 +194,20 @@ class Manifest(dict):
 manifest = Manifest._build_manifest(meta, entries, author, uuid)
 return manifest
 
+ @staticmethod
+ def from_json(json_string: str) -> Manifest:
+ parsed_dict = json.loads(json_string)
+ return Manifest.of(
+ entries=ManifestEntryList.of(
+ [
+ ManifestEntry.from_dict(entry)
+ for entry in parsed_dict.get("entries", [])
+ ]
+ ),
+ author=ManifestAuthor.from_dict(parsed_dict.get("author")),
+ uuid=parsed_dict.get("id"),
+ )
+
 @staticmethod
 def merge_manifests(
 manifests: List[Manifest], author: Optional[ManifestAuthor] = None
@@ -264,6 +280,23 @@ class ManifestMeta(dict):
 manifest_meta["entry_params"] = entry_params
 return manifest_meta
 
+ @staticmethod
+ def from_dict(obj: dict) -> Optional[ManifestMeta]:
+ if obj is None:
+ return None
+
+ return ManifestMeta.of(
+ record_count=obj.get("record_count"),
+ content_length=obj.get("content_length"),
+ content_type=obj.get("content_type"),
+ content_encoding=obj.get("content_encoding"),
+ source_content_length=obj.get("source_content_length"),
+ credentials=obj.get("credentials"),
+ content_type_parameters=obj.get("content_type_parameters"),
+ entry_type=obj.get("entry_type"),
+ entry_params=obj.get("entry_params"),
+ )
+
 @property
 def record_count(self) -> Optional[int]:
 return self.get("record_count")
@@ -358,6 +391,16 @@ class ManifestEntry(dict):
 manifest_entry = ManifestEntry.of(url, manifest_entry_meta)
 return manifest_entry
 
+ @staticmethod
+ def from_dict(obj: dict) -> ManifestEntry:
+ return ManifestEntry.of(
+ url=obj.get("url"),
+ uri=obj.get("uri"),
+ meta=ManifestMeta.from_dict(obj.get("meta")),
+ mandatory=obj.get("mandatory", True),
+ uuid=obj.get("id"),
+ )
+
 @property
 def uri(self) -> Optional[str]:
 return self.get("uri")
@@ -392,6 +435,12 @@ class ManifestAuthor(dict):
 manifest_author["version"] = version
 return manifest_author
 
+ @staticmethod
+ def from_dict(obj: dict) -> Optional[ManifestAuthor]:
+ if obj is None:
+ return None
+ return ManifestAuthor.of(obj.get("name"), obj.get("version"))
+
 @property
 def name(self) -> Optional[str]:
 return self.get("name")
deltacat/tests/storage/model/test_manifest.py ADDED
@@ -0,0 +1,129 @@
+ import json
+
+ import pytest
+
+ from deltacat.storage.model.manifest import Manifest, ManifestEntry
+
+
+ @pytest.fixture
+ def manifest_a():
+ return """
+ {
+ "entries":[
+ {
+ "uri":"s3://test_bucket/file1.tsv.gz",
+ "mandatory":true,
+ "meta":{
+ "record_count":0,
+ "content_length":123,
+ "source_content_length":0,
+ "content_type":"application/x-amzn-unescaped-tsv",
+ "content_encoding":"gzip"
+ }
+ },
+ {
+ "uri":"s3://test_bucket/file2.tsv.gz",
+ "mandatory":true,
+ "meta":{
+ "record_count":0,
+ "content_length":456,
+ "source_content_length":0,
+ "content_type":"application/x-amzn-unescaped-tsv",
+ "content_encoding":"gzip"
+ }
+ }
+ ],
+ "meta":{
+ "record_count":0,
+ "content_length":579,
+ "source_content_length":0,
+ "content_type":"application/x-amzn-unescaped-tsv",
+ "content_encoding":"gzip"
+ },
+ "id":"052f62c0-5082-4935-9937-18a705156123",
+ "author":{
+ "name":"Dave",
+ "version":"1.0"
+ }
+ }
+ """
+
+
+ @pytest.fixture
+ def manifest_no_author():
+ return """
+ {
+ "entries":[
+ {
+ "uri":"s3://test_bucket/file1.tsv.gz",
+ "mandatory":true,
+ "meta":{
+ "record_count":0,
+ "content_length":123,
+ "source_content_length":0,
+ "content_type":"application/x-amzn-unescaped-tsv",
+ "content_encoding":"gzip"
+ }
+ },
+ {
+ "uri":"s3://test_bucket/file2.tsv.gz",
+ "mandatory":true,
+ "meta":{
+ "record_count":0,
+ "content_length":456,
+ "source_content_length":0,
+ "content_type":"application/x-amzn-unescaped-tsv",
+ "content_encoding":"gzip"
+ }
+ }
+ ],
+ "meta":{
+ "record_count":0,
+ "content_length":579,
+ "source_content_length":0,
+ "content_type":"application/x-amzn-unescaped-tsv",
+ "content_encoding":"gzip"
+ },
+ "id":"052f62c0-5082-4935-9937-18a705156123"
+ }
+ """
+
+
+ @pytest.fixture()
+ def manifest_entry_no_meta():
+ return """
+ {
+ "uri":"s3://test_bucket/file1.tsv.gz",
+ "mandatory":true
+ }
+ """
+
+
+ def test_manifest_from_json(manifest_a):
+ manifest = Manifest.from_json(manifest_a)
+
+ assert manifest.entries is not None
+ assert len(manifest.entries) == 2
+ assert manifest.entries[0].uri == "s3://test_bucket/file1.tsv.gz"
+ assert manifest.entries[0].meta.record_count == 0
+ assert manifest.meta.content_length == 579
+ assert manifest.author.name == "Dave"
+
+
+ def test_manifest_from_json_no_author(manifest_no_author):
+ manifest = Manifest.from_json(manifest_no_author)
+
+ assert manifest.entries is not None
+ assert len(manifest.entries) == 2
+ assert manifest.entries[0].uri == "s3://test_bucket/file1.tsv.gz"
+ assert manifest.entries[0].meta is not None
+ assert manifest.author is None
+
+
+ def test_manifest_entry_from_dict_no_meta(manifest_entry_no_meta):
+ entry = ManifestEntry.from_dict(json.loads(manifest_entry_no_meta))
+
+ assert entry is not None
+ assert entry.meta is None
+ assert entry.uri == "s3://test_bucket/file1.tsv.gz"
+ assert entry.mandatory is True
deltacat/utils/daft.py CHANGED
@@ -2,7 +2,7 @@ import logging
 from typing import Optional, List, Any, Dict, Callable
 import daft
 import ray
- from daft.table import read_parquet_into_pyarrow
+ from daft.recordbatch import read_parquet_into_pyarrow
 from daft import TimeUnit, DataFrame
 from daft.io import IOConfig, S3Config
 import pyarrow as pa
@@ -10,7 +10,6 @@ import pyarrow as pa
 from deltacat import logs
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.schema import coerce_pyarrow_table_to_schema
-
 from deltacat.types.media import ContentType, ContentEncoding
 from deltacat.aws.constants import (
 BOTO_MAX_RETRIES,
@@ -72,9 +71,7 @@ def s3_files_to_dataframe(
 f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
 )
 
- df, latency = timed_invocation(
- daft.read_parquet, path=uris, io_config=io_config, use_native_downloader=True
- )
+ df, latency = timed_invocation(daft.read_parquet, path=uris, io_config=io_config)
 
 logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
 
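The `use_native_downloader=True` flag is dropped here, presumably because newer Daft releases use the native downloader by default. The call that `s3_files_to_dataframe` now makes boils down to something like the sketch below; the bucket and key are hypothetical.

```python
import daft
from daft.io import IOConfig, S3Config

# Hedged sketch: read Parquet from S3 with an explicit IO config, as
# s3_files_to_dataframe now does after this change.
io_config = IOConfig(s3=S3Config(anonymous=False))
df = daft.read_parquet(
    path=["s3://my-bucket/data/part-00000.parquet"],
    io_config=io_config,
)
```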
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
- Version: 2.0.0b3
+ Version: 2.0.0b7
 Summary: A portable, scalable, fast, and Pythonic Data Lakehouse for AI.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -17,11 +17,11 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aws-embedded-metrics==3.2.0
 Requires-Dist: boto3~=1.34
- Requires-Dist: getdaft==0.3.6
+ Requires-Dist: getdaft>=0.4.11
 Requires-Dist: intervaltree==3.1.0
 Requires-Dist: numpy==1.21.5
 Requires-Dist: pandas==1.3.5
- Requires-Dist: pyarrow==17.0.0
+ Requires-Dist: pyarrow==16.0.0
 Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3
 Requires-Dist: pymemcache==4.0.0
 Requires-Dist: ray>=2.20.0
@@ -1,4 +1,4 @@
- deltacat/__init__.py,sha256=GCLov4iY1E1wvwH6d8j0edbjmuyEHRWEvGEJ2Zs6UHo,2474
+ deltacat/__init__.py,sha256=EXieJNtfMEoIDqJ3wlqkpjUyMXs5y6hHmK0bA-AU_yg,2474
 deltacat/annotations.py,sha256=9lBi34DpIV_RPjCCK2Aiz_6nMyd-e-_CfQ1XtdRQQlM,1196
 deltacat/api.py,sha256=fYKurVlM97VKb_fh7kJ1rDcl-VAAuSflxPeqrsUt1u8,5257
 deltacat/constants.py,sha256=_JfHTRktDTM70Nls-LMnSmLeCRG17UwQYCmY6gQSGBg,3482
@@ -24,10 +24,10 @@ deltacat/catalog/delegate.py,sha256=x3jj_T61gyExuAnbDqhU6smbaAbIN4UxrVMZuBEOg0A,
 deltacat/catalog/interface.py,sha256=YB-qNBFsWupqyWJuHr7eQ-_MshhZZ5HpLphoZ64yn2g,12244
 deltacat/catalog/iceberg/__init__.py,sha256=LOENcLTQQlu_694MvRhMd2TQDLzwfg2vz0D8DuVO3M8,190
 deltacat/catalog/iceberg/iceberg_catalog_config.py,sha256=LfHxv8pk-YmTRQy5LvKFzwSqZ8ek2Y6v0KY7xihhIN0,786
- deltacat/catalog/iceberg/impl.py,sha256=hFAX0QGfWq25t9miYHACye_t_3fxUAmQXpQ9kf3w_xQ,13591
- deltacat/catalog/iceberg/overrides.py,sha256=HGev1Us2zJpavAoClCCMHrf6sQ8fG0poSxyLEJOB-Ss,2668
+ deltacat/catalog/iceberg/impl.py,sha256=c_ONnLLyh8Vyqo5PusQSHySQ92iM4Qgk-rucHMfdd7s,14288
+ deltacat/catalog/iceberg/overrides.py,sha256=WmM2mxf7ihDl8anb5GzBxo5-sxBkot8ZSRTxDpaauRA,2687
 deltacat/catalog/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deltacat/catalog/main/impl.py,sha256=y7sya4BVfYMbp0-smgs_00cktw7QHkJxXTWADSr0W3s,23093
+ deltacat/catalog/main/impl.py,sha256=E9gCPaARJAaiIS2HTdXXz0-GwTjOaWIBX2TK2MsL194,23092
 deltacat/catalog/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/catalog/model/catalog.py,sha256=zGuNl1Czqbk2QQx9qGpMWCTK9ay4b3tm3SJzNkLlw-I,10198
 deltacat/catalog/model/properties.py,sha256=wdXjd39-JEj-zZLL5pH6wyIXAdpph-CD7yEIF96Wn-A,4110
@@ -130,14 +130,20 @@ deltacat/compute/stats/models/delta_stats.py,sha256=hBith8_hbF9TVr6HocLAt6RJ_kZZ
 deltacat/compute/stats/models/delta_stats_cache_result.py,sha256=mbJYxpZd5jaER_BWrCD2hROFy3p1nNdBrj66nUpc6io,1624
 deltacat/compute/stats/models/manifest_entry_stats.py,sha256=NCDAe2nPDEI4kOkuwNkRFgGPS-rqQaQqLuaLoKk20KQ,2419
 deltacat/compute/stats/models/stats_result.py,sha256=XQAlmzhUqRmg4jzEMUAOqcYn1HUOBTMryBH1CCVlet8,3820
+ deltacat/daft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/daft/daft_scan.py,sha256=u0RpSZTujF9ScuFkXBLkEXfG2eMkoww5ypG2Eri0HrU,3778
+ deltacat/daft/model.py,sha256=6NaKkp9R0ruE0K2x-moyARNrQisswUl6TjMeA6YHtBM,9078
 deltacat/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/examples/basic_logging.py,sha256=IwUa-MwQbmH5vuzRvmz5NtfYXU2qNUID_0zkO5HlUZo,2826
 deltacat/examples/hello_world.py,sha256=hXpMUvJINB2qWTpV3QFPlRNu0uE31BvEs2sLyQ3CWZk,530
 deltacat/examples/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/examples/common/fixtures.py,sha256=MS0Hz1c__f9Axm3JgTajfWuMVeDAQmFmZ7KB7vz_1q4,430
 deltacat/examples/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deltacat/examples/iceberg/iceberg_bucket_writer.py,sha256=9i78x8WBgp-vvMBsvbCWkcRo6oEZ8SDtGfjMlNXAO30,4521
+ deltacat/examples/iceberg/iceberg_bucket_writer.py,sha256=PdJG3jXcgPVds4UanfyNWB1egv-Os7LnZCPhdgv9Yyk,6586
 deltacat/examples/iceberg/iceberg_reader.py,sha256=mlF-277vT04at-2jibAjgRJG6Y-zle_NNy1-pXwS2YQ,5023
+ deltacat/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deltacat/experimental/daft/__init__.py,sha256=0d1SsgjbDher8TKgS0gSBBdy5TGi01fewiwpG0BMwck,108
+ deltacat/experimental/daft/daft_catalog.py,sha256=112wDqqzdtxmtZVwiZW59MektbRsFMjSRgqYHrUOuok,8396
 deltacat/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/io/file_object_store.py,sha256=YoNL3Qla8uLOHaWnyBmIgotjSGAy3Td3Tumah0kk73Y,1868
 deltacat/io/memcached_object_store.py,sha256=C96t77-4BQe0XZ4vC76Ygi2o1POUoMN4t4BiyPmulz0,10997
@@ -158,7 +164,7 @@ deltacat/storage/model/delta.py,sha256=PhkjME0dItGgPd37SrQbI8VjQcIaYW2OfIq0KJKgD
 deltacat/storage/model/interop.py,sha256=CzXdu1NuJF5ER3IjQJztkNECD6MRDwbmMezlfN4SRH0,536
 deltacat/storage/model/list_result.py,sha256=5DpRAu-c0M48cHtKdTRPSgQiq2nCWfjAY8LOVqp5wxI,2703
 deltacat/storage/model/locator.py,sha256=Q16y-eGSQSZpDPKDYQhOjSA9c5ajwg1jLw_13MIB4SM,4707
- deltacat/storage/model/manifest.py,sha256=iV53LLQY83pDv9YwUqlyzjfLiqFHWuJf9J0dZdR7yO4,15153
+ deltacat/storage/model/manifest.py,sha256=3I4Vohd-PnEQ5NdQu9yN3jvFchqnzb8hQ3bq6w_tO4E,16808
 deltacat/storage/model/metafile.py,sha256=UVWPvvYvA0tj_pM8ig7NKfVFrVWU4l3eDP7I2n9Upeg,53404
 deltacat/storage/model/namespace.py,sha256=gLli1V64O9RHIf-FesmqWA29Wi7P1kwt01uz5sDdJR0,2409
 deltacat/storage/model/partition.py,sha256=qNCvc74o_4pmFVL-FCyKCZMH4lHSjRO560sb3vaF_H0,20759
@@ -277,6 +283,7 @@ deltacat/tests/storage/main/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 deltacat/tests/storage/main/test_main_storage.py,sha256=9dtsAcp9GZ4XQ5-8XhKnAcFF7upowJpTIuqZUB2EYig,58124
 deltacat/tests/storage/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/storage/model/test_delete_parameters.py,sha256=RcNRMIed0zUzkX9tRXDoYPXHb7721OEt8viY9tpWXZM,822
+ deltacat/tests/storage/model/test_manifest.py,sha256=udp9YUNvIBpnT-NutjMaF25abEQOXEcPkQm8Aay_UCs,3733
 deltacat/tests/storage/model/test_metafile_io.py,sha256=116U9aNJPzR0JS6iadJyyx0_4KyAi3D47WCNbndag6o,101639
 deltacat/tests/storage/model/test_schema.py,sha256=5m4BscbxbbOiry-lDI8j4vQcnvkG2Y-f0ZfshncPiSI,9599
 deltacat/tests/storage/model/test_shard.py,sha256=6QBr-ws3zQkJjjGyB7QEOhtNC5ql0cdjOPB2wxGNW3Q,755
@@ -323,7 +330,7 @@ deltacat/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/utils/arguments.py,sha256=5y1Xz4HSAD8M8Jt83i6gOEKoYjy_fMQe1V43IhIE4hY,1191
 deltacat/utils/cloudpickle.py,sha256=XE7YDmQe56ksfl3NdYZkzOAhbHSuhNcBZGOehQpgZr0,1187
 deltacat/utils/common.py,sha256=RG_-enXNpLKaYrqyx1ne2lL10lxN9vK7F631oJP6SE8,1375
- deltacat/utils/daft.py,sha256=nd4XBKcZTFYxf_VH9jm-wqqbrIujKAeisCt2vVbW2BA,5807
+ deltacat/utils/daft.py,sha256=RsOGzxI6ltsRcH6SfbK6PDBEaKyLZaUisCBXBlUvjbI,5770
 deltacat/utils/export.py,sha256=As5aiwOw9vLxtfolPLU0yak6W2RVR0rkuaYQ5YCy49U,1952
 deltacat/utils/filesystem.py,sha256=DthBgrVGzIcsQcGnyD3QYEQIpkYFxB19XmpF9DfCaeo,11709
 deltacat/utils/metafile_locator.py,sha256=_3yEW9n49jiEBuXHZmUKsFdYx6RxWWuS-Mu2gs_a1bw,2933
@@ -342,8 +349,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
- deltacat-2.0.0b3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deltacat-2.0.0b3.dist-info/METADATA,sha256=mRoST3kb94Civ8ipex9LlT7_BQ1Sz2vMbukcv10AT6g,2808
- deltacat-2.0.0b3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- deltacat-2.0.0b3.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
- deltacat-2.0.0b3.dist-info/RECORD,,
+ deltacat-2.0.0b7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-2.0.0b7.dist-info/METADATA,sha256=qLbeNiyQZXKd0ZfQ0AZF9GqIXH8A64YGDUJulu0Rb2k,2809
+ deltacat-2.0.0b7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ deltacat-2.0.0b7.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-2.0.0b7.dist-info/RECORD,,