deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
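
Most of the churn above is a package reorganization: the Iceberg and Rivulet modules (and their tests) moved under a new `deltacat.experimental` namespace, and the old `deltacat.daft` package was removed in favor of integration code in `deltacat/utils/daft.py`. A minimal sketch of what the move means for imports, using module paths taken from the rename list above (exported symbols are not shown and may differ):

```python
# 2.0.0b9 module locations (removed in 2.0.0b11):
#   import deltacat.storage.rivulet.dataset
#   import deltacat.catalog.iceberg.impl

# 2.0.0b11 locations under the new experimental namespace:
import deltacat.experimental.storage.rivulet.dataset
import deltacat.experimental.catalog.iceberg.impl
```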
deltacat/utils/daft.py
CHANGED
```diff
@@ -1,13 +1,42 @@
 import logging
-from typing import Optional, List, Any, Dict, Callable
+from typing import Optional, List, Any, Dict, Callable, Iterator
+
+from daft.daft import (
+    StorageConfig,
+    PartitionField,
+    Pushdowns as DaftRustPushdowns,
+    ScanTask,
+    FileFormatConfig,
+    ParquetSourceConfig,
+    PartitionTransform as DaftTransform,
+    PartitionField as DaftPartitionField,
+)
+from daft.expressions import Expression as DaftExpression
+from daft.expressions.visitor import PredicateVisitor
+from pyarrow import Field as PaField
+
 import daft
 import ray
+from daft import (
+    TimeUnit,
+    DataFrame,
+    Schema as DaftSchema,
+    DataType,
+)
+from daft.logical.schema import Field as DaftField
 from daft.recordbatch import read_parquet_into_pyarrow
-from daft import …
-
+from daft.io import (
+    IOConfig,
+    S3Config,
+)
+from daft.io.scan import (
+    ScanOperator,
+    make_partition_field,
+)
 import pyarrow as pa
 
 from deltacat import logs
+from deltacat.catalog.model.table_definition import TableDefinition
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.schema import coerce_pyarrow_table_to_schema
 from deltacat.types.media import ContentType, ContentEncoding
@@ -22,11 +51,180 @@ from deltacat.utils.performance import timed_invocation
 from deltacat.types.partial_download import (
     PartialFileDownloadParams,
 )
-
+from deltacat.storage import (
+    Transform,
+    IdentityTransform,
+    HourTransform,
+    DayTransform,
+    MonthTransform,
+    YearTransform,
+    BucketTransform,
+    BucketingStrategy,
+    TruncateTransform,
+    PartitionKey,
+    Schema,
+)
+from deltacat.storage.model.interop import ModelMapper
+from deltacat.storage.model.expression import (
+    Expression,
+    Reference,
+    Literal,
+    Equal,
+    NotEqual,
+    GreaterThan,
+    LessThan,
+    GreaterThanEqual,
+    LessThanEqual,
+    And,
+    Or,
+    Not,
+    IsNull,
+)
+from deltacat.storage.model.scan.push_down import (
+    PartitionFilter,
+    Pushdown as DeltaCatPushdown,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+def translate_pushdown(pushdown: DaftRustPushdowns) -> DeltaCatPushdown:
+    """
+    Helper method to translate a Daft Pushdowns object into a Deltacat Pushdown.
+    Args:
+        pushdown: Daft Pushdowns object
+    Returns:
+        Pushdown: Deltacat Pushdown object with translated filters
+    """
+    translator = DaftToDeltacatVisitor()
+
+    partition_filters = None
+    if pushdown.partition_filters is not None:
+        daft_expr = DaftExpression._from_pyexpr(pushdown.partition_filters)
+        partition_filters = PartitionFilter.of(translator.visit(daft_expr))
+
+    filters = None
+    if pushdown.filters is not None:
+        daft_expr = DaftExpression._from_pyexpr(pushdown.filters)
+        # TODO: support deltacat row filters
+        # filters = RowFilter.of(translator.visit(daft_expr))
+
+    columns = None
+    limit = None
+
+    return DeltaCatPushdown.of(
+        partition_filter=partition_filters,
+        column_filter=columns,
+        row_filter=filters,
+        limit=limit,
+    )
+
+
+class DaftToDeltacatVisitor(PredicateVisitor[Expression]):
+    """PredicateVisitor implementation to translate Daft Expressions into Deltacat Expressions"""
+
+    def visit_col(self, name: str) -> Expression:
+        return Reference.of(name)
+
+    def visit_lit(self, value: Any) -> Expression:
+        return Literal.of(value)
+
+    def visit_cast(self, expr: DaftExpression, dtype: DataType) -> Expression:
+        # deltacat expressions do not support explicit casting
+        # pyarrow should handle any type casting
+        return self.visit(expr)
+
+    def visit_alias(self, expr: DaftExpression, alias: str) -> Expression:
+        return self.visit(expr)
+
+    def visit_function(self, name: str, args: List[DaftExpression]) -> Expression:
+        # TODO: Add Deltacat expression function support
+        raise ValueError("Function not supported")
+
+    def visit_and(self, left: DaftExpression, right: DaftExpression) -> Expression:
+        """Visit an 'and' expression."""
+        return And.of(self.visit(left), self.visit(right))
+
+    def visit_or(self, left: DaftExpression, right: DaftExpression) -> Expression:
+        """Visit an 'or' expression."""
+        return Or.of(self.visit(left), self.visit(right))
+
+    def visit_not(self, expr: DaftExpression) -> Expression:
+        """Visit a 'not' expression."""
+        return Not.of(self.visit(expr))
+
+    def visit_equal(self, left: DaftExpression, right: DaftExpression) -> Expression:
+        """Visit an 'equals' comparison predicate."""
+        return Equal.of(self.visit(left), self.visit(right))
+
+    def visit_not_equal(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'not equals' comparison predicate."""
+        return NotEqual.of(self.visit(left), self.visit(right))
+
+    def visit_less_than(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'less than' comparison predicate."""
+        return LessThan.of(self.visit(left), self.visit(right))
+
+    def visit_less_than_or_equal(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'less than or equal' comparison predicate."""
+        return LessThanEqual.of(self.visit(left), self.visit(right))
+
+    def visit_greater_than(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'greater than' comparison predicate."""
+        return GreaterThan.of(self.visit(left), self.visit(right))
+
+    def visit_greater_than_or_equal(
+        self, left: DaftExpression, right: DaftExpression
+    ) -> Expression:
+        """Visit a 'greater than or equal' comparison predicate."""
+        return GreaterThanEqual.of(self.visit(left), self.visit(right))
+
+    def visit_between(
+        self, expr: DaftExpression, lower: DaftExpression, upper: DaftExpression
+    ) -> Expression:
+        """Visit a 'between' predicate."""
+        # Implement BETWEEN as lower <= expr <= upper
+        lower_bound = LessThanEqual.of(self.visit(lower), self.visit(expr))
+        upper_bound = LessThanEqual.of(self.visit(expr), self.visit(upper))
+        return And.of(lower_bound, upper_bound)
+
+    def visit_is_in(
+        self, expr: DaftExpression, items: list[DaftExpression]
+    ) -> Expression:
+        """Visit an 'is_in' predicate."""
+        # For empty list, return false literal
+        if not items:
+            return Literal(pa.scalar(False))
+
+        # Implement IN as a series of equality checks combined with OR
+        visited_expr = self.visit(expr)
+        equals_exprs = [Equal.of(visited_expr, self.visit(item)) for item in items]
+
+        # Combine with OR
+        result = equals_exprs[0]
+        for eq_expr in equals_exprs[1:]:
+            result = Or.of(result, eq_expr)
+
+        return result
+
+    def visit_is_null(self, expr: DaftExpression) -> Expression:
+        """Visit an 'is_null' predicate."""
+        return IsNull.of(self.visit(expr))
+
+    def visit_not_null(self, expr: DaftExpression) -> Expression:
+        """Visit an 'not_null' predicate."""
+        # NOT NULL is implemented as NOT(IS NULL)
+        return Not.of(IsNull.of(self.visit(expr)))
+
+
 def s3_files_to_dataframe(
     uris: List[str],
     content_type: str,
@@ -51,7 +249,7 @@ def s3_files_to_dataframe(
     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
 
     if not ray.is_initialized():
-        ray.init(…
+        ray.init(ignore_reinit_error=True, **ray_init_options)
 
     daft.context.set_runner_ray(noop_if_initialized=True)
 
@@ -167,3 +365,331 @@ def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
             read_timeout_ms=10_000,  # Timeout for first byte from server
         )
     )
+
+
+class DeltaCatScanOperator(ScanOperator):
+    def __init__(self, table: TableDefinition, storage_config: StorageConfig) -> None:
+        super().__init__()
+        self.table = table
+        self._schema = self._infer_schema()
+        self.partition_keys = self._infer_partition_keys()
+        self.storage_config = storage_config
+
+    def schema(self) -> DaftSchema:
+        return self._schema
+
+    def name(self) -> str:
+        return "DeltaCatScanOperator"
+
+    def display_name(self) -> str:
+        return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
+
+    def partitioning_keys(self) -> list[PartitionField]:
+        return self.partition_keys
+
+    def multiline_display(self) -> list[str]:
+        return [
+            self.display_name(),
+            f"Schema = {self._schema}",
+            f"Partitioning keys = {self.partitioning_keys}",
+            f"Storage config = {self.storage_config}",
+        ]
+
+    def to_scan_tasks(self, pushdowns: DaftRustPushdowns) -> Iterator[ScanTask]:
+        dc_pushdown = translate_pushdown(pushdowns)
+        dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
+        scan_tasks = []
+        file_format_config = FileFormatConfig.from_parquet_config(
+            # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+            ParquetSourceConfig()
+        )
+        for dc_scan_task in dc_scan_plan.scan_tasks:
+            for data_file in dc_scan_task.data_files():
+                st = ScanTask.catalog_scan_task(
+                    file=data_file.file_path,
+                    file_format=file_format_config,
+                    schema=self._schema._schema,
+                    storage_config=self.storage_config,
+                    pushdowns=pushdowns,
+                )
+                scan_tasks.append(st)
+        return iter(scan_tasks)
+
+    def can_absorb_filter(self) -> bool:
+        return False
+
+    def can_absorb_limit(self) -> bool:
+        return False
+
+    def can_absorb_select(self) -> bool:
+        return True
+
+    def _infer_schema(self) -> DaftSchema:
+
+        if not (
+            self.table and self.table.table_version and self.table.table_version.schema
+        ):
+            raise RuntimeError(
+                f"Failed to infer schema for DeltaCAT Table "
+                f"{self.table.table.namespace}.{self.table.table.table_name}"
+            )
+
+        return DaftSchema.from_pyarrow_schema(self.table.table_version.schema.arrow)
+
+    def _infer_partition_keys(self) -> list[PartitionField]:
+        if not (
+            self.table
+            and self.table.table_version
+            and self.table.table_version.partition_scheme
+            and self.table.table_version.schema
+        ):
+            raise RuntimeError(
+                f"Failed to infer partition keys for DeltaCAT Table "
+                f"{self.table.table.namespace}.{self.table.table.table_name}"
+            )
+
+        schema = self.table.table_version.schema
+        partition_keys = self.table.table_version.partition_scheme.keys
+        if not partition_keys:
+            return []
+
+        partition_fields = []
+        for key in partition_keys:
+            field = DaftPartitionKeyMapper.unmap(key, schema)
+            # Assert that the returned value is not None.
+            assert field is not None, f"Unmapping failed for key {key}"
+            partition_fields.append(field)
+
+        return partition_fields
+
+
+class DaftFieldMapper(ModelMapper[DaftField, PaField]):
+    @staticmethod
+    def map(
+        obj: Optional[DaftField],
+        **kwargs,
+    ) -> Optional[PaField]:
+        """Convert Daft Field to PyArrow Field.
+
+        Args:
+            obj: The Daft Field to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted PyArrow Field object
+        """
+        if obj is None:
+            return None
+
+        return pa.field(
+            name=obj.name,
+            type=obj.dtype.to_arrow_dtype(),
+        )
+
+    @staticmethod
+    def unmap(
+        obj: Optional[PaField],
+        **kwargs,
+    ) -> Optional[DaftField]:
+        """Convert PyArrow Field to Daft Field.
+
+        Args:
+            obj: The PyArrow Field to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted Daft Field object
+        """
+        if obj is None:
+            return None
+
+        return DaftField.create(
+            name=obj.name,
+            dtype=DataType.from_arrow_type(obj.type),  # type: ignore
+        )
+
+
+class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
+    @staticmethod
+    def map(
+        obj: Optional[DaftTransform],
+        **kwargs,
+    ) -> Optional[Transform]:
+        """Convert DaftTransform to DeltaCAT Transform.
+
+        Args:
+            obj: The DaftTransform to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted Transform object
+        """
+
+        # daft.PartitionTransform doesn't have a Python interface for accessing its attributes,
+        # thus conversion is not possible.
+        # TODO: request Daft to expose Python friendly interface for daft.PartitionTransform
+        raise NotImplementedError(
+            "Converting transform from Daft to DeltaCAT is not supported"
+        )
+
+    @staticmethod
+    def unmap(
+        obj: Optional[Transform],
+        **kwargs,
+    ) -> Optional[DaftTransform]:
+        """Convert DeltaCAT Transform to DaftTransform.
+
+        Args:
+            obj: The Transform to convert
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted DaftTransform object
+        """
+        if obj is None:
+            return None
+
+        # Map DeltaCAT transforms to Daft transforms using isinstance
+
+        if isinstance(obj, IdentityTransform):
+            return DaftTransform.identity()
+        elif isinstance(obj, HourTransform):
+            return DaftTransform.hour()
+        elif isinstance(obj, DayTransform):
+            return DaftTransform.day()
+        elif isinstance(obj, MonthTransform):
+            return DaftTransform.month()
+        elif isinstance(obj, YearTransform):
+            return DaftTransform.year()
+        elif isinstance(obj, BucketTransform):
+            if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
+                return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
+            else:
+                raise ValueError(
+                    f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
+                )
+        elif isinstance(obj, TruncateTransform):
+            return DaftTransform.iceberg_truncate(obj.parameters.width)
+
+        raise ValueError(f"Unsupported Transform: {obj}")
+
+
+class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
+    @staticmethod
+    def map(
+        obj: Optional[DaftPartitionField],
+        schema: Optional[DaftSchema] = None,
+        **kwargs,
+    ) -> Optional[PartitionKey]:
+        """Convert DaftPartitionField to PartitionKey.
+
+        Args:
+            obj: The DaftPartitionField to convert
+            schema: The Daft schema containing field information
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted PartitionKey object
+        """
+        # Daft PartitionField only exposes 1 attribute `field` which is not enough
+        # to convert to DeltaCAT PartitionKey
+        # TODO: request Daft to expose more Python friendly interface for PartitionField
+        raise NotImplementedError(
+            f"Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
+        )
+
+    @staticmethod
+    def unmap(
+        obj: Optional[PartitionKey],
+        schema: Optional[Schema] = None,
+        **kwargs,
+    ) -> Optional[DaftPartitionField]:
+        """Convert PartitionKey to DaftPartitionField.
+
+        Args:
+            obj: The DeltaCAT PartitionKey to convert
+            schema: The Schema containing field information
+            **kwargs: Additional arguments
+
+        Returns:
+            Converted DaftPartitionField object
+        """
+        if obj is None:
+            return None
+        if obj.name is None:
+            raise ValueError("Name is required for PartitionKey conversion")
+        if not schema:
+            raise ValueError("Schema is required for PartitionKey conversion")
+        if len(obj.key) < 1:
+            raise ValueError(
+                f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
+            )
+
+        # Get the source field from schema - FieldLocator in PartitionKey.key points to the source field of partition field
+        dc_source_field = schema.field(obj.key[0]).arrow
+        daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
+        # Convert transform if present
+        daft_transform = DaftTransformMapper.unmap(obj.transform)
+        daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
+            partition_field_name=obj.name,
+            daft_source_field=daft_source_field,
+            dc_transform=obj.transform,
+        )
+
+        # Create DaftPartitionField
+        return make_partition_field(
+            field=daft_partition_field,
+            source_field=daft_source_field,
+            transform=daft_transform,
+        )
+
+    @staticmethod
+    def get_daft_partition_field(
+        partition_field_name: str,
+        daft_source_field: Optional[DaftField],
+        # TODO: replace DeltaCAT transform with Daft Transform for uniformality
+        # We cannot use Daft Transform here because Daft Transform doesn't have a Python interface for us to
+        # access its attributes.
+        # TODO: request Daft to provide a more python friendly interface for Daft Tranform
+        dc_transform: Optional[Transform],
+    ) -> DaftField:
+        """Generate Daft Partition Field given partition field name, source field and transform.
+        Partition field type is inferred using source field type and transform.
+
+        Args:
+            partition_field_name (str): the specified result field name
+            daft_source_field (DaftField): the source field of the partition field
+            daft_transform (DaftTransform): transform applied on the source field to create partition field
+
+        Returns:
+            DaftField: Daft Field representing the partition field
+        """
+        if daft_source_field is None:
+            raise ValueError("Source field is required for PartitionField conversion")
+        if dc_transform is None:
+            raise ValueError("Transform is required for PartitionField conversion")
+
+        result_type = None
+        # Below type conversion logic references Daft - Iceberg conversion logic:
+        # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
+        if isinstance(dc_transform, IdentityTransform):
+            result_type = daft_source_field.dtype
+        elif isinstance(dc_transform, YearTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, MonthTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, DayTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, HourTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, BucketTransform):
+            result_type = DataType.int32()
+        elif isinstance(dc_transform, TruncateTransform):
+            result_type = daft_source_field.dtype
+        else:
+            raise ValueError(f"Unsupported transform: {dc_transform}")
+
+        return DaftField.create(
+            name=partition_field_name,
+            dtype=result_type,
+        )
```
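
The mapper classes added above are plain static converters, so they can be exercised on their own. Here is a minimal sketch of a `DaftFieldMapper` round trip (assuming `deltacat` 2.0.0b11 and a compatible `daft` are installed; the field name and type are arbitrary):

```python
import pyarrow as pa

from deltacat.utils.daft import DaftFieldMapper

# PyArrow -> Daft: unmap() builds a Daft field via DataType.from_arrow_type().
pa_field = pa.field("id", pa.int64())
daft_field = DaftFieldMapper.unmap(obj=pa_field)

# Daft -> PyArrow: map() converts back via dtype.to_arrow_dtype().
round_tripped = DaftFieldMapper.map(obj=daft_field)
assert round_tripped.name == pa_field.name
assert round_tripped.type == pa_field.type
```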
deltacat/utils/export.py
CHANGED
```diff
@@ -5,7 +5,9 @@ import pyarrow.parquet
 import pyarrow.feather
 from typing import Callable, Dict
 
-from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat.experimental.storage.rivulet.reader.query_expression import (
+    QueryExpression,
+)
 from deltacat import logs
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
```
deltacat/utils/filesystem.py
CHANGED
```diff
@@ -2,12 +2,12 @@ from __future__ import annotations
 
 import re
 from typing import Optional, Tuple, Union, List
+from datetime import timedelta
 
 import sys
 import urllib
 import pathlib
 
-import pyarrow
 import pyarrow as pa
 from pyarrow.fs import (
     _resolve_filesystem_and_path,
@@ -17,6 +17,7 @@ from pyarrow.fs import (
     FileSystem,
     FSSpecHandler,
     PyFileSystem,
+    GcsFileSystem,
 )
 
 _LOCAL_SCHEME = "local"
@@ -24,8 +25,8 @@ _LOCAL_SCHEME = "local"
 
 def resolve_paths_and_filesystem(
     paths: Union[str, List[str]],
-    filesystem: …
-) -> Tuple[List[str], …
+    filesystem: FileSystem = None,
+) -> Tuple[List[str], FileSystem]:
     """
     Resolves and normalizes all provided paths, infers a filesystem from the
     paths or validates the provided filesystem against the paths and ensures
@@ -113,19 +114,26 @@ def resolve_paths_and_filesystem(
         else:
             raise
         if filesystem is None:
-            filesystem = resolved_filesystem
+            if isinstance(resolved_filesystem, GcsFileSystem):
+                # Configure a retry time limit for GcsFileSystem so that it
+                # doesn't hang forever trying to get file info (e.g., when
+                # trying to get a public file w/o anonymous=True).
+                filesystem = GcsFileSystem(
+                    retry_time_limit=timedelta(seconds=60),
+                )
+            else:
+                filesystem = resolved_filesystem
         elif need_unwrap_path_protocol:
             resolved_path = _unwrap_protocol(resolved_path)
         resolved_path = filesystem.normalize_path(resolved_path)
         resolved_paths.append(resolved_path)
-
     return resolved_paths, filesystem
 
 
 def resolve_path_and_filesystem(
     path: str,
-    filesystem: Optional[…
-) -> Tuple[str, …
+    filesystem: Optional[FileSystem] = None,
+) -> Tuple[str, FileSystem]:
     """
     Resolves and normalizes the provided path, infers a filesystem from the
     path or validates the provided filesystem against the path.
@@ -148,7 +156,7 @@ def resolve_path_and_filesystem(
 
 def list_directory(
     path: str,
-    filesystem: …
+    filesystem: FileSystem,
     exclude_prefixes: Optional[List[str]] = None,
     ignore_missing_path: bool = False,
     recursive: bool = False,
@@ -199,7 +207,7 @@ def list_directory(
 
 def get_file_info(
     path: str,
-    filesystem: …
+    filesystem: FileSystem,
     ignore_missing_path: bool = False,
 ) -> FileInfo:
     """Get the file info for the provided path."""
@@ -227,6 +235,9 @@ def _handle_read_os_error(
         r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
         r"body\.(.*))$"
     )
+    gcp_error_pattern = (
+        r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
+    )
     if re.match(aws_error_pattern, str(error)):
         # Specially handle AWS error when reading files, to give a clearer error
         # message to avoid confusing users. The real issue is most likely that the AWS
@@ -243,9 +254,28 @@ def _handle_read_os_error(
                 "You can also run AWS CLI command to get more detailed error message "
                 "(e.g., aws s3 ls <file-name>). "
                 "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
                 "for more information."
             )
         )
+    elif re.match(gcp_error_pattern, str(error)):
+        # Special handling for GCP errors (e.g., handling the special case of
+        # requiring the filesystem to be instantiated with anonymous access to
+        # read public files).
+        if isinstance(paths, str):
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read GCP GS file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "If this is a public file, please instantiate a filesystem with "
+                "anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
+                "to read it. See https://google.aip.dev/auth/4110 and "
+                "https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html"  # noqa
+                "for more information."
+            )
+        )
+
     else:
         raise error
 
```
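
The new GCS error message steers readers toward anonymous access for public objects. A minimal sketch of that recommendation (the bucket and object path are hypothetical placeholders, not values from the diff):

```python
from datetime import timedelta

from pyarrow import fs

# Anonymous access skips credential resolution for public objects, and
# retry_time_limit bounds how long GcsFileSystem waits for file info
# (the same 60-second limit deltacat now applies when it infers a GCS filesystem).
gcs = fs.GcsFileSystem(anonymous=True, retry_time_limit=timedelta(seconds=60))
info = gcs.get_file_info("some-public-bucket/path/to/object.parquet")  # hypothetical
print(info.type, info.size)
```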