deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,248 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Dict, Generic, TypeVar, Callable, Optional
|
3
|
+
from functools import singledispatchmethod
|
4
|
+
import re
|
5
|
+
|
6
|
+
from deltacat.storage.model.expression import (
|
7
|
+
Expression,
|
8
|
+
Reference,
|
9
|
+
Literal,
|
10
|
+
BinaryExpression,
|
11
|
+
UnaryExpression,
|
12
|
+
In,
|
13
|
+
Between,
|
14
|
+
Like,
|
15
|
+
)
|
16
|
+
|
17
|
+
|
18
|
+
C = TypeVar("C") # Context type
|
19
|
+
R = TypeVar("R") # Return type
|
20
|
+
|
21
|
+
|
22
|
+
class ExpressionVisitor(ABC, Generic[C, R]):
    """
    Visitor pattern for deltacat expressions.

    ``visit`` dispatches on the runtime type of the expression. Reference and
    Literal nodes go to the abstract ``visit_reference`` / ``visit_literal``
    methods. Every other node type is resolved in this order:

    1. A specialized ``visit_<snake_case_class_name>(expr, context)`` method
       defined on the visitor (e.g. ``visit_greater_than`` for
       ``GreaterThan``). The method receives the unvisited node and is
       responsible for visiting the children itself.
    2. An entry in ``_PROCEDURES`` keyed by the expression class name. The
       children are visited first and their results are handed to the
       procedure through the matching ``_apply_*`` hook.
    3. For binary and unary nodes only, the generic
       ``visit_binary_expression`` / ``visit_unary_expression`` fallback.

    Subclasses need only implement ``visit_reference`` and ``visit_literal``,
    plus either a ``_PROCEDURES`` dictionary or specific ``visit_xyz``
    methods for the expression types they support.
    """

    # Default procedure dictionary for subclasses to override
    _PROCEDURES: Dict[str, Callable] = {}

    # Zero-width match in front of every uppercase letter except the first
    # character. Compiled once here rather than on every dispatch.
    _SNAKE_CASE_BOUNDARY = re.compile(r"(?<!^)(?=[A-Z])")

    def __init__(self):
        """
        Initialize the visitor and install default ``_apply_*`` hooks.

        Raises:
            NotImplementedError: If ``visit_reference`` or ``visit_literal``
                is missing or not callable on this instance.
        """
        # Pre-check for required methods
        if not callable(getattr(self, "visit_reference", None)):
            raise NotImplementedError("Subclasses must implement visit_reference")
        if not callable(getattr(self, "visit_literal", None)):
            raise NotImplementedError("Subclasses must implement visit_literal")
        self._setup_default_procedure_handlers()

    def _to_snake_case(self, name: str) -> str:
        """Convert PascalCase or camelCase to snake_case."""
        return self._SNAKE_CASE_BOUNDARY.sub("_", name).lower()

    def _setup_default_procedure_handlers(self):
        """
        Install default ``_apply_*`` hooks for any the subclass did not define.

        Each default simply calls the procedure with the visited child
        results, in order. A subclass may define its own ``_apply_*`` method
        to change how procedures are invoked.
        """
        defaults = {
            "_apply_binary": lambda proc, left, right: proc(left, right),
            "_apply_unary": lambda proc, operand: proc(operand),
            "_apply_in": lambda proc, value, values: proc(value, values),
            "_apply_between": lambda proc, value, lower, upper: proc(
                value, lower, upper
            ),
            "_apply_like": lambda proc, value, pattern: proc(value, pattern),
        }
        for hook_name, default_hook in defaults.items():
            if not callable(getattr(self, hook_name, None)):
                setattr(self, hook_name, default_hook)

    def _find_specialized_handler(
        self, expr_type: str, fallback_name: str
    ) -> Optional[Callable]:
        """
        Return the ``visit_<snake_case>`` method for ``expr_type``, if any.

        Args:
            expr_type: Class name of the expression being visited.
            fallback_name: Name of the generic fallback method for this
                expression family.

        Returns:
            The specialized method, or None when the visitor does not define
            one. None is also returned when the derived name equals
            ``fallback_name``: the fallback takes the already-visited child
            results as extra positional arguments, so it must not be called
            with the ``(expr, context)`` signature used for specialized
            methods.
        """
        method_name = f"visit_{self._to_snake_case(expr_type)}"
        if method_name == fallback_name:
            return None
        return getattr(self, method_name, None)

    @singledispatchmethod
    def visit(self, expr: Expression, context: Optional[C] = None) -> R:
        """
        Generic visit method that dispatches to specific methods based on expression type.

        Args:
            expr: The expression to visit
            context: Optional context to pass through the visitor

        Returns:
            Result of visiting the expression

        Raises:
            NotImplementedError: If no handler is registered for the type of
                ``expr``.
        """
        expr_type = type(expr).__name__
        raise NotImplementedError(f"No visit method for type {expr_type}")

    @visit.register
    def _visit_reference(self, expr: Reference, context: Optional[C] = None) -> R:
        """Visit a Reference expression."""
        return self.visit_reference(expr, context)

    @visit.register
    def _visit_literal(self, expr: Literal, context: Optional[C] = None) -> R:
        """Visit a Literal expression."""
        return self.visit_literal(expr, context)

    @visit.register
    def _visit_binary(self, expr: BinaryExpression, context: Optional[C] = None) -> R:
        """
        Visit a binary expression using method specialization or procedures.

        Raises:
            NotImplementedError: If the visitor has no specialized method, no
                procedure, and no working fallback for this expression type.
        """
        expr_type = type(expr).__name__

        # Check for a specialized method BEFORE touching the children. The
        # specialized method receives the raw node and visits its own
        # children, so visiting them here as well would traverse every
        # subtree twice per level.
        specialized = self._find_specialized_handler(
            expr_type, "visit_binary_expression"
        )
        if specialized is not None:
            return specialized(expr, context)

        left_result = self.visit(expr.left, context)
        right_result = self.visit(expr.right, context)

        if expr_type in self._PROCEDURES:
            return self._apply_binary(
                self._PROCEDURES[expr_type], left_result, right_result
            )

        try:
            return self.visit_binary_expression(
                expr, left_result, right_result, context
            )
        except NotImplementedError as err:
            # Keep the original error as the cause so it stays visible in
            # the traceback.
            raise NotImplementedError(f"No handler for {expr_type}") from err

    @visit.register
    def _visit_unary(self, expr: UnaryExpression, context: Optional[C] = None) -> R:
        """
        Visit a unary expression using method specialization or procedures.

        Raises:
            NotImplementedError: If the visitor has no specialized method, no
                procedure, and no working fallback for this expression type.
        """
        expr_type = type(expr).__name__

        # As in _visit_binary: dispatch first so the operand is not visited
        # twice when a specialized method handles the node.
        specialized = self._find_specialized_handler(
            expr_type, "visit_unary_expression"
        )
        if specialized is not None:
            return specialized(expr, context)

        operand_result = self.visit(expr.operand, context)

        if expr_type in self._PROCEDURES:
            return self._apply_unary(self._PROCEDURES[expr_type], operand_result)

        try:
            return self.visit_unary_expression(expr, operand_result, context)
        except NotImplementedError as err:
            raise NotImplementedError(f"No handler for {expr_type}") from err

    @visit.register
    def _visit_in(self, expr: In, context: Optional[C] = None) -> R:
        """
        Visit an In expression.

        Raises:
            NotImplementedError: If the visitor defines neither ``visit_in``
                nor an ``"In"`` procedure.
        """
        if hasattr(self, "visit_in"):
            return self.visit_in(expr, context)

        if "In" in self._PROCEDURES:
            value_result = self.visit(expr.value, context)
            values_results = [self.visit(v, context) for v in expr.values]
            return self._apply_in(self._PROCEDURES["In"], value_result, values_results)

        raise NotImplementedError("No handler for In expression")

    @visit.register
    def _visit_between(self, expr: Between, context: Optional[C] = None) -> R:
        """
        Visit a Between expression.

        Raises:
            NotImplementedError: If the visitor defines neither
                ``visit_between`` nor a ``"Between"`` procedure.
        """
        if hasattr(self, "visit_between"):
            return self.visit_between(expr, context)

        if "Between" in self._PROCEDURES:
            value_result = self.visit(expr.value, context)
            lower_result = self.visit(expr.lower, context)
            upper_result = self.visit(expr.upper, context)
            return self._apply_between(
                self._PROCEDURES["Between"], value_result, lower_result, upper_result
            )

        raise NotImplementedError("No handler for Between expression")

    @visit.register
    def _visit_like(self, expr: Like, context: Optional[C] = None) -> R:
        """
        Visit a Like expression.

        Raises:
            NotImplementedError: If the visitor defines neither ``visit_like``
                nor a ``"Like"`` procedure.
        """
        if hasattr(self, "visit_like"):
            return self.visit_like(expr, context)

        if "Like" in self._PROCEDURES:
            value_result = self.visit(expr.value, context)
            pattern_result = self.visit(expr.pattern, context)
            return self._apply_like(
                self._PROCEDURES["Like"], value_result, pattern_result
            )

        raise NotImplementedError("No handler for Like expression")

    @abstractmethod
    def visit_reference(self, expr: Reference, context: Optional[C] = None) -> R:
        """Visit a Reference expression."""
        pass

    @abstractmethod
    def visit_literal(self, expr: Literal, context: Optional[C] = None) -> R:
        """Visit a Literal expression."""
        pass

    def visit_binary_expression(
        self, expr: BinaryExpression, left: R, right: R, context: Optional[C] = None
    ) -> R:
        """
        Default fallback handler for binary expressions.

        Args:
            expr: The binary expression being visited.
            left: Result of visiting the left child.
            right: Result of visiting the right child.
            context: Optional context passed through the visitor.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(f"No handler for {type(expr).__name__}")

    def visit_unary_expression(
        self, expr: UnaryExpression, operand: R, context: Optional[C] = None
    ) -> R:
        """
        Default fallback handler for unary expressions.

        Args:
            expr: The unary expression being visited.
            operand: Result of visiting the operand.
            context: Optional context passed through the visitor.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(f"No handler for {type(expr).__name__}")
|
214
|
+
|
215
|
+
|
216
|
+
class DisplayVisitor(ExpressionVisitor[Expression, str]):
    """
    Visitor that renders an expression tree as a string in infix notation.

    Comparison operators are rendered bare (``a = b``), while AND / OR wrap
    their result in parentheses, so a conjunction of two comparisons comes
    out as ``(a = b AND c > d)`` rather than the prefix form
    ``(AND (= a b) (> c d))``.
    """

    # Formatting procedures keyed by expression class name. Every procedure
    # receives the already-rendered strings of the node's children.
    _PROCEDURES = {
        # Comparisons: rendered without surrounding parentheses
        "Equal": lambda left, right: f"{left} = {right}",
        "NotEqual": lambda left, right: f"{left} <> {right}",
        "GreaterThan": lambda left, right: f"{left} > {right}",
        "LessThan": lambda left, right: f"{left} < {right}",
        "GreaterThanEqual": lambda left, right: f"{left} >= {right}",
        "LessThanEqual": lambda left, right: f"{left} <= {right}",
        # Logical connectives: parenthesized so nesting stays unambiguous
        "And": lambda left, right: f"({left} AND {right})",
        "Or": lambda left, right: f"({left} OR {right})",
        # Unary operations: the operand is always parenthesized
        "Not": lambda operand: f"NOT ({operand})",
        "IsNull": lambda operand: f"({operand}) IS NULL",
        # Multi-operand predicates; `values` is a list of rendered strings
        "In": lambda value, values: f"{value} IN ({', '.join(values)})",
        "Between": lambda value, lower, upper: f"{value} BETWEEN {lower} AND {upper}",
        "Like": lambda value, pattern: f"{value} LIKE {pattern}",
    }

    def visit_reference(self, expr: Reference, context=None) -> str:
        """Render a field reference as its field name."""
        field_name = expr.field
        return field_name

    def visit_literal(self, expr: Literal, context=None) -> str:
        """Render a literal as the ``str()`` form of its value."""
        literal_value = expr.value
        return str(literal_value)
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
3
3
|
|
4
4
|
import copy
|
5
5
|
|
6
|
-
from typing import Optional, Tuple, List
|
6
|
+
from typing import Optional, Tuple, List, Union
|
7
7
|
|
8
8
|
import base64
|
9
9
|
import json
|
@@ -412,7 +412,7 @@ class Metafile(dict):
|
|
412
412
|
@staticmethod
|
413
413
|
def based_on(
|
414
414
|
other: Optional[Metafile],
|
415
|
-
new_id: Optional[
|
415
|
+
new_id: Optional[str] = None,
|
416
416
|
) -> Optional[Metafile]:
|
417
417
|
"""
|
418
418
|
Returns a new metafile equivalent to the input metafile, but with a new
|
@@ -539,29 +539,31 @@ class Metafile(dict):
|
|
539
539
|
f"${serialized_dict}"
|
540
540
|
)
|
541
541
|
|
542
|
+
@staticmethod
def get_type_name(serialized_dict: dict):
    """
    Return the class name of the Metafile type that the given serialized
    dictionary of Metafile data resolves to.

    :param serialized_dict: Serialized dictionary of Metafile data.
    :return: ``__name__`` of the class returned by ``Metafile.get_class``.
    """
    metafile_class = Metafile.get_class(serialized_dict)
    return metafile_class.__name__
|
549
|
+
|
542
550
|
@classmethod
|
543
|
-
def
|
551
|
+
def deserialize(
|
544
552
|
cls,
|
545
|
-
|
546
|
-
|
547
|
-
format: Optional[str] = METAFILE_FORMAT,
|
553
|
+
serialized: Union[bytes, str],
|
554
|
+
meta_format: Optional[str] = METAFILE_FORMAT,
|
548
555
|
) -> Metafile:
|
549
556
|
"""
|
550
|
-
|
551
|
-
:param
|
552
|
-
:param
|
553
|
-
:
|
554
|
-
:return: Deserialized object from the metadata file.
|
557
|
+
Deserialize a metadata file from the given bytes or string.
|
558
|
+
:param serialized: Serialized metadata file data.
|
559
|
+
:param meta_format: Format to use for deserializing the metadata file.
|
560
|
+
:return: Deserialized metadata file.
|
555
561
|
"""
|
556
|
-
if
|
562
|
+
if meta_format not in SUPPORTED_METAFILE_FORMATS:
|
557
563
|
raise ValueError(
|
558
|
-
f"Unsupported format '{
|
564
|
+
f"Unsupported format '{meta_format}'. "
|
565
|
+
f"Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
|
559
566
|
)
|
560
|
-
|
561
|
-
if not filesystem:
|
562
|
-
path, filesystem = resolve_path_and_filesystem(path, filesystem)
|
563
|
-
with filesystem.open_input_stream(path) as file:
|
564
|
-
binary = file.readall()
|
565
567
|
reader = {
|
566
568
|
"json": lambda b: json.loads(
|
567
569
|
b.decode("utf-8"),
|
@@ -573,12 +575,32 @@ class Metafile(dict):
|
|
573
575
|
},
|
574
576
|
),
|
575
577
|
"msgpack": msgpack.loads,
|
576
|
-
}[
|
577
|
-
data = reader(
|
578
|
+
}[meta_format]
|
579
|
+
data = reader(serialized)
|
578
580
|
# cast this Metafile into the appropriate child class type
|
579
581
|
clazz = Metafile.get_class(data)
|
580
|
-
|
581
|
-
|
582
|
+
return clazz(**data)
|
583
|
+
|
584
|
+
@classmethod
def read(
    cls,
    path: str,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
    meta_format: Optional[str] = METAFILE_FORMAT,
) -> Metafile:
    """
    Load a metadata file from storage and return the deserialized object.

    :param path: Metadata file path to read.
    :param filesystem: File system to use for reading the metadata file.
    When not given, the path and filesystem are resolved from the path.
    :param meta_format: Format to use for deserializing the metadata file.
    :return: Deserialized object from the metadata file.
    """
    if not filesystem:
        # No filesystem supplied: resolve one (and the normalized path)
        # from the path itself.
        path, filesystem = resolve_path_and_filesystem(path, filesystem)
    with filesystem.open_input_stream(path) as stream:
        raw_bytes = stream.readall()
    deserialized = Metafile.deserialize(raw_bytes, meta_format)
    return deserialized.from_serializable(path, filesystem)
|
582
604
|
|
583
605
|
def write_txn(
|
584
606
|
self,
|
@@ -616,11 +638,37 @@ class Metafile(dict):
|
|
616
638
|
filesystem=filesystem,
|
617
639
|
)
|
618
640
|
|
641
|
+
def serialize(
    self,
    meta_format: Optional[str] = METAFILE_FORMAT,
) -> Union[bytes, str]:
    """
    Convert this object to its serialized form in the given metafile format.

    For "json" the serializable form of this object is dumped with a
    4-space indent and encoded as UTF-8, with any ``bytes`` values emitted
    as base64 text. For "msgpack" it is packed with ``msgpack.dumps``.

    :param meta_format: Format to use for serializing the metadata file.
    :return: Serialized metadata file bytes or string (format dependent).
    :raises ValueError: If the format is not a supported metafile format.
    """
    if meta_format not in SUPPORTED_METAFILE_FORMATS:
        raise ValueError(
            f"Unsupported format '{meta_format}'. "
            f"Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
        )

    def _encode_unsupported(value):
        # json.dumps calls this hook for values it cannot encode natively.
        # NOTE(review): non-bytes values are handed back unchanged, so
        # json.dumps will still fail on them - confirm this is intended.
        if isinstance(value, bytes):
            return base64.b64encode(value).decode("utf-8")
        return value

    def _dump_json(payload):
        text = json.dumps(payload, indent=4, default=_encode_unsupported)
        return text.encode("utf-8")

    encoders = {
        "json": _dump_json,
        "msgpack": msgpack.dumps,
    }
    encode = encoders[meta_format]
    return encode(self.to_serializable())
|
666
|
+
|
619
667
|
def write(
|
620
668
|
self,
|
621
669
|
path: str,
|
622
670
|
filesystem: Optional[pyarrow.fs.FileSystem] = None,
|
623
|
-
|
671
|
+
meta_format: Optional[str] = METAFILE_FORMAT,
|
624
672
|
) -> None:
|
625
673
|
"""
|
626
674
|
Serialize and write this object to a metadata file.
|
@@ -628,31 +676,15 @@ class Metafile(dict):
|
|
628
676
|
:param filesystem: File system to use for writing the metadata file. If
|
629
677
|
not given, a default filesystem will be automatically selected based on
|
630
678
|
the catalog root path.
|
631
|
-
param
|
679
|
+
:param meta_format: Format to use for serializing the metadata file.
|
632
680
|
"""
|
633
|
-
|
634
|
-
raise ValueError(
|
635
|
-
f"Unsupported format '{format}'. Supported formats include: {SUPPORTED_METAFILE_FORMATS}."
|
636
|
-
)
|
637
|
-
|
681
|
+
serialized = self.serialize(meta_format)
|
638
682
|
if not filesystem:
|
639
683
|
path, filesystem = resolve_path_and_filesystem(path, filesystem)
|
640
684
|
revision_dir_path = posixpath.dirname(path)
|
641
685
|
filesystem.create_dir(revision_dir_path, recursive=True)
|
642
|
-
|
643
|
-
writer = {
|
644
|
-
"json": lambda data: json.dumps(
|
645
|
-
data,
|
646
|
-
indent=4,
|
647
|
-
default=lambda b: base64.b64encode(b).decode("utf-8")
|
648
|
-
if isinstance(b, bytes)
|
649
|
-
else b,
|
650
|
-
).encode("utf-8"),
|
651
|
-
"msgpack": msgpack.dumps,
|
652
|
-
}[format]
|
653
|
-
|
654
686
|
with filesystem.open_output_stream(path) as file:
|
655
|
-
file.write(
|
687
|
+
file.write(serialized)
|
656
688
|
|
657
689
|
def equivalent_to(self, other: Metafile) -> bool:
|
658
690
|
"""
|
@@ -1,3 +1,10 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from dataclasses import dataclass
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from deltacat.storage.model.expression import Expression
|
6
|
+
|
7
|
+
|
1
8
|
class RowFilter:
    """Row filter type referenced by ``Pushdown``; no members are defined here."""
|
3
10
|
|
@@ -6,14 +13,34 @@ class ColumnFilter:
|
|
6
13
|
...
|
7
14
|
|
8
15
|
|
16
|
+
@dataclass
class PartitionFilter:
    """Partition-level filter that wraps a single expression."""

    # Expression this filter carries.
    expr: Expression

    @staticmethod
    def of(expr: Expression) -> PartitionFilter:
        """Build a PartitionFilter wrapping the given expression."""
        partition_filter = PartitionFilter(expr=expr)
        return partition_filter
|
11
23
|
|
12
24
|
|
25
|
+
@dataclass
class Pushdown:
    """
    Represents pushdown predicates to be applied for DeltaCAT Tables.

    Every component is optional and defaults to None, so callers only need
    to name the ones they actually use.
    """

    # Row filter, or None when not set.
    row_filter: Optional[RowFilter] = None
    # Column filter, or None when not set.
    column_filter: Optional[ColumnFilter] = None
    # Partition filter, or None when not set.
    partition_filter: Optional[PartitionFilter] = None
    # Limit value, or None when not set.
    limit: Optional[int] = None

    @staticmethod
    def of(
        row_filter: Optional[RowFilter] = None,
        column_filter: Optional[ColumnFilter] = None,
        partition_filter: Optional[PartitionFilter] = None,
        limit: Optional[int] = None,
    ) -> Pushdown:
        """
        Build a Pushdown from its optional components.

        Args:
            row_filter: Row filter, or None.
            column_filter: Column filter, or None.
            partition_filter: Partition filter, or None.
            limit: Limit value, or None.

        Returns:
            A new Pushdown holding the given components.
        """
        return Pushdown(
            row_filter=row_filter,
            column_filter=column_filter,
            partition_filter=partition_filter,
            limit=limit,
        )
|
deltacat/storage/model/types.py
CHANGED
@@ -6,7 +6,7 @@ from typing import List, Union
|
|
6
6
|
import numpy as np
|
7
7
|
import pandas as pd
|
8
8
|
import pyarrow as pa
|
9
|
-
from ray.data.dataset import Dataset
|
9
|
+
from ray.data.dataset import Dataset as RayDataset
|
10
10
|
from daft import DataFrame as DaftDataFrame
|
11
11
|
|
12
12
|
|
@@ -16,13 +16,15 @@ LocalTable = Union[
|
|
16
16
|
np.ndarray,
|
17
17
|
pa.parquet.ParquetFile,
|
18
18
|
]
|
19
|
-
LocalDataset = List[LocalTable]
|
20
|
-
DistributedDataset = Union[
|
19
|
+
LocalDataset = Union[LocalTable, List[LocalTable]]
|
20
|
+
DistributedDataset = Union[RayDataset, DaftDataFrame]
|
21
|
+
Dataset = Union[LocalDataset, DistributedDataset]
|
21
22
|
|
22
23
|
|
23
24
|
class StreamFormat(str, Enum):
|
24
25
|
DELTACAT = "deltacat"
|
25
26
|
ICEBERG = "iceberg"
|
27
|
+
HIVE = "hive"
|
26
28
|
HUDI = "hudi"
|
27
29
|
DELTA_LAKE = "delta_lake"
|
28
30
|
SQLITE3 = "SQLITE3" # used by tests
|
@@ -1,7 +1,7 @@
|
|
1
|
-
from .schema.schema import Schema
|
2
|
-
from .schema.schema import Field
|
3
|
-
from .dataset import Dataset
|
4
|
-
from .schema.schema import Datatype
|
1
|
+
from deltacat.storage.rivulet.schema.schema import Schema
|
2
|
+
from deltacat.storage.rivulet.schema.schema import Field
|
3
|
+
from deltacat.storage.rivulet.dataset import Dataset
|
4
|
+
from deltacat.storage.rivulet.schema.schema import Datatype
|
5
5
|
|
6
6
|
__all__ = [
|
7
7
|
"Schema",
|
File without changes
|
File without changes
|