deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +47 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/merge.py
ADDED
@@ -0,0 +1,126 @@
+import logging
+
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    LocalMergeFileGroupsProvider,
+)
+from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+import pyarrow as pa
+from deltacat import logs
+from typing import List, Optional
+
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.compute.compactor import (
+    RoundCompletionInfo,
+    DeltaAnnotated,
+)
+
+from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+
+from deltacat.utils.performance import timed_invocation
+from deltacat.storage import (
+    Partition,
+)
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def materialize(
+    input: MergeInput,
+    task_index: int,
+    compacted_tables: List[pa.Table],
+) -> MaterializeResult:
+    compacted_table = pa.concat_tables(compacted_tables)
+    if input.compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+        # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+        # TODO (pdames): compare performance to pandas-native materialize path
+        df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
+        compacted_table = df
+    delta, stage_delta_time = timed_invocation(
+        input.deltacat_storage.stage_delta,
+        compacted_table,
+        input.write_to_partition,
+        max_records_per_entry=input.max_records_per_output_file,
+        content_type=input.compacted_file_content_type,
+        s3_table_writer_kwargs=input.s3_table_writer_kwargs,
+        **input.deltacat_storage_kwargs,
+    )
+    compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+        compacted_table
+    )
+    logger.debug(
+        f"Time taken for materialize task"
+        f" to upload {len(compacted_table)} records"
+        f" of size {compacted_table_size} is: {stage_delta_time}s"
+    )
+    manifest = delta.manifest
+    manifest_records = manifest.meta.record_count
+    assert manifest_records == len(compacted_table), (
+        f"Unexpected Error: Materialized delta manifest record count "
+        f"({manifest_records}) does not equal compacted table record count "
+        f"({len(compacted_table)})"
+    )
+    materialize_result = MaterializeResult.of(
+        delta=delta,
+        task_index=task_index,
+        # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+        # and in-memory-table-bytes instead of tight coupling to paBytes
+        pyarrow_write_result=PyArrowWriteResult.of(
+            len(manifest.entries),
+            TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+            manifest.meta.content_length,
+            len(compacted_table),
+        ),
+    )
+    logger.info(f"Materialize result: {materialize_result}")
+    return materialize_result
+
+
+def generate_local_merge_input(
+    params: CompactPartitionParams,
+    annotated_deltas: List[DeltaAnnotated],
+    compacted_partition: Partition,
+    round_completion_info: Optional[RoundCompletionInfo],
+):
+    """
+    Generates a merge input for local deltas that do not reside in the Ray object store and
+    have not been subject to the hash bucketing process.
+
+    Args:
+        params: parameters for compacting a partition
+        annotated_deltas: a list of annotated deltas
+        compacted_partition: the compacted partition to write to
+        round_completion_info: keeps track of high watermarks and other metadata from previous compaction rounds
+
+    Returns:
+        A MergeInput object
+
+    """
+
+    return MergeInput.of(
+        merge_file_groups_provider=LocalMergeFileGroupsProvider(
+            annotated_deltas,
+            read_kwargs_provider=params.read_kwargs_provider,
+            deltacat_storage=params.deltacat_storage,
+            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        ),
+        write_to_partition=compacted_partition,
+        compacted_file_content_type=params.compacted_file_content_type,
+        primary_keys=params.primary_keys,
+        sort_keys=params.sort_keys,
+        drop_duplicates=params.drop_duplicates,
+        max_records_per_output_file=params.records_per_compacted_file,
+        enable_profiler=params.enable_profiler,
+        metrics_config=params.metrics_config,
+        s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+        read_kwargs_provider=params.read_kwargs_provider,
+        round_completion_info=round_completion_info,
+        object_store=params.object_store,
+        deltacat_storage=params.deltacat_storage,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+    )
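
The new materialize() above times its stage_delta call with timed_invocation and logs the latency in seconds. As a reading aid, here is a minimal standalone equivalent of that pattern (a sketch only, not deltacat's actual implementation in deltacat.utils.performance; the function name is hypothetical):

import time
from typing import Any, Callable, Tuple


def timed_invocation_sketch(func: Callable[..., Any], *args, **kwargs) -> Tuple[Any, float]:
    # Invoke the callable and return (result, elapsed_seconds), mirroring how
    # materialize() unpacks `delta, stage_delta_time = timed_invocation(...)`.
    start = time.monotonic()
    result = func(*args, **kwargs)
    return result, time.monotonic() - start


# Example usage with an arbitrary callable:
total, latency_s = timed_invocation_sketch(sum, [1, 2, 3])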
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED
@@ -1,5 +1,7 @@
 import botocore
+import logging
 from typing import Dict, Optional, List, Tuple
+from deltacat import logs
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.storage import (
@@ -17,6 +19,8 @@ from deltacat.compute.compactor_v2.constants import (
     PARQUET_TO_PYARROW_INFLATION,
 )

+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+

 def _get_parquet_type_params_if_exist(
     entry: ManifestEntry,
@@ -133,11 +137,15 @@ def hash_bucket_resource_options_provider(
     ray_custom_resources: Optional[Dict] = None,
     **kwargs,
 ) -> Dict:
+    debug_memory_params = {"hash_bucket_task_index": index}
     size_bytes = 0.0
     num_rows = 0
     total_pk_size = 0

     if not item.manifest or not item.manifest.entries:
+        logger.debug(
+            f"[Hash bucket task {index}]: No manifest entries, skipping memory allocation calculation"
+        )
         return {"CPU": 0.01}

     for entry in item.manifest.entries:
@@ -162,12 +170,30 @@
         else:
             total_pk_size += pk_size

-    # total size + pk size + pk hash column + hash bucket index column
+    # total size + pk size + pyarrow-to-numpy conversion + pk hash column + hashlib inefficiency + hash bucket index column
     # Refer to hash_bucket step for more details.
-    total_memory =
+    total_memory = (
+        size_bytes
+        + total_pk_size
+        + total_pk_size
+        + num_rows * 20
+        + num_rows * 20
+        + num_rows * 4
+    )
+    debug_memory_params["size_bytes"] = size_bytes
+    debug_memory_params["num_rows"] = num_rows
+    debug_memory_params["total_pk_size"] = total_pk_size
+    debug_memory_params["total_memory"] = total_memory
+
+    debug_memory_params["previous_inflation"] = previous_inflation
+    debug_memory_params["average_record_size_bytes"] = average_record_size_bytes

     # Consider buffer
     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+    debug_memory_params["total_memory_with_buffer"] = total_memory
+    logger.debug(
+        f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
+    )

     return get_task_options(0.01, total_memory, ray_custom_resources)

@@ -186,10 +212,13 @@ def merge_resource_options_provider(
     deltacat_storage_kwargs: Optional[Dict] = {},
     **kwargs,
 ) -> Dict:
+    debug_memory_params = {"merge_task_index": index}
     hb_group_idx = item[0]

     data_size = hash_group_size_bytes.get(hb_group_idx, 0)
     num_rows = hash_group_num_rows.get(hb_group_idx, 0)
+    debug_memory_params["data_size_from_hash_group"] = data_size
+    debug_memory_params["num_rows_from_hash_group"] = num_rows

     # upper bound for pk size of incremental
     pk_size_bytes = data_size
@@ -205,10 +234,13 @@
             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
             / round_completion_info.compacted_pyarrow_write_result.file_bytes
         )
+        debug_memory_params["previous_inflation"] = previous_inflation
+
         average_record_size = (
             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
             / round_completion_info.compacted_pyarrow_write_result.records
         )
+        debug_memory_params["average_record_size"] = average_record_size

         iterable = hash_group_index_to_hash_bucket_indices(
             hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
@@ -247,16 +279,27 @@
         else:
             pk_size_bytes += pk_size

-    # total data downloaded + primary key hash column +
-    # + dict size for merge + incremental index array size
+    # total data downloaded + primary key hash column + pyarrow-to-numpy conversion
+    # + primary key column + hashlib inefficiency + dict size for merge + incremental index array size
     total_memory = (
         data_size
         + pk_size_bytes
+        + pk_size_bytes
+        + num_rows * 20
         + num_rows * 20
         + num_rows * 20
         + incremental_index_array_size
     )
+    debug_memory_params["data_size"] = data_size
+    debug_memory_params["num_rows"] = num_rows
+    debug_memory_params["pk_size_bytes"] = pk_size_bytes
+    debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
+    debug_memory_params["total_memory"] = total_memory

     total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+    debug_memory_params["total_memory_with_buffer"] = total_memory
+    logger.debug(
+        f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
+    )

     return get_task_options(0.01, total_memory, ray_custom_resources)
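
To make the revised estimators concrete, the arithmetic can be checked standalone. The sketch below plugs illustrative sizes into the hash-bucket and merge formulas introduced above; the sample inputs and the 30% buffer value are assumptions for the example only (the real buffer comes from TOTAL_MEMORY_BUFFER_PERCENTAGE in deltacat/compute/compactor_v2/constants.py).

# Standalone sanity check of the memory formulas above (illustrative inputs only).
TOTAL_MEMORY_BUFFER_PERCENTAGE = 30  # assumed for this example

size_bytes = 512 * 1024**2    # 512 MiB of manifest entry data (hypothetical)
num_rows = 1_000_000
total_pk_size = 8 * num_rows  # e.g. a single int64 primary key column

# Hash bucket: size + pk size + pyarrow-to-numpy copy of pk + 20 B/row pk hash
# column + 20 B/row hashlib overhead + 4 B/row hash bucket index column.
hash_bucket_memory = size_bytes + 2 * total_pk_size + 2 * num_rows * 20 + num_rows * 4
hash_bucket_memory *= 1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0
print(f"hash bucket task memory ~ {hash_bucket_memory / 1024**2:.0f} MiB")

data_size = 768 * 1024**2     # data downloaded for one hash group (hypothetical)
pk_size_bytes = 8 * num_rows
incremental_index_array_size = 4 * num_rows

# Merge: data + pk hash column/pk column copies + three 20 B/row terms
# (hash column, hashlib overhead, merge dict) + incremental index array.
merge_memory = data_size + 2 * pk_size_bytes + 3 * num_rows * 20 + incremental_index_array_size
merge_memory *= 1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0
print(f"merge task memory ~ {merge_memory / 1024**2:.0f} MiB")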
deltacat/compute/merge_on_read/daft.py
ADDED
@@ -0,0 +1,40 @@
+import logging
+from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
+from deltacat.storage.model.types import DistributedDataset
+from deltacat.types.media import TableType, DistributedDatasetType
+from deltacat.compute.merge_on_read.utils.delta import create_df_from_all_deltas
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def merge(params: MergeOnReadParams, **kwargs) -> DistributedDataset:
+    """
+    Merges the given deltas and returns the result as distributed dataframe.
+    It reads the deltas into the Daft dataframe and leverages operations supported
+    by Daft to perform an efficient merge using Ray cluster.
+
+    TODO(raghumdani): Perform actual merge.
+    """
+
+    delta_dfs = create_df_from_all_deltas(
+        deltas=params.deltas,
+        table_type=TableType.PYARROW,
+        distributed_dataset_type=DistributedDatasetType.DAFT,
+        reader_kwargs=params.reader_kwargs,
+        deltacat_storage=params.deltacat_storage,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        **kwargs,
+    )
+
+    logger.info(f"Merging {len(delta_dfs)} delta dfs...")
+
+    # TODO: This code should be optimized from daft side
+    result = None
+    for df in delta_dfs:
+        if result is None:
+            result = df
+        else:
+            result = result.concat(df)
+
+    return result
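
The incremental concat loop above is plain Daft API usage, so its behavior can be previewed without deltacat. The sketch below builds two small in-memory Daft DataFrames as stand-ins for the per-delta dataframes and unions them the same way; it assumes only that the daft package is installed.

import daft

# Stand-ins for the per-delta dataframes returned by create_df_from_all_deltas().
delta_dfs = [
    daft.from_pydict({"pk": [1, 2], "value": ["a", "b"]}),
    daft.from_pydict({"pk": [3, 4], "value": ["c", "d"]}),
]

# Same fold as merge(): start with the first dataframe and concat the rest.
result = None
for df in delta_dfs:
    result = df if result is None else result.concat(df)

print(result.count_rows())  # 4 rows; no deduplication yet, per the TODO above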
deltacat/compute/merge_on_read/model/__init__.py
File without changes
deltacat/compute/merge_on_read/model/merge_on_read_params.py
ADDED
@@ -0,0 +1,66 @@
+from __future__ import annotations
+from typing import Optional, Dict, List, Union, Any
+from deltacat.storage import (
+    Delta,
+    DeltaLocator,
+    interface as unimplemented_deltacat_storage,
+)
+
+
+class MergeOnReadParams(dict):
+    """
+    This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
+    """
+
+    @staticmethod
+    def of(params: Optional[Dict]) -> MergeOnReadParams:
+        params = {} if params is None else params
+
+        result = MergeOnReadParams(params)
+        assert result.deltas is not None, "deltas is a required arg"
+
+        result.deltacat_storage = params.get(
+            "deltacat_storage", unimplemented_deltacat_storage
+        )
+        result.reader_kwargs = params.get("reader_kwargs", {})
+        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
+
+        return result
+
+    @property
+    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
+        """
+        The list of deltas to compact in-memory.
+        """
+        return self["deltas"]
+
+    @deltas.setter
+    def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
+        self["deltas"] = to_set
+
+    @property
+    def reader_kwargs(self) -> Dict[Any, Any]:
+        """
+        The key word arguments to be passed to the reader.
+        """
+        return self["reader_kwargs"]
+
+    @reader_kwargs.setter
+    def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
+        self["reader_kwargs"] = kwargs
+
+    @property
+    def deltacat_storage(self) -> unimplemented_deltacat_storage:
+        return self["deltacat_storage"]
+
+    @deltacat_storage.setter
+    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
+        self["deltacat_storage"] = storage
+
+    @property
+    def deltacat_storage_kwargs(self) -> dict:
+        return self["deltacat_storage_kwargs"]
+
+    @deltacat_storage_kwargs.setter
+    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
+        self["deltacat_storage_kwargs"] = kwargs
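
As a quick illustration of the new params wrapper, the sketch below constructs a MergeOnReadParams from a plain dict. The empty deltas list is a placeholder (in practice it would hold Delta or DeltaLocator objects), and the example assumes deltacat 1.0.0 is installed.

from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams

# "deltas" is the only required key; the others fall back to the defaults shown above.
params = MergeOnReadParams.of(
    {
        "deltas": [],  # placeholder; normally a list of Delta or DeltaLocator objects
        "reader_kwargs": {},
        "deltacat_storage_kwargs": {},
    }
)

assert params.reader_kwargs == {}
print(params.deltacat_storage)  # defaults to the unimplemented storage interface module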
deltacat/compute/merge_on_read/utils/__init__.py
File without changes
deltacat/compute/merge_on_read/utils/delta.py
ADDED
@@ -0,0 +1,42 @@
+from typing import List, Dict, Any, Optional, Union
+from deltacat.storage.model.delta import Delta, DeltaLocator
+from deltacat.storage.model.types import DistributedDataset
+from deltacat.storage import (
+    interface as unimplemented_deltacat_storage,
+)
+from deltacat.types.media import TableType, StorageType, DistributedDatasetType
+
+
+def create_df_from_all_deltas(
+    deltas: List[Union[Delta, DeltaLocator]],
+    table_type: TableType,
+    distributed_dataset_type: DistributedDatasetType,
+    reader_kwargs: Optional[Dict[Any, Any]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
+    *args,
+    **kwargs
+) -> List[DistributedDataset]:  # type: ignore
+    """
+    This method creates a distributed dataset for each delta and returns their references.
+    """
+
+    if reader_kwargs is None:
+        reader_kwargs = {}
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
+    df_list = []
+
+    for delta in deltas:
+        df = deltacat_storage.download_delta(
+            delta_like=delta,
+            table_type=table_type,
+            distributed_dataset_type=distributed_dataset_type,
+            storage_type=StorageType.DISTRIBUTED,
+            **reader_kwargs,
+            **deltacat_storage_kwargs
+        )
+        df_list.append(df)
+
+    return df_list
deltacat/storage/interface.py
CHANGED
@@ -23,7 +23,12 @@ from deltacat.storage import (
     SortKey,
     PartitionLocator,
 )
-from deltacat.types.media import
+from deltacat.types.media import (
+    ContentType,
+    StorageType,
+    TableType,
+    DistributedDatasetType,
+)
 from deltacat.utils.common import ReadKwargsProvider


@@ -178,9 +183,10 @@ def download_delta(
     columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
     *args,
     **kwargs
-) -> Union[LocalDataset, DistributedDataset]:
+) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     """
     Download the given delta or delta locator into either a list of
     tables resident in the local node's memory, or into a dataset distributed
@@ -205,6 +211,8 @@ def download_delta_manifest_entry(
     given delta or delta locator. If a delta is provided with a non-empty
     manifest, then the entry is downloaded from this manifest. Otherwise, the
     manifest is first retrieved then the given entry index downloaded.
+
+    NOTE: The entry will be downloaded in the current node's memory.
     """
     raise NotImplementedError("download_delta_manifest_entry not implemented")

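
The interface change above only adds the distributed_dataset_type knob; how a concrete storage backend honors it is implementation-specific. The helper below is purely illustrative (not deltacat code) and shows one plausible dispatch, assuming the underlying data files are Parquet and that daft and ray are installed.

from typing import List
from deltacat.types.media import DistributedDatasetType


def build_distributed_dataset(file_paths: List[str], dataset_type: DistributedDatasetType):
    # Hypothetical helper: load the given files into the requested distributed dataset type.
    if dataset_type == DistributedDatasetType.DAFT:
        import daft

        return daft.read_parquet(file_paths)
    if dataset_type == DistributedDatasetType.RAY_DATASET:
        import ray.data

        return ray.data.read_parquet(file_paths)
    raise ValueError(f"Unsupported distributed dataset type: {dataset_type}")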
deltacat/storage/model/types.py
CHANGED
@@ -1,24 +1,16 @@
 from enum import Enum
-from typing import List, Union
+from typing import List, Union

 from pyarrow.parquet import ParquetFile
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pkg_resources
-from ray.data._internal.arrow_block import ArrowRow
 from ray.data.dataset import Dataset
+from daft import DataFrame as DaftDataFrame

 LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
 LocalDataset = List[LocalTable]
-
-# and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
-ray_version = pkg_resources.parse_version(pkg_resources.get_distribution("ray").version)
-change_version = pkg_resources.parse_version("2.5.0")
-if ray_version < change_version:
-    DistributedDataset = Dataset[Union[ArrowRow, np.ndarray, Any]]
-else:
-    DistributedDataset = Dataset
+DistributedDataset = Union[Dataset, DaftDataFrame]


 class DeltaType(str, Enum):
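
Because DistributedDataset is now a union of Ray's Dataset and Daft's DataFrame, downstream code that previously assumed a Ray Dataset may need a small type check. A minimal sketch of such handling (not from the package; it assumes ray and daft are installed):

from daft import DataFrame as DaftDataFrame
from ray.data.dataset import Dataset

from deltacat.storage.model.types import DistributedDataset


def distributed_row_count(ds: DistributedDataset) -> int:
    # Ray Datasets expose count(); Daft DataFrames expose count_rows().
    if isinstance(ds, DaftDataFrame):
        return ds.count_rows()
    if isinstance(ds, Dataset):
        return ds.count()
    raise TypeError(f"Unsupported distributed dataset type: {type(ds)}")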
deltacat/tests/catalog/__init__.py
File without changes
deltacat/tests/catalog/test_default_catalog_impl.py
ADDED
@@ -0,0 +1,98 @@
+import unittest
+import sqlite3
+import ray
+import os
+import deltacat.tests.local_deltacat_storage as ds
+from deltacat.utils.common import current_time_ms
+from deltacat.tests.test_utils.pyarrow import (
+    create_delta_from_csv_file,
+    commit_delta_to_partition,
+)
+from deltacat.types.media import DistributedDatasetType, ContentType
+from deltacat.catalog import default_catalog_impl as dc
+
+
+class TestReadTable(unittest.TestCase):
+    READ_TABLE_NAMESPACE = "catalog_read_table_namespace"
+    LOCAL_CATALOG_NAME = "local_catalog"
+    DB_FILE_PATH = f"{current_time_ms()}.db"
+    SAMPLE_FILE_PATH = "deltacat/tests/catalog/data/sample_table.csv"
+
+    @classmethod
+    def setUpClass(cls):
+        ray.init(local_mode=True, ignore_reinit_error=True)
+
+        con = sqlite3.connect(cls.DB_FILE_PATH)
+        cur = con.cursor()
+        cls.kwargs = {
+            ds.SQLITE_CON_ARG: con,
+            ds.SQLITE_CUR_ARG: cur,
+            "supported_content_types": [ContentType.CSV],
+        }
+        cls.deltacat_storage_kwargs = {ds.DB_FILE_PATH_ARG: cls.DB_FILE_PATH}
+
+        super().setUpClass()
+
+    @classmethod
+    def doClassCleanups(cls) -> None:
+        os.remove(cls.DB_FILE_PATH)
+
+    def test_daft_distributed_read_sanity(self):
+        # setup
+        READ_TABLE_TABLE_NAME = "test_read_table"
+        create_delta_from_csv_file(
+            self.READ_TABLE_NAMESPACE,
+            [self.SAMPLE_FILE_PATH],
+            table_name=READ_TABLE_TABLE_NAME,
+            **self.kwargs,
+        )
+
+        dc.initialize(ds=ds)
+        df = dc.read_table(
+            table=READ_TABLE_TABLE_NAME,
+            namespace=self.READ_TABLE_NAMESPACE,
+            catalog=self.LOCAL_CATALOG_NAME,
+            distributed_dataset_type=DistributedDatasetType.DAFT,
+            deltacat_storage_kwargs=self.kwargs,
+        )
+
+        # verify
+        self.assertEqual(df.count_rows(), 6)
+        self.assertEqual(df.column_names, ["pk", "value"])
+
+    def test_daft_distributed_read_multiple_deltas(self):
+        # setup
+        READ_TABLE_TABLE_NAME = "test_read_table_2"
+        delta = create_delta_from_csv_file(
+            self.READ_TABLE_NAMESPACE,
+            [self.SAMPLE_FILE_PATH],
+            table_name=READ_TABLE_TABLE_NAME,
+            **self.kwargs,
+        )
+
+        partition = ds.get_partition(
+            delta.stream_locator, delta.partition_values, **self.kwargs
+        )
+
+        commit_delta_to_partition(
+            partition=partition, file_paths=[self.SAMPLE_FILE_PATH], **self.kwargs
+        )
+
+        # action
+        dc.initialize(ds=ds)
+        df = dc.read_table(
+            table=READ_TABLE_TABLE_NAME,
+            namespace=self.READ_TABLE_NAMESPACE,
+            catalog=self.LOCAL_CATALOG_NAME,
+            distributed_dataset_type=DistributedDatasetType.DAFT,
+            merge_on_read=False,
+            deltacat_storage_kwargs=self.kwargs,
+        )
+
+        # verify
+        self.assertEqual(
+            df.count_rows(),
+            12,
+            "we expect twice as many" " columns as merge on read is disabled",
+        )
+        self.assertEqual(df.column_names, ["pk", "value"])