deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +47 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/compact_partition_test_cases.py
CHANGED
@@ -442,6 +442,33 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         drop_duplicates=True,
         skip_enabled_compact_partition_drivers=None,
     ),
+    "12-incremental-decimal-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+                pa.array([i for i in range(20)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
+                pa.array([3, 7, 11, 15, 19]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
 }
 
 REBASE_THEN_INCREMENTAL_TEST_CASES = {
@@ -1091,6 +1118,104 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
     ),
     "14-rebase-then-empty-incremental-delta": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=None,
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=3,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "15-rebase-then-incremental-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "16-rebase-then-empty-incremental-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
         primary_keys={"pk_col_1"},
         sort_keys=[
             SortKey.of(key_name="sk_col_1"),
@@ -1137,9 +1262,9 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
     ),
 }
 
-
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
 
+
 REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
     REBASE_THEN_INCREMENTAL_TEST_CASES
 )
deltacat/tests/compute/test_compact_partition_incremental.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Set
 from boto3.resources.base import ServiceResource
 import pyarrow as pa
 from pytest_benchmark.fixture import BenchmarkFixture
+from deltacat.types.media import StorageType
 
 from deltacat.tests.compute.test_util_common import (
     get_rcf,
@@ -269,7 +270,9 @@ def test_compact_partition_incremental(
         **compaction_audit_obj
     )
 
-    tables = ds.download_delta(
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_compacted_table = pa.concat_tables(tables)
     sorting_cols: List[Any] = [(val, "ascending") for val in primary_keys]
     # the compacted table may contain multiple files and chunks
deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py
CHANGED
@@ -32,6 +32,7 @@ from deltacat.tests.compute.compact_partition_test_cases import (
     REBASE_THEN_INCREMENTAL_TEST_CASES,
 )
 from typing import Any, Callable, Dict, List, Optional, Set
+from deltacat.types.media import StorageType
 
 DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
     "db_file_path",
@@ -272,7 +273,9 @@ def test_compact_partition_rebase_then_incremental(
     compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
         setup_s3_resource, rcf_file_s3_uri
     )
-    tables = ds.download_delta(
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_rebase_compacted_table = pa.concat_tables(tables)
     # if no primary key is specified then sort by sort_key for consistent assertion
     sorting_cols: List[Any] = (
@@ -341,7 +344,11 @@ def test_compact_partition_rebase_then_incremental(
         **compaction_audit_obj
     )
 
-    tables = ds.download_delta(
+    tables = ds.download_delta(
+        compacted_delta_locator_incremental,
+        storage_type=StorageType.LOCAL,
+        **ds_mock_kwargs,
+    )
     actual_compacted_table = pa.concat_tables(tables)
     expected_terminal_compact_partition_result = (
         expected_terminal_compact_partition_result.combine_chunks().sort_by(
deltacat/tests/local_deltacat_storage/__init__.py
CHANGED
@@ -1,10 +1,12 @@
 from typing import Any, Callable, Dict, List, Optional, Set, Union, Tuple
 
 import pyarrow as pa
+import daft
 import json
 import sqlite3
 from sqlite3 import Cursor, Connection
 import uuid
+import ray
 import io
 
 from deltacat.tests.test_utils.storage import create_empty_delta
@@ -38,7 +40,13 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
 )
-from deltacat.types.media import
+from deltacat.types.media import (
+    ContentType,
+    StorageType,
+    TableType,
+    ContentEncoding,
+    DistributedDatasetType,
+)
 from deltacat.utils.common import ReadKwargsProvider
 
 SQLITE_CUR_ARG = "sqlite3_cur"
@@ -337,9 +345,10 @@ def download_delta(
     columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
     *args,
     **kwargs,
-) -> Union[LocalDataset, DistributedDataset]:
+) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
@@ -356,6 +365,14 @@ def download_delta(
             )
         )
 
+    if storage_type == StorageType.DISTRIBUTED:
+        if distributed_dataset_type is DistributedDatasetType.DAFT:
+            return daft.from_arrow(result)
+        elif distributed_dataset_type is DistributedDatasetType.RAY_DATASET:
+            return ray.data.from_arrow(result)
+        else:
+            raise ValueError(f"Dataset type {distributed_dataset_type} not supported!")
+
     return result
 
 
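The download_delta hunks above make the local storage stub honor StorageType.DISTRIBUTED together with the new distributed_dataset_type switch, returning either a Ray Dataset or a Daft DataFrame instead of a list of local tables. A minimal sketch of how a caller might exercise the Daft branch; the helper name, delta_locator, and db_kwargs are illustrative placeholders, not identifiers from this diff:

from typing import Any, Dict

import deltacat.tests.local_deltacat_storage as ds
from deltacat.types.media import DistributedDatasetType, StorageType


def read_compacted_delta_as_daft(delta_locator, db_kwargs: Dict[str, Any]):
    # delta_locator: a DeltaLocator from the test fixtures; db_kwargs: the sqlite
    # storage kwargs. Both are supplied by the caller in this sketch.
    return ds.download_delta(
        delta_locator,
        storage_type=StorageType.DISTRIBUTED,
        distributed_dataset_type=DistributedDatasetType.DAFT,
        **db_kwargs,
    )

With the default distributed_dataset_type of DistributedDatasetType.RAY_DATASET, the same call returns ray.data.from_arrow(result) instead.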
deltacat/tests/test_utils/pyarrow.py
CHANGED
@@ -1,14 +1,18 @@
-from typing import List
+from typing import List, Optional
 import pyarrow as pa
-from deltacat.storage import Delta, Partition
+from deltacat.storage import Delta, Partition, PartitionLocator
 import deltacat.tests.local_deltacat_storage as ds
 
 
 def create_delta_from_csv_file(
-    namespace: str,
+    namespace: str,
+    file_paths: List[str],
+    table_name: Optional[str] = None,
+    *args,
+    **kwargs
 ) -> Delta:
     staged_partition = stage_partition_from_file_paths(
-        namespace, file_paths, *args, **kwargs
+        namespace, file_paths, *args, table_name=table_name, **kwargs
     )
 
     committed_delta = commit_delta_to_staged_partition(
@@ -19,10 +23,15 @@ def create_delta_from_csv_file(
 
 
 def stage_partition_from_file_paths(
-    namespace: str,
+    namespace: str,
+    file_paths: List[str],
+    table_name: Optional[str] = None,
+    *args,
+    **kwargs
 ) -> Partition:
     ds.create_namespace(namespace, {}, **kwargs)
-    table_name
+    if table_name is None:
+        table_name = "-".join(file_paths).replace("/", "_")
     ds.create_table_version(namespace, table_name, "1", **kwargs)
     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
     staged_partition = ds.stage_partition(stream, [], **kwargs)
@@ -31,19 +40,29 @@ def stage_partition_from_file_paths(
 
 
 def commit_delta_to_staged_partition(
     staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    committed_delta = commit_delta_to_partition(
+        staged_partition, file_paths=file_paths, *args, **kwargs
+    )
+    ds.commit_partition(staged_partition, **kwargs)
+    return committed_delta
+
+
+def commit_delta_to_partition(
+    partition: Partition, file_paths: List[str], *args, **kwargs
 ) -> Delta:
     tables = []
 
+    if isinstance(partition, PartitionLocator):
+        partition = ds.get_partition(
+            partition.stream_locator, partition.partition_values, *args, **kwargs
+        )
+
     for file_path in file_paths:
         table = pa.csv.read_csv(file_path)
         tables.append(table)
-    deltas = []
 
-
-
-    deltas.append(delta)
+    table = pa.concat_tables(tables)
+    staged_delta = ds.stage_delta(table, partition, **kwargs)
 
-
-    committed_delta = ds.commit_delta(merged_delta, **kwargs)
-    ds.commit_partition(staged_partition, **kwargs)
-    return committed_delta
+    return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_daft.py
CHANGED
@@ -1,6 +1,6 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.utils.daft import daft_s3_file_to_table
+from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
 
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
@@ -9,7 +9,7 @@ import pyarrow as pa
 from pyarrow import parquet as pq
 
 
-class TestDaftParquetReader(unittest.TestCase):
+class TestDaftS3FileToTable(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
 
     def test_read_from_s3_all_columns(self):
@@ -121,5 +121,45 @@ class TestDaftParquetReader(unittest.TestCase):
         self.assertEqual(table.num_rows, 10)
 
 
+class TestDaftS3FilesToDataFrame(unittest.TestCase):
+    MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
+
+    def test_read_from_s3_all_columns(self):
+        df = s3_files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True},
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_does_not_read_from_s3_if_not_materialized(self):
+        df = s3_files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True},
+        )
+
+        self.assertRaises(RuntimeError, lambda: len(df))
+        df.collect()
+        self.assertEqual(len(df), 100)
+
+    def test_raises_error_if_not_supported_content_type(self):
+
+        self.assertRaises(
+            AssertionError,
+            lambda: s3_files_to_dataframe(
+                uris=[self.MVP_PATH],
+                content_encoding=ContentEncoding.IDENTITY.value,
+                content_type=ContentType.UNESCAPED_TSV.value,
+                ray_init_options={"local_mode": True},
+            ),
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
deltacat/types/media.py
CHANGED
deltacat/types/tables.py
CHANGED
@@ -15,10 +15,11 @@ from ray.data.read_api import (
 )
 
 import deltacat.storage as dcs
-from deltacat.types.media import TableType
+from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
 from deltacat.utils import pyarrow as pa_utils
+from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
@@ -78,6 +79,11 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
 }
 
 
+DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+    DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
+}
+
+
 class TableWriteMode(str, Enum):
     """
     Enum controlling how a given dataset will be written to a table.
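The new DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC registry follows the same lookup pattern as the existing TABLE_TYPE_TO_READER_FUNC table: resolve a reader by dataset-type value, then call it with the S3 read arguments. A rough sketch of that dispatch, using a placeholder URI that is not taken from this diff:

from deltacat.types.media import ContentEncoding, ContentType, DistributedDatasetType
from deltacat.types.tables import DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC

# Resolve the Daft reader registered above and point it at a placeholder parquet file.
reader = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[DistributedDatasetType.DAFT.value]
df = reader(
    uris=["s3://example-bucket/example.parquet"],  # placeholder URI
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
)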
deltacat/utils/daft.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
-from typing import Optional, List
-
+from typing import Optional, List, Any, Dict, Callable
+import daft
+import ray
 from daft.table import read_parquet_into_pyarrow
-from daft import TimeUnit
+from daft import TimeUnit, DataFrame
 from daft.io import IOConfig, S3Config
 import pyarrow as pa
 
@@ -22,6 +23,66 @@ from deltacat.types.partial_download import (
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+def s3_files_to_dataframe(
+    uris: List[str],
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
+    s3_client_kwargs: Optional[Any] = None,
+    ray_init_options: Optional[Dict[str, Any]] = None,
+) -> DataFrame:
+
+    if ray_init_options is None:
+        ray_init_options = {}
+
+    assert (
+        content_type == ContentType.PARQUET.value
+    ), f"daft native reader currently only supports parquet, got {content_type}"
+
+    assert (
+        content_encoding == ContentEncoding.IDENTITY.value
+    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+    if not ray.is_initialized():
+        ray.init(address="auto", ignore_reinit_error=True, **ray_init_options)
+
+    daft.context.set_runner_ray(noop_if_initialized=True)
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+    if read_func_kwargs_provider is not None:
+        kwargs = read_func_kwargs_provider(content_type, kwargs)
+
+    # TODO(raghumdani): pass in coerce_int96_timestamp arg
+    # https://github.com/Eventual-Inc/Daft/issues/1894
+
+    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+
+    logger.debug(
+        f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
+    )
+
+    df, latency = timed_invocation(
+        daft.read_parquet, path=uris, io_config=io_config, use_native_downloader=True
+    )
+
+    logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
+
+    columns_to_read = include_columns or column_names
+
+    logger.debug(f"Taking columns {columns_to_read} from the daft df.")
+
+    if columns_to_read:
+        return df.select(*columns_to_read)
+    else:
+        return df
+
+
 def daft_s3_file_to_table(
     s3_url: str,
     content_type: str,
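The s3_files_to_dataframe reader added above is exercised end to end by the TestDaftS3FilesToDataFrame cases earlier in this diff. As a quick standalone sketch (the bucket path and column names below are placeholders, not values from this diff), a call looks like:

from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.daft import s3_files_to_dataframe

# Placeholder URIs and columns; s3_client_kwargs would normally carry AWS credentials.
df = s3_files_to_dataframe(
    uris=["s3://example-bucket/compacted/part-0.parquet"],
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    include_columns=["pk_col_1", "sk_col_1"],
)
table = df.to_arrow()  # materializes the lazy Daft DataFrame into a pyarrow.Table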
@@ -55,16 +116,7 @@ def daft_s3_file_to_table(
     ):
         row_groups = partial_file_download_params.row_groups_to_download
 
-    io_config = IOConfig(
-        s3=S3Config(
-            key_id=s3_client_kwargs.get("aws_access_key_id"),
-            access_key=s3_client_kwargs.get("aws_secret_access_key"),
-            session_token=s3_client_kwargs.get("aws_session_token"),
-            retry_mode="adaptive",
-            num_tries=BOTO_MAX_RETRIES,
-            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
-        )
-    )
+    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
 
     logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
 
@@ -95,3 +147,16 @@ def daft_s3_file_to_table(
         return coerce_pyarrow_table_to_schema(pa_table, input_schema)
     else:
         return pa_table
+
+
+def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
+    return IOConfig(
+        s3=S3Config(
+            key_id=s3_client_kwargs.get("aws_access_key_id"),
+            access_key=s3_client_kwargs.get("aws_secret_access_key"),
+            session_token=s3_client_kwargs.get("aws_session_token"),
+            retry_mode="adaptive",
+            num_tries=BOTO_MAX_RETRIES,
+            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
+        )
+    )
{deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.2.9
+Version: 1.0.0
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -27,7 +27,7 @@ Requires-Dist: tenacity ==8.1.0
 Requires-Dist: typing-extensions ==4.4.0
 Requires-Dist: pymemcache ==4.0.0
 Requires-Dist: redis ==4.6.0
-Requires-Dist: getdaft ==0.2.
+Requires-Dist: getdaft ==0.2.16
 Requires-Dist: schedule ==1.2.0
 
 # DeltaCAT