deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (39)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/__init__.py +4 -0
  3. deltacat/aws/redshift/model/manifest.py +93 -1
  4. deltacat/aws/s3u.py +250 -111
  5. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  6. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  7. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  8. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  10. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  11. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  12. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  13. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  14. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  15. deltacat/compute/compactor_v2/utils/task_options.py +47 -4
  16. deltacat/compute/merge_on_read/__init__.py +4 -0
  17. deltacat/compute/merge_on_read/daft.py +40 -0
  18. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  20. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  21. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  22. deltacat/storage/interface.py +10 -2
  23. deltacat/storage/model/types.py +3 -11
  24. deltacat/tests/catalog/__init__.py +0 -0
  25. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  26. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  27. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  28. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  29. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  30. deltacat/tests/test_utils/pyarrow.py +33 -14
  31. deltacat/tests/utils/test_daft.py +42 -2
  32. deltacat/types/media.py +5 -0
  33. deltacat/types/tables.py +7 -1
  34. deltacat/utils/daft.py +78 -13
  35. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
  36. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
  37. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
  38. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
  39. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/compact_partition_test_cases.py CHANGED
@@ -442,6 +442,33 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         drop_duplicates=True,
         skip_enabled_compact_partition_drivers=None,
     ),
+    "12-incremental-decimal-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+                pa.array([i for i in range(20)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
+                pa.array([3, 7, 11, 15, 19]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
 }
 
 REBASE_THEN_INCREMENTAL_TEST_CASES = {
@@ -1091,6 +1118,104 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
     ),
     "14-rebase-then-empty-incremental-delta": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=None,
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=3,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "15-rebase-then-incremental-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "16-rebase-then-empty-incremental-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
         primary_keys={"pk_col_1"},
         sort_keys=[
             SortKey.of(key_name="sk_col_1"),
@@ -1137,9 +1262,9 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
     ),
 }
 
-
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
 
+
 REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
     REBASE_THEN_INCREMENTAL_TEST_CASES
 )
deltacat/tests/compute/test_compact_partition_incremental.py CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Set
 from boto3.resources.base import ServiceResource
 import pyarrow as pa
 from pytest_benchmark.fixture import BenchmarkFixture
+from deltacat.types.media import StorageType
 
 from deltacat.tests.compute.test_util_common import (
     get_rcf,
@@ -269,7 +270,9 @@ def test_compact_partition_incremental(
         **compaction_audit_obj
     )
 
-    tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_compacted_table = pa.concat_tables(tables)
     sorting_cols: List[Any] = [(val, "ascending") for val in primary_keys]
     # the compacted table may contain multiple files and chunks
deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py CHANGED
@@ -32,6 +32,7 @@ from deltacat.tests.compute.compact_partition_test_cases import (
     REBASE_THEN_INCREMENTAL_TEST_CASES,
 )
 from typing import Any, Callable, Dict, List, Optional, Set
+from deltacat.types.media import StorageType
 
 DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
     "db_file_path",
@@ -272,7 +273,9 @@ def test_compact_partition_rebase_then_incremental(
     compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
         setup_s3_resource, rcf_file_s3_uri
     )
-    tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_rebase_compacted_table = pa.concat_tables(tables)
     # if no primary key is specified then sort by sort_key for consistent assertion
     sorting_cols: List[Any] = (
@@ -341,7 +344,11 @@ def test_compact_partition_rebase_then_incremental(
         **compaction_audit_obj
     )
 
-    tables = ds.download_delta(compacted_delta_locator_incremental, **ds_mock_kwargs)
+    tables = ds.download_delta(
+        compacted_delta_locator_incremental,
+        storage_type=StorageType.LOCAL,
+        **ds_mock_kwargs,
+    )
     actual_compacted_table = pa.concat_tables(tables)
     expected_terminal_compact_partition_result = (
         expected_terminal_compact_partition_result.combine_chunks().sort_by(
deltacat/tests/local_deltacat_storage/__init__.py CHANGED
@@ -1,10 +1,12 @@
 from typing import Any, Callable, Dict, List, Optional, Set, Union, Tuple
 
 import pyarrow as pa
+import daft
 import json
 import sqlite3
 from sqlite3 import Cursor, Connection
 import uuid
+import ray
 import io
 
 from deltacat.tests.test_utils.storage import create_empty_delta
@@ -38,7 +40,13 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
 )
-from deltacat.types.media import ContentType, StorageType, TableType, ContentEncoding
+from deltacat.types.media import (
+    ContentType,
+    StorageType,
+    TableType,
+    ContentEncoding,
+    DistributedDatasetType,
+)
 from deltacat.utils.common import ReadKwargsProvider
 
 SQLITE_CUR_ARG = "sqlite3_cur"
@@ -337,9 +345,10 @@ def download_delta(
     columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
     *args,
     **kwargs,
-) -> Union[LocalDataset, DistributedDataset]:
+) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
@@ -356,6 +365,14 @@ def download_delta(
             )
         )
 
+    if storage_type == StorageType.DISTRIBUTED:
+        if distributed_dataset_type is DistributedDatasetType.DAFT:
+            return daft.from_arrow(result)
+        elif distributed_dataset_type is DistributedDatasetType.RAY_DATASET:
+            return ray.data.from_arrow(result)
+        else:
+            raise ValueError(f"Dataset type {distributed_dataset_type} not supported!")
+
     return result
 
 
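Together with the new DistributedDatasetType enum (see deltacat/types/media.py below), the download_delta change above lets distributed reads from this test storage shim come back as either a Ray Dataset (the default) or a Daft DataFrame. A minimal sketch of the call shape, assuming a committed delta's locator in delta_locator and the sqlite-backed storage kwargs the tests build in ds_mock_kwargs (both placeholders, not part of this diff):

    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.types.media import StorageType, DistributedDatasetType

    # Local reads still return a list of pyarrow Tables, as the updated tests expect.
    tables = ds.download_delta(
        delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
    )

    # Distributed reads now honor distributed_dataset_type: RAY_DATASET by default,
    # or a Daft DataFrame when DAFT is requested; other values raise ValueError.
    df = ds.download_delta(
        delta_locator,
        storage_type=StorageType.DISTRIBUTED,
        distributed_dataset_type=DistributedDatasetType.DAFT,
        **ds_mock_kwargs,
    )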
deltacat/tests/test_utils/pyarrow.py CHANGED
@@ -1,14 +1,18 @@
-from typing import List
+from typing import List, Optional
 import pyarrow as pa
-from deltacat.storage import Delta, Partition
+from deltacat.storage import Delta, Partition, PartitionLocator
 import deltacat.tests.local_deltacat_storage as ds
 
 
 def create_delta_from_csv_file(
-    namespace: str, file_paths: List[str], *args, **kwargs
+    namespace: str,
+    file_paths: List[str],
+    table_name: Optional[str] = None,
+    *args,
+    **kwargs
 ) -> Delta:
     staged_partition = stage_partition_from_file_paths(
-        namespace, file_paths, *args, **kwargs
+        namespace, file_paths, *args, table_name=table_name, **kwargs
     )
 
     committed_delta = commit_delta_to_staged_partition(
@@ -19,10 +23,15 @@ def create_delta_from_csv_file(
 
 
 def stage_partition_from_file_paths(
-    namespace: str, file_paths: List[str], *args, **kwargs
+    namespace: str,
+    file_paths: List[str],
+    table_name: Optional[str] = None,
+    *args,
+    **kwargs
 ) -> Partition:
     ds.create_namespace(namespace, {}, **kwargs)
-    table_name = "-".join(file_paths).replace("/", "_")
+    if table_name is None:
+        table_name = "-".join(file_paths).replace("/", "_")
     ds.create_table_version(namespace, table_name, "1", **kwargs)
     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
     staged_partition = ds.stage_partition(stream, [], **kwargs)
@@ -31,19 +40,29 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    committed_delta = commit_delta_to_partition(
+        staged_partition, file_paths=file_paths, *args, **kwargs
+    )
+    ds.commit_partition(staged_partition, **kwargs)
+    return committed_delta
+
+
+def commit_delta_to_partition(
+    partition: Partition, file_paths: List[str], *args, **kwargs
 ) -> Delta:
     tables = []
 
+    if isinstance(partition, PartitionLocator):
+        partition = ds.get_partition(
+            partition.stream_locator, partition.partition_values, *args, **kwargs
+        )
+
     for file_path in file_paths:
         table = pa.csv.read_csv(file_path)
         tables.append(table)
-    deltas = []
 
-    for table in tables:
-        delta = ds.stage_delta(table, staged_partition, **kwargs)
-        deltas.append(delta)
+    table = pa.concat_tables(tables)
+    staged_delta = ds.stage_delta(table, partition, **kwargs)
 
-    merged_delta = Delta.merge_deltas(deltas=deltas)
-    committed_delta = ds.commit_delta(merged_delta, **kwargs)
-    ds.commit_partition(staged_partition, **kwargs)
-    return committed_delta
+    return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_daft.py CHANGED
@@ -1,6 +1,6 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.utils.daft import daft_s3_file_to_table
+from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
 
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
@@ -9,7 +9,7 @@ import pyarrow as pa
 from pyarrow import parquet as pq
 
 
-class TestDaftParquetReader(unittest.TestCase):
+class TestDaftS3FileToTable(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
 
     def test_read_from_s3_all_columns(self):
@@ -121,5 +121,45 @@ class TestDaftParquetReader(unittest.TestCase):
         self.assertEqual(table.num_rows, 10)
 
 
+class TestDaftS3FilesToDataFrame(unittest.TestCase):
+    MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
+
+    def test_read_from_s3_all_columns(self):
+        df = s3_files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True},
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_does_not_read_from_s3_if_not_materialized(self):
+        df = s3_files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True},
+        )
+
+        self.assertRaises(RuntimeError, lambda: len(df))
+        df.collect()
+        self.assertEqual(len(df), 100)
+
+    def test_raises_error_if_not_supported_content_type(self):
+
+        self.assertRaises(
+            AssertionError,
+            lambda: s3_files_to_dataframe(
+                uris=[self.MVP_PATH],
+                content_encoding=ContentEncoding.IDENTITY.value,
+                content_type=ContentType.UNESCAPED_TSV.value,
+                ray_init_options={"local_mode": True},
+            ),
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
deltacat/types/media.py CHANGED
@@ -44,6 +44,11 @@ class TableType(str, Enum):
     PYARROW_PARQUET = "pyarrow_parquet"
 
 
+class DistributedDatasetType(str, Enum):
+    DAFT = "daft"
+    RAY_DATASET = "ray_dataset"
+
+
 class SchemaType(str, Enum):
     ARROW = "arrow"
 
deltacat/types/tables.py CHANGED
@@ -15,10 +15,11 @@ from ray.data.read_api import (
 )
 
 import deltacat.storage as dcs
-from deltacat.types.media import TableType
+from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
 from deltacat.utils import pyarrow as pa_utils
+from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
@@ -78,6 +79,11 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
 }
 
 
+DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+    DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
+}
+
+
 class TableWriteMode(str, Enum):
     """
     Enum controlling how a given dataset will be written to a table.
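The new mapping mirrors the existing TABLE_TYPE_TO_READER_FUNC pattern: a distributed reader can be resolved from a DistributedDatasetType value, and because the enum is string-valued its .value doubles as the lookup key. A small sketch of that dispatch; note that only the Daft entry is registered in this diff:

    from deltacat.types.media import DistributedDatasetType
    from deltacat.types.tables import DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC

    # Resolves to deltacat.utils.daft.s3_files_to_dataframe for the DAFT entry;
    # RAY_DATASET has no reader registered in this mapping yet.
    reader = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[DistributedDatasetType.DAFT.value]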
deltacat/utils/daft.py CHANGED
@@ -1,8 +1,9 @@
 import logging
-from typing import Optional, List
-
+from typing import Optional, List, Any, Dict, Callable
+import daft
+import ray
 from daft.table import read_parquet_into_pyarrow
-from daft import TimeUnit
+from daft import TimeUnit, DataFrame
 from daft.io import IOConfig, S3Config
 import pyarrow as pa
 
@@ -22,6 +23,66 @@ from deltacat.types.partial_download import (
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+def s3_files_to_dataframe(
+    uris: List[str],
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
+    s3_client_kwargs: Optional[Any] = None,
+    ray_init_options: Optional[Dict[str, Any]] = None,
+) -> DataFrame:
+
+    if ray_init_options is None:
+        ray_init_options = {}
+
+    assert (
+        content_type == ContentType.PARQUET.value
+    ), f"daft native reader currently only supports parquet, got {content_type}"
+
+    assert (
+        content_encoding == ContentEncoding.IDENTITY.value
+    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+    if not ray.is_initialized():
+        ray.init(address="auto", ignore_reinit_error=True, **ray_init_options)
+
+    daft.context.set_runner_ray(noop_if_initialized=True)
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+    if read_func_kwargs_provider is not None:
+        kwargs = read_func_kwargs_provider(content_type, kwargs)
+
+    # TODO(raghumdani): pass in coerce_int96_timestamp arg
+    # https://github.com/Eventual-Inc/Daft/issues/1894
+
+    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+
+    logger.debug(
+        f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
+    )
+
+    df, latency = timed_invocation(
+        daft.read_parquet, path=uris, io_config=io_config, use_native_downloader=True
+    )
+
+    logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
+
+    columns_to_read = include_columns or column_names
+
+    logger.debug(f"Taking columns {columns_to_read} from the daft df.")
+
+    if columns_to_read:
+        return df.select(*columns_to_read)
+    else:
+        return df
+
+
 def daft_s3_file_to_table(
     s3_url: str,
     content_type: str,
@@ -55,16 +116,7 @@
     ):
         row_groups = partial_file_download_params.row_groups_to_download
 
-    io_config = IOConfig(
-        s3=S3Config(
-            key_id=s3_client_kwargs.get("aws_access_key_id"),
-            access_key=s3_client_kwargs.get("aws_secret_access_key"),
-            session_token=s3_client_kwargs.get("aws_session_token"),
-            retry_mode="adaptive",
-            num_tries=BOTO_MAX_RETRIES,
-            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
-        )
-    )
+    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
 
     logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
 
@@ -95,3 +147,16 @@
         return coerce_pyarrow_table_to_schema(pa_table, input_schema)
     else:
         return pa_table
+
+
+def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
+    return IOConfig(
+        s3=S3Config(
+            key_id=s3_client_kwargs.get("aws_access_key_id"),
+            access_key=s3_client_kwargs.get("aws_secret_access_key"),
+            session_token=s3_client_kwargs.get("aws_session_token"),
+            retry_mode="adaptive",
+            num_tries=BOTO_MAX_RETRIES,
+            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
+        )
+    )
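For reference, a hedged usage sketch of the new s3_files_to_dataframe reader added above; the S3 URIs and column names are placeholders. As shown in the diff, the function asserts Parquet content with identity encoding, initializes Ray at address "auto" if it is not already running, sets Daft's Ray runner, and returns a lazy Daft DataFrame, optionally projected via include_columns or column_names:

    from deltacat.types.media import ContentType, ContentEncoding
    from deltacat.utils.daft import s3_files_to_dataframe

    df = s3_files_to_dataframe(
        uris=["s3://my-bucket/part-0.parquet", "s3://my-bucket/part-1.parquet"],
        content_type=ContentType.PARQUET.value,           # anything else fails the assert
        content_encoding=ContentEncoding.IDENTITY.value,  # identity encoding only
        include_columns=["pk_col_1", "col_1"],            # applied via df.select(...)
    )
    df.collect()  # reads stay lazy until the dataframe is materialized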
{deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.2.9
+Version: 1.0.0
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -27,7 +27,7 @@ Requires-Dist: tenacity ==8.1.0
 Requires-Dist: typing-extensions ==4.4.0
 Requires-Dist: pymemcache ==4.0.0
 Requires-Dist: redis ==4.6.0
-Requires-Dist: getdaft ==0.2.4
+Requires-Dist: getdaft ==0.2.16
 Requires-Dist: schedule ==1.2.0
 
 # DeltaCAT