deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. deltacat/__init__.py +3 -2
  2. deltacat/aws/clients.py +123 -3
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  6. deltacat/benchmarking/conftest.py +61 -0
  7. deltacat/catalog/delegate.py +1 -1
  8. deltacat/catalog/interface.py +1 -1
  9. deltacat/compute/compactor/__init__.py +0 -3
  10. deltacat/compute/compactor/compaction_session.py +45 -20
  11. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  12. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  13. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  14. deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  15. deltacat/compute/compactor/model/primary_key_index.py +1 -1
  16. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  17. deltacat/compute/compactor/repartition_session.py +5 -3
  18. deltacat/compute/compactor/steps/dedupe.py +10 -8
  19. deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  20. deltacat/compute/compactor/steps/materialize.py +11 -6
  21. deltacat/compute/compactor/steps/repartition.py +16 -1
  22. deltacat/compute/compactor/utils/io.py +40 -23
  23. deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  24. deltacat/compute/compactor/utils/sort_key.py +57 -0
  25. deltacat/compute/compactor/utils/system_columns.py +43 -0
  26. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  27. deltacat/compute/compactor_v2/constants.py +34 -0
  28. deltacat/compute/compactor_v2/model/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  30. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  31. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  32. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  33. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  34. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  37. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  38. deltacat/compute/compactor_v2/utils/io.py +149 -0
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  40. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  41. deltacat/compute/metastats/meta_stats.py +4 -2
  42. deltacat/compute/metastats/stats.py +1 -0
  43. deltacat/compute/metastats/utils/io.py +4 -0
  44. deltacat/compute/stats/utils/io.py +20 -5
  45. deltacat/exceptions.py +4 -0
  46. deltacat/io/memcached_object_store.py +37 -14
  47. deltacat/logs.py +4 -3
  48. deltacat/storage/__init__.py +3 -0
  49. deltacat/storage/interface.py +11 -2
  50. deltacat/storage/model/sort_key.py +33 -0
  51. deltacat/storage/model/table_version.py +11 -0
  52. deltacat/storage/model/types.py +2 -1
  53. deltacat/tests/aws/__init__.py +0 -0
  54. deltacat/tests/aws/test_clients.py +80 -0
  55. deltacat/tests/compute/__init__.py +0 -0
  56. deltacat/tests/compute/common.py +96 -0
  57. deltacat/tests/compute/compactor/__init__.py +0 -0
  58. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  59. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  60. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  61. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  62. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  63. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  64. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  65. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  66. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  67. deltacat/tests/compute/testcases.py +390 -0
  68. deltacat/tests/io/test_memcached_object_store.py +5 -4
  69. deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  70. deltacat/tests/test_utils/pyarrow.py +32 -0
  71. deltacat/tests/test_utils/utils.py +13 -0
  72. deltacat/tests/utils/data/__init__.py +0 -0
  73. deltacat/tests/utils/test_daft.py +76 -0
  74. deltacat/tests/utils/test_pyarrow.py +133 -0
  75. deltacat/tests/utils/test_resources.py +23 -20
  76. deltacat/types/media.py +1 -0
  77. deltacat/types/partial_download.py +82 -0
  78. deltacat/types/tables.py +1 -0
  79. deltacat/utils/arguments.py +26 -0
  80. deltacat/utils/daft.py +87 -0
  81. deltacat/utils/performance.py +4 -2
  82. deltacat/utils/placement.py +20 -3
  83. deltacat/utils/pyarrow.py +213 -1
  84. deltacat/utils/ray_utils/concurrency.py +26 -1
  85. deltacat/utils/resources.py +72 -1
  86. deltacat/utils/s3fs.py +21 -0
  87. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
  88. deltacat-0.1.18b15.dist-info/RECORD +176 -0
  89. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  90. deltacat/compute/compactor/model/sort_key.py +0 -98
  91. deltacat-0.1.18b13.dist-info/RECORD +0 -136
  92. /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
  93. /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
  94. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  95. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/tests/test_utils/pyarrow.py ADDED
@@ -0,0 +1,32 @@
+ from typing import List
+ import pyarrow as pa
+ from deltacat.storage import Delta
+ import deltacat.tests.local_deltacat_storage as ds
+
+
+ def create_delta_from_csv_file(
+     namespace: str, file_paths: List[str], *args, **kwargs
+ ) -> Delta:
+     tables = []
+
+     for file_path in file_paths:
+         table = pa.csv.read_csv(file_path)
+         tables.append(table)
+
+     ds.create_namespace(namespace, {}, **kwargs)
+     table_name = "-".join(file_paths).replace("/", "_")
+     ds.create_table_version(namespace, table_name, "1", **kwargs)
+     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
+     staged_partition = ds.stage_partition(stream, [], **kwargs)
+
+     deltas = []
+
+     for table in tables:
+         delta = ds.stage_delta(table, staged_partition, **kwargs)
+         deltas.append(delta)
+
+     merged_delta = Delta.merge_deltas(deltas=deltas)
+     committed_delta = ds.commit_delta(merged_delta, **kwargs)
+     ds.commit_partition(staged_partition, **kwargs)
+
+     return committed_delta
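For reference, a minimal usage sketch of the helper above; the namespace, file path, and the empty ds_kwargs dict are placeholders, since the connection arguments expected by deltacat.tests.local_deltacat_storage are not shown in this diff:

from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

# Placeholder storage kwargs: the keyword arguments that
# deltacat.tests.local_deltacat_storage expects are not shown in this diff.
ds_kwargs = {}

# Stages one delta per CSV file, merges them, and commits the result.
delta = create_delta_from_csv_file(
    "test_namespace",
    ["deltacat/tests/data/sample.csv"],  # hypothetical path
    **ds_kwargs,
)
print(delta)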
deltacat/tests/test_utils/utils.py ADDED
@@ -0,0 +1,13 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
+ import json
+ from typing import Any, Dict
+ from boto3.resources.base import ServiceResource
+
+
+ def read_s3_contents(
+     s3_resource: ServiceResource, bucket_name: str, key: str
+ ) -> Dict[str, Any]:
+     response = s3_resource.Object(bucket_name, key).get()
+     file_content: str = response["Body"].read().decode("utf-8")
+     return json.loads(file_content)
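A hedged usage sketch for the helper above; the bucket and key are hypothetical and would typically point at a JSON artifact written during a test (for example against a mocked S3):

import boto3

from deltacat.tests.test_utils.utils import read_s3_contents

# Hypothetical bucket/key holding a JSON document.
s3_resource = boto3.resource("s3")
contents = read_s3_contents(s3_resource, "my-test-bucket", "audit/compaction.json")
print(contents)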
deltacat/tests/utils/data/__init__.py (file without changes)
deltacat/tests/utils/test_daft.py ADDED
@@ -0,0 +1,76 @@
+ import unittest
+ from deltacat.types.media import ContentEncoding, ContentType
+ from deltacat.utils.daft import daft_s3_file_to_table
+
+ from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
+ from deltacat.types.partial_download import PartialParquetParameters
+ import pyarrow as pa
+
+ from pyarrow import parquet as pq
+
+
+ class TestDaftParquetReader(unittest.TestCase):
+     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
+
+     def test_read_from_s3_all_columns(self):
+         table = daft_s3_file_to_table(
+             self.MVP_PATH,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+         )
+         self.assertEqual(table.schema.names, ["a", "b"])
+         self.assertEqual(table.num_rows, 100)
+
+     def test_read_from_s3_single_column_via_include_columns(self):
+         table = daft_s3_file_to_table(
+             self.MVP_PATH,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+             include_columns=["b"],
+         )
+         self.assertEqual(table.schema.names, ["b"])
+         self.assertEqual(table.num_rows, 100)
+
+     def test_read_from_s3_single_column_via_column_names(self):
+         table = daft_s3_file_to_table(
+             self.MVP_PATH,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+             column_names=["b"],
+         )
+         self.assertEqual(table.schema.names, ["b"])
+         self.assertEqual(table.num_rows, 100)
+
+     def test_read_from_s3_single_column_with_schema(self):
+         schema = pa.schema([("a", pa.int64()), ("b", pa.string())])
+         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
+             schema=schema
+         )
+         table = daft_s3_file_to_table(
+             self.MVP_PATH,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+             include_columns=["b"],
+             pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+         )
+         self.assertEqual(table.schema.names, ["b"])
+         self.assertEqual(table.num_rows, 100)
+
+     def test_read_from_s3_single_column_with_row_groups(self):
+
+         metadata = pq.read_metadata(self.MVP_PATH)
+         ppp = PartialParquetParameters.of(pq_metadata=metadata)
+         ppp["row_groups_to_download"] = ppp.row_groups_to_download[1:2]
+         table = daft_s3_file_to_table(
+             self.MVP_PATH,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+             column_names=["b"],
+             partial_file_download_params=ppp,
+         )
+         self.assertEqual(table.schema.names, ["b"])
+         self.assertEqual(table.num_rows, 10)
+
+
+ if __name__ == "__main__":
+     unittest.main()
deltacat/tests/utils/test_pyarrow.py ADDED
@@ -0,0 +1,133 @@
+ from unittest import TestCase
+ from deltacat.utils.pyarrow import (
+     s3_parquet_file_to_table,
+     s3_partial_parquet_file_to_table,
+ )
+ from deltacat.types.media import ContentEncoding, ContentType
+ from deltacat.types.partial_download import PartialParquetParameters
+ from pyarrow.parquet import ParquetFile
+ import pyarrow as pa
+
+ PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+
+
+ class TestS3ParquetFileToTable(TestCase):
+     def test_s3_parquet_file_to_table_sanity(self):
+
+         result = s3_parquet_file_to_table(
+             PARQUET_FILE_PATH,
+             ContentType.PARQUET.value,
+             ContentEncoding.IDENTITY.value,
+             ["n_legs", "animal"],
+             ["n_legs"],
+         )
+
+         self.assertEqual(len(result), 6)
+         self.assertEqual(len(result.column_names), 1)
+         schema = result.schema
+         schema_index = schema.get_field_index("n_legs")
+         self.assertEqual(schema.field(schema_index).type, "int64")
+
+     def test_s3_parquet_file_to_table_when_schema_overridden(self):
+
+         schema = pa.schema(
+             [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
+         )
+
+         pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
+
+         result = s3_parquet_file_to_table(
+             PARQUET_FILE_PATH,
+             ContentType.PARQUET.value,
+             ContentEncoding.IDENTITY.value,
+             ["n_legs", "animal"],
+             pa_read_func_kwargs_provider=pa_kwargs_provider,
+         )
+
+         self.assertEqual(len(result), 6)
+         self.assertEqual(len(result.column_names), 2)
+
+         result_schema = result.schema
+         for index, field in enumerate(result_schema):
+             self.assertEqual(field.name, schema.field(index).name)
+
+         self.assertEqual(result.schema.field(1).type, "string")
+
+
+ class TestS3PartialParquetFileToTable(TestCase):
+     def test_s3_partial_parquet_file_to_table_sanity(self):
+
+         pq_file = ParquetFile(PARQUET_FILE_PATH)
+         partial_parquet_params = PartialParquetParameters.of(
+             pq_metadata=pq_file.metadata
+         )
+
+         self.assertEqual(
+             partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
+         )
+
+         # only first row group to be downloaded
+         partial_parquet_params.row_groups_to_download.pop()
+
+         result = s3_partial_parquet_file_to_table(
+             PARQUET_FILE_PATH,
+             include_columns=["n_legs"],
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+             partial_file_download_params=partial_parquet_params,
+         )
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(len(result.columns), 1)
+
+     def test_s3_partial_parquet_file_to_table_when_schema_passed(self):
+
+         pq_file = ParquetFile(PARQUET_FILE_PATH)
+         partial_parquet_params = PartialParquetParameters.of(
+             pq_metadata=pq_file.metadata
+         )
+         # only first row group to be downloaded
+         partial_parquet_params.row_groups_to_download.pop()
+
+         schema = pa.schema(
+             [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
+         )
+
+         pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
+
+         result = s3_partial_parquet_file_to_table(
+             PARQUET_FILE_PATH,
+             ContentType.PARQUET.value,
+             ContentEncoding.IDENTITY.value,
+             ["n_legs", "animal"],
+             pa_read_func_kwargs_provider=pa_kwargs_provider,
+             partial_file_download_params=partial_parquet_params,
+         )
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(len(result.column_names), 2)
+
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(0).name, "n_legs")  # order doesn't change
+
+     def test_s3_partial_parquet_file_to_table_when_multiple_row_groups(self):
+
+         pq_file = ParquetFile(PARQUET_FILE_PATH)
+         partial_parquet_params = PartialParquetParameters.of(
+             pq_metadata=pq_file.metadata
+         )
+
+         self.assertEqual(
+             partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
+         )
+
+         result = s3_partial_parquet_file_to_table(
+             PARQUET_FILE_PATH,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             content_type=ContentType.PARQUET.value,
+             partial_file_download_params=partial_parquet_params,
+         )
+
+         self.assertEqual(len(result), 6)
+         self.assertEqual(len(result.columns), 2)
deltacat/tests/utils/test_resources.py CHANGED
@@ -1,37 +1,22 @@
  import unittest
  from unittest import mock
- import sys
+ import time


  class TestGetCurrentClusterUtilization(unittest.TestCase):
-     @classmethod
-     def setUpClass(cls):
-         cls.ray_mock = mock.MagicMock()
-         cls.ray_mock.cluster_resources.return_value = {
+     @mock.patch("deltacat.utils.resources.ray")
+     def test_sanity(self, ray_mock):
+         ray_mock.cluster_resources.return_value = {
              "CPU": 10,
              "memory": 10,
              "object_store_memory": 5,
          }
-         cls.ray_mock.available_resources.return_value = {
+         ray_mock.available_resources.return_value = {
              "CPU": 6,
              "memory": 4,
              "object_store_memory": 5,
          }

-         cls.module_patcher = mock.patch.dict("sys.modules", {"ray": cls.ray_mock})
-         cls.module_patcher.start()
-
-         # delete reference to reload from mocked ray
-         if "deltacat.utils.resources" in sys.modules:
-             del sys.modules["deltacat.utils.resources"]
-
-         super().setUpClass()
-
-     @classmethod
-     def tearDownClass(cls) -> None:
-         cls.module_patcher.stop()
-
-     def test_sanity(self):
          from deltacat.utils.resources import ClusterUtilization

          result = ClusterUtilization.get_current_cluster_utilization()
@@ -43,3 +28,21 @@ class TestGetCurrentClusterUtilization(unittest.TestCase):
          self.assertEqual(0, result.used_object_store_memory_bytes)
          self.assertEqual(6, result.used_memory_bytes)
          self.assertIsNotNone(result.used_resources)
+
+
+ class TestClusterUtilizationOverTimeRange(unittest.TestCase):
+     @mock.patch("deltacat.utils.resources.ray")
+     def test_sanity(self, ray_mock):
+         from deltacat.utils.resources import ClusterUtilizationOverTimeRange
+
+         ray_mock.cluster_resources.side_effect = [{"CPU": 32} for _ in range(5)]
+         ray_mock.available_resources.side_effect = [
+             {"CPU": 2 ** (i + 1)} for i in range(5)
+         ]
+
+         with ClusterUtilizationOverTimeRange() as cu:
+             time.sleep(3)
+         self.assertTrue(cu.used_vcpu_seconds <= 82)  # 30 + 28 + 24
+         self.assertTrue(
+             cu.total_vcpu_seconds >= cu.used_vcpu_seconds
+         )  # total is greater than used
deltacat/types/media.py CHANGED
@@ -41,6 +41,7 @@ class TableType(str, Enum):
      PYARROW = "pyarrow"
      PANDAS = "pandas"
      NUMPY = "numpy"
+     PYARROW_PARQUET = "pyarrow_parquet"


  class SchemaType(str, Enum):
deltacat/types/partial_download.py ADDED
@@ -0,0 +1,82 @@
+ from __future__ import annotations
+
+ from typing import Dict, Any, Optional, List
+ from pyarrow.parquet import FileMetaData
+
+
+ class PartialFileDownloadParams(Dict[str, Any]):
+     """
+     A content type params class used to represent arguments required
+     to down the file partially. This is useful specifically in cases
+     where you'd like to instruct downloader to partially download a
+     manifest entry.
+     """
+
+     pass
+
+
+ class PartialParquetParameters(PartialFileDownloadParams):
+     @staticmethod
+     def of(
+         row_groups_to_download: Optional[List[int]] = None,
+         num_row_groups: Optional[int] = None,
+         num_rows: Optional[int] = None,
+         in_memory_size_bytes: Optional[float] = None,
+         pq_metadata: Optional[FileMetaData] = None,
+     ) -> PartialParquetParameters:
+
+         if (
+             row_groups_to_download is None
+             or num_row_groups is None
+             or num_rows is None
+             or in_memory_size_bytes is None
+         ):
+             assert (
+                 pq_metadata is not None
+             ), "Parquet file metadata must be passed explicitly"
+
+             num_row_groups = pq_metadata.num_row_groups
+             row_groups_to_download = [rg for rg in range(num_row_groups)]
+             in_memory_size_bytes = 0.0
+
+             for rg in row_groups_to_download:
+                 row_group_meta = pq_metadata.row_group(rg)
+                 in_memory_size_bytes += row_group_meta.total_byte_size
+
+         result = PartialParquetParameters(
+             {
+                 "row_groups_to_download": row_groups_to_download,
+                 "num_row_groups": num_row_groups,
+                 "num_rows": num_rows,
+                 "in_memory_size_bytes": in_memory_size_bytes,
+             }
+         )
+
+         if pq_metadata:
+             result["pq_metadata"] = pq_metadata
+
+         return result
+
+     @property
+     def row_groups_to_download(self) -> List[int]:
+         return self["row_groups_to_download"]
+
+     @property
+     def num_row_groups(self) -> List[int]:
+         return self["num_row_groups"]
+
+     @property
+     def num_rows(self) -> int:
+         return self["num_rows"]
+
+     @property
+     def in_memory_size_bytes(self) -> float:
+         return self["in_memory_size_bytes"]
+
+     @property
+     def pq_metadata(self) -> Optional[FileMetaData]:
+         return self.get("pq_metadata")
+
+     @pq_metadata.setter
+     def pq_metadata(self, metadata: FileMetaData) -> None:
+         self["pq_metadata"] = metadata
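The tests in this release exercise the class above roughly as follows: build the parameters from a parquet footer, then narrow row_groups_to_download to read only part of the file. A sketch using the mvp.parquet fixture referenced elsewhere in this diff:

from pyarrow import parquet as pq

from deltacat.types.partial_download import PartialParquetParameters

# Derive partial-download parameters from the parquet footer metadata.
metadata = pq.read_metadata("deltacat/tests/utils/data/mvp.parquet")
params = PartialParquetParameters.of(pq_metadata=metadata)

# Restrict the download to the first row group only.
params["row_groups_to_download"] = params.row_groups_to_download[:1]
print(params.num_row_groups, params.in_memory_size_bytes)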
deltacat/types/tables.py CHANGED
@@ -21,6 +21,7 @@ from deltacat.utils import pyarrow as pa_utils
  from deltacat.utils.ray_utils import dataset as ds_utils

  TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+     TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
      TableType.PYARROW.value: pa_utils.s3_file_to_table,
      TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
      TableType.NUMPY.value: np_utils.s3_file_to_ndarray,
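A small sketch of how the new mapping entry is resolved; the printed name assumes the reader function keeps its s3_file_to_parquet identifier:

from deltacat.types.media import TableType
from deltacat.types.tables import TABLE_TYPE_TO_READER_FUNC

# The new PYARROW_PARQUET table type maps to the parquet reader in
# deltacat.utils.pyarrow rather than the generic pyarrow table reader.
reader = TABLE_TYPE_TO_READER_FUNC[TableType.PYARROW_PARQUET.value]
print(reader.__name__)  # expected: "s3_file_to_parquet"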
deltacat/utils/arguments.py ADDED
@@ -0,0 +1,26 @@
+ import inspect
+ import copy
+ from typing import Any, Dict
+
+
+ def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
+     """
+     This method removes any unsupported keyword arguments if variable
+     kwargs are not allowed in the method signature.
+
+     Returns: a sanitized dict of kwargs.
+     """
+     signature = inspect.signature(callable)
+     params = signature.parameters
+
+     new_kwargs = copy.copy(kwargs)
+
+     for key in params:
+         if params[key].kind == inspect.Parameter.VAR_KEYWORD:
+             return kwargs
+
+     for key in kwargs.keys():
+         if key not in params:
+             new_kwargs.pop(key)
+
+     return new_kwargs
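A minimal sketch of the helper above: keyword arguments the callable does not accept are dropped unless the callable takes **kwargs. The read_table function and its arguments here are illustrative only:

from deltacat.utils.arguments import sanitize_kwargs_to_callable


def read_table(path: str, columns=None):
    return path, columns


kwargs = {"columns": ["a"], "max_retries": 3}

# "max_retries" is not a parameter of read_table and read_table has no
# **kwargs, so it is removed before the call.
safe_kwargs = sanitize_kwargs_to_callable(read_table, kwargs)
assert safe_kwargs == {"columns": ["a"]}
read_table("s3://bucket/key.parquet", **safe_kwargs)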
deltacat/utils/daft.py ADDED
@@ -0,0 +1,87 @@
+ import logging
+ from typing import Optional, List
+
+ from daft.table import Table
+ from daft.logical.schema import Schema
+ from daft import TimeUnit
+ from daft.io import IOConfig, S3Config
+ import pyarrow as pa
+
+ from deltacat import logs
+ from deltacat.utils.common import ReadKwargsProvider
+
+ from deltacat.types.media import ContentType, ContentEncoding
+ from deltacat.aws.constants import BOTO_MAX_RETRIES
+ from deltacat.utils.performance import timed_invocation
+
+ from deltacat.types.partial_download import (
+     PartialFileDownloadParams,
+ )
+
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def daft_s3_file_to_table(
+     s3_url: str,
+     content_type: str,
+     content_encoding: str,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+     **s3_client_kwargs,
+ ):
+     assert (
+         content_type == ContentType.PARQUET.value
+     ), f"daft native reader currently only supports parquet, got {content_type}"
+
+     assert (
+         content_encoding == ContentEncoding.IDENTITY.value
+     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+     kwargs = {}
+     if pa_read_func_kwargs_provider is not None:
+         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+     coerce_int96_timestamp_unit = TimeUnit.from_str(
+         kwargs.get("coerce_int96_timestamp_unit", "ms")
+     )
+
+     row_groups = None
+     if (
+         partial_file_download_params is not None
+         and partial_file_download_params.row_groups_to_download is not None
+     ):
+         row_groups = partial_file_download_params.row_groups_to_download
+
+     io_config = IOConfig(
+         s3=S3Config(
+             key_id=s3_client_kwargs.get("aws_access_key_id"),
+             access_key=s3_client_kwargs.get("aws_secret_access_key"),
+             session_token=s3_client_kwargs.get("aws_session_token"),
+             retry_mode="adaptive",
+             num_tries=BOTO_MAX_RETRIES,
+         )
+     )
+
+     table, latency = timed_invocation(
+         Table.read_parquet,
+         path=s3_url,
+         columns=include_columns or column_names,
+         row_groups=row_groups,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+         multithreaded_io=False,
+     )
+
+     logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
+
+     if kwargs.get("schema") is not None:
+         schema = kwargs["schema"]
+         if include_columns is not None:
+             schema = pa.schema([schema.field(col) for col in include_columns])
+         daft_schema = Schema.from_pyarrow_schema(schema)
+         return table.cast_to_schema(daft_schema).to_arrow()
+     else:
+         return table.to_arrow()
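A hedged usage sketch of the reader above; the S3 URL is hypothetical, and credentials, if needed, would be passed through **s3_client_kwargs (aws_access_key_id, aws_secret_access_key, aws_session_token):

from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.daft import daft_s3_file_to_table

# Hypothetical object URL; the function returns a pyarrow.Table via to_arrow().
table = daft_s3_file_to_table(
    "s3://my-bucket/path/to/file.parquet",
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    include_columns=["a", "b"],
)
print(table.num_rows)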
deltacat/utils/performance.py CHANGED
@@ -1,6 +1,8 @@
  import time
  from collections import Counter
- from typing import Any, Callable, Tuple
+ from typing import Any, Callable, Tuple, TypeVar
+
+ T = TypeVar("T")


  def invoke_with_perf_counter(
@@ -15,7 +17,7 @@ def invoke_with_perf_counter(
      return result, latency


- def timed_invocation(func: Callable, *args, **kwargs) -> Tuple[Any, float]:
+ def timed_invocation(func: Callable[[Any], T], *args, **kwargs) -> Tuple[T, float]:

      start = time.perf_counter()
      result = func(*args, **kwargs)
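The TypeVar change is purely a typing improvement: with the new signature a type checker infers the wrapped function's return type instead of Any, e.g.:

from deltacat.utils.performance import timed_invocation


def load_rows(n: int) -> list:
    return list(range(n))


# result is now inferred as list rather than Any; latency is elapsed seconds.
result, latency = timed_invocation(load_rows, 1000)
print(len(result), latency)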
deltacat/utils/placement.py CHANGED
@@ -21,9 +21,10 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  @dataclass
  class PlacementGroupConfig:
-     def __init__(self, opts, resource):
+     def __init__(self, opts, resource, node_ips):
          self.opts = opts
          self.resource = resource
+         self.node_ips = node_ips


  class NodeGroupManager:
@@ -207,6 +208,7 @@ class PlacementGroupManager:
          cpu_per_bundle: int,
          strategy="SPREAD",
          capture_child_tasks=True,
+         memory_per_bundle=None,
      ):
          head_res_key = self.get_current_node_resource_key()
          # run the task on head and consume a fractional cpu, so that pg can be created on non-head node
@@ -216,7 +218,11 @@ class PlacementGroupManager:
          self._pg_configs = ray.get(
              [
                  _config.options(resources={head_res_key: 0.01}).remote(
-                     total_cpus_per_pg, cpu_per_bundle, strategy, capture_child_tasks
+                     total_cpus_per_pg,
+                     cpu_per_bundle,
+                     strategy,
+                     capture_child_tasks,
+                     memory_per_bundle=memory_per_bundle,
                  )
                  for i in range(num_pgs)
              ]
@@ -251,12 +257,18 @@ def _config(
      strategy="SPREAD",
      capture_child_tasks=True,
      time_out: Optional[float] = None,
+     memory_per_bundle: Optional[float] = None,
  ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
      pg_config = None
      opts = {}
      cluster_resources = {}
      num_bundles = (int)(total_cpus_per_pg / cpu_per_node)
      bundles = [{"CPU": cpu_per_node} for i in range(num_bundles)]
+
+     if memory_per_bundle:
+         for bundle in bundles:
+             bundle["memory"] = memory_per_bundle
+
      pg = placement_group(bundles, strategy=strategy)
      ray.get(pg.ready(), timeout=time_out)
      if not pg:
@@ -275,6 +287,7 @@ def _config(
      # query available resources given list of node id
      all_nodes_available_res = ray._private.state.state._available_resources_per_node()
      pg_res = {"CPU": 0, "memory": 0, "object_store_memory": 0}
+     node_ips = []
      for node_id in node_ids:
          if node_id in all_nodes_available_res:
              v = all_nodes_available_res[node_id]
@@ -282,10 +295,14 @@ def _config(
              pg_res["CPU"] += node_detail["resources_total"]["CPU"]
              pg_res["memory"] += v["memory"]
              pg_res["object_store_memory"] += v["object_store_memory"]
+             node_ips.append(node_detail["node_ip"])
      cluster_resources["CPU"] = int(pg_res["CPU"])
      cluster_resources["memory"] = float(pg_res["memory"])
      cluster_resources["object_store_memory"] = float(pg_res["object_store_memory"])
-     pg_config = PlacementGroupConfig(opts, cluster_resources)
+     pg_config = PlacementGroupConfig(
+         opts=opts, resource=cluster_resources, node_ips=node_ips
+     )
      logger.info(f"pg has resources:{cluster_resources}")
+     logger.debug(f"pg has node ips:{node_ips}")

      return pg_config
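The new memory_per_bundle option flows from PlacementGroupManager into _config, where each CPU bundle optionally reserves memory as well. A standalone sketch of that bundle construction (values are illustrative, not taken from this diff):

# Mirrors the bundle construction in _config above.
cpu_per_node = 32
num_bundles = 4
memory_per_bundle = 16 * 1024**3  # illustrative: 16 GiB per bundle

bundles = [{"CPU": cpu_per_node} for _ in range(num_bundles)]
if memory_per_bundle:
    for bundle in bundles:
        bundle["memory"] = memory_per_bundle

# bundles would then be passed to ray.util.placement_group(bundles, strategy="SPREAD").
print(bundles[0])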