deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/storage/model/table_version.py
CHANGED

```diff
@@ -1,25 +1,27 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Any, Dict, List, Optional, Union
+
 import pyarrow as pa
 
+from deltacat.storage.model.locator import Locator
 from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.table import TableLocator
-from deltacat.storage.model.locator import Locator
 from deltacat.types.media import ContentType
 
-from typing import Any, Dict, List, Optional, Union
-
 
 class TableVersion(dict):
     @staticmethod
-    def of(locator: Optional[TableVersionLocator],
-           schema: Optional[Union[pa.Schema, str, bytes]],
-           partition_keys: Optional[List[Dict[str, Any]]] = None,
-           primary_key_columns: Optional[List[str]] = None,
-           description: Optional[str] = None,
-           properties: Optional[Dict[str, str]] = None,
-           content_types: Optional[List[ContentType]] = None) -> TableVersion:
+    def of(
+        locator: Optional[TableVersionLocator],
+        schema: Optional[Union[pa.Schema, str, bytes]],
+        partition_keys: Optional[List[Dict[str, Any]]] = None,
+        primary_key_columns: Optional[List[str]] = None,
+        description: Optional[str] = None,
+        properties: Optional[Dict[str, str]] = None,
+        content_types: Optional[List[ContentType]] = None,
+    ) -> TableVersion:
         table_version = TableVersion()
         table_version.locator = locator
         table_version.schema = schema
@@ -38,9 +40,7 @@ class TableVersion(dict):
         return val
 
     @locator.setter
-    def locator(
-            self,
-            table_version_locator: Optional[TableVersionLocator]) -> None:
+    def locator(self, table_version_locator: Optional[TableVersionLocator]) -> None:
         self["tableVersionLocator"] = table_version_locator
 
     @property
@@ -56,9 +56,7 @@ class TableVersion(dict):
         return self.get("partitionKeys")
 
     @partition_keys.setter
-    def partition_keys(
-            self,
-            partition_keys: Optional[List[Dict[str, Any]]]) -> None:
+    def partition_keys(self, partition_keys: Optional[List[Dict[str, Any]]]) -> None:
         self["partitionKeys"] = partition_keys
 
     @property
@@ -88,13 +86,14 @@ class TableVersion(dict):
     @property
     def content_types(self) -> Optional[List[ContentType]]:
         content_types = self.get("contentTypes")
-        return None if content_types is None else \
-            [None if _ is None else ContentType(_) for _ in content_types]
+        return (
+            None
+            if content_types is None
+            else [None if _ is None else ContentType(_) for _ in content_types]
+        )
 
     @content_types.setter
-    def content_types(
-            self,
-            content_types: Optional[List[ContentType]]) -> None:
+    def content_types(self, content_types: Optional[List[ContentType]]) -> None:
         self["contentTypes"] = content_types
 
     @property
@@ -132,27 +131,29 @@ class TableVersion(dict):
             return table_version_locator.table_version
         return None
 
-    def is_supported_content_type(
-            self,
-            content_type: ContentType):
+    def is_supported_content_type(self, content_type: ContentType):
         supported_content_types = self.content_types
-        return (not supported_content_types) or \
-            (content_type in supported_content_types)
+        return (not supported_content_types) or (
+            content_type in supported_content_types
+        )
 
 
 class TableVersionLocator(Locator, dict):
     @staticmethod
-    def of(table_locator: Optional[TableLocator],
-           table_version: Optional[str]) -> TableVersionLocator:
+    def of(
+        table_locator: Optional[TableLocator], table_version: Optional[str]
+    ) -> TableVersionLocator:
         table_version_locator = TableVersionLocator()
         table_version_locator.table_locator = table_locator
         table_version_locator.table_version = table_version
         return table_version_locator
 
     @staticmethod
-    def at(namespace: Optional[str],
-           table_name: Optional[str],
-           table_version: Optional[str]) -> TableVersionLocator:
+    def at(
+        namespace: Optional[str],
+        table_name: Optional[str],
+        table_version: Optional[str],
+    ) -> TableVersionLocator:
         table_locator = TableLocator.at(namespace, table_name)
         return TableVersionLocator.of(table_locator, table_version)
 
```
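For orientation, here is a minimal sketch of how the factories above compose, using only the signatures visible in this diff; the namespace, table, and schema values are illustrative, not taken from the package:

```python
import pyarrow as pa

from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
from deltacat.types.media import ContentType

# Locator for version "1" of a hypothetical table (names are examples).
locator = TableVersionLocator.at("my_namespace", "my_table", "1")

# TableVersion is a dict subclass; of() fills it in through the property
# setters shown in the diff above.
table_version = TableVersion.of(
    locator,
    schema=pa.schema([("id", pa.int64()), ("value", pa.string())]),
    primary_key_columns=["id"],
)

# content_types was left unset, so every content type counts as supported.
assert table_version.is_supported_content_type(ContentType.CSV)
```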
deltacat/storage/model/types.py
CHANGED
deltacat/tests/stats/test_intervals.py
CHANGED

```diff
@@ -1,38 +1,30 @@
 import unittest
 from typing import Tuple
 
-from deltacat.compute.stats.utils.intervals import merge_intervals, DeltaRange
+from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
 
 
 class TestMergeIntervals(unittest.TestCase):
     def test_unbounded_start_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (None, 15), (13, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (None, 15), (13, 30)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], None)
         self.assertEqual(interval[1], 30)
 
     def test_unbounded_end_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (2, None), (13, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (2, None), (13, 30)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], 2)
         self.assertEqual(interval[1], None)
 
     def test_unbounded_start_end_range(self):
-        intervals = sorted(merge_intervals(
-            {(None, None)}
-        ))
+        intervals = sorted(merge_intervals({(None, None)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], None)
         self.assertEqual(interval[1], None)
 
     def test_no_overlap_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (11, 14), (19, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (11, 14), (19, 30)}))
         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
         interval2: Tuple[DeltaRange, DeltaRange] = intervals[1]
         interval3: Tuple[DeltaRange, DeltaRange] = intervals[2]
@@ -41,22 +33,17 @@ class TestMergeIntervals(unittest.TestCase):
         self.assertEqual(interval3, (19, 30))
 
     def test_overlap_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (9, 14), (14, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (9, 14), (14, 30)}))
         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval1, (3, 30))
 
     def test_invalid_range(self):
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), (9, 3)}
-                          )
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), (9, 3)})
 
     def test_invalid_type(self):
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), (1.2, 3)})
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), ("1", 3)})
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), (1.2, 3)})
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), ("1", 3)})
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
```
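Taken together, these tests pin down the merge_intervals contract: overlapping or touching (start, end) tuples coalesce, None is an open bound, and inverted or non-integer bounds raise ValueError. A condensed sketch of the calls the tests exercise:

```python
from deltacat.compute.stats.utils.intervals import merge_intervals

# Touching ranges coalesce into a single interval.
assert sorted(merge_intervals({(3, 9), (9, 14), (14, 30)})) == [(3, 30)]

# None acts as an unbounded endpoint and absorbs overlapping ranges.
assert sorted(merge_intervals({(3, 9), (None, 15), (13, 30)})) == [(None, 30)]

# Inverted bounds are rejected.
try:
    merge_intervals({(3, 9), (9, 3)})
except ValueError:
    pass  # expected, per test_invalid_range above
```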
deltacat/tests/utils/__init__.py
File without changes
deltacat/tests/utils/test_record_batch_tables.py
ADDED

The entire 284-line test module is new (@@ -0,0 +1,284 @@):

```python
import unittest

import pyarrow as pa

from deltacat.utils.pyarrow import RecordBatchTables


class TestRecordBatchTables(unittest.TestCase):
    def setUp(self) -> None:
        self.column_names = ["pk", "sk"]

    def test_single_table_with_batches_and_remainder(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertEqual(bt.batched_record_count, 8)
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_single_table_with_no_remainder(self):
        min_records_batch = 5
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(min_records_batch)])
        col2 = pa.array(["test"] * min_records_batch)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertFalse(bt.has_remaining())
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_single_table_with_only_batches(self):
        min_records_batch = 10
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(min_records_batch)])
        col2 = pa.array(["test"] * min_records_batch)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertFalse(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 10)
        self.assertEqual(bt.remaining_record_count, 0)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_single_table_with_only_remainder(self):
        min_records_batch = 11
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertFalse(bt.has_batches())
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 0)
        self.assertEqual(bt.remaining_record_count, 10)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_grouped_tables_with_only_remainder(self):
        min_records_batch = 600
        test_table_num_records = 100
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * test_table_num_records),
                ],
                names=self.column_names,
            )
            for i in range(5)
        ]

        bt = RecordBatchTables(min_records_batch)
        for table in grouped_tables:
            bt.append(table)
        self.assertFalse(bt.has_batches())
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 500)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_grouped_tables_with_batches_and_remainder(self):
        min_records_batch = 450
        test_table_num_records = 100
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * 100),
                ],
                names=self.column_names,
            )
            for i in range(5)
        ]

        bt = RecordBatchTables(min_records_batch)
        for table in grouped_tables:
            bt.append(table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 450)
        self.assertEqual(bt.remaining_record_count, 50)
        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_grouped_tables_with_smaller_batch_size_than_table_records(self):
        min_records_batch = 5
        test_table_num_records = 39
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * test_table_num_records),
                ],
                names=self.column_names,
            )
            for i in range(3)
        ]

        bt = RecordBatchTables(min_records_batch)
        for table in grouped_tables:
            bt.append(table)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 115)
        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_batched_tables_factory_from_input_tables(self):
        min_records_batch = 5
        test_table_num_records = 39
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * test_table_num_records),
                ],
                names=self.column_names,
            )
            for i in range(3)
        ]
        bt = RecordBatchTables.from_tables(grouped_tables, min_records_batch)
        self.assertTrue(type(bt), RecordBatchTables)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 115)
        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_clear(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 8)

        bt.clear_batches()
        self.assertFalse(bt.has_batches())
        self.assertEqual(bt.batched_record_count, 0)

    def test_append_after_clear(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 8)
        prev_remainder_records = bt.remaining_record_count
        self.assertEqual(bt.remaining_record_count, 2)

        bt.clear_batches()
        self.assertFalse(bt.has_batches())
        self.assertEqual(bt.batched_record_count, 0)

        col1 = pa.array([i for i in range(10, 20)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)

        self.assertEqual(bt.batched_record_count, 8)
        self.assertEqual(bt.remaining_record_count, 4)
        self.assertNotEquals(prev_remainder_records, bt.remaining_record_count)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

        bt.clear_remaining()
        self.assertFalse(bt.has_remaining())
        self.assertTrue(bt.remaining_record_count == 0)

    def test_evict(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 8)
        self.assertEqual(bt.remaining_record_count, 2)
        prev_batched_records = bt.batched_record_count

        evicted_tables = bt.evict()
        self.assertFalse(bt.has_batches())
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 0)
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertEqual(sum([len(t) for t in evicted_tables]), prev_batched_records)


def _is_sorted(batched_tables: RecordBatchTables, sort_key: str):
    merged_table = pa.concat_tables(
        [*batched_tables.batched, *batched_tables.remaining]
    )
    explicitly_sorted_merged_table = merged_table.sort_by([(sort_key, "ascending")])
    return explicitly_sorted_merged_table == merged_table


def _is_gte_batch_size_and_divisible(
    batched_tables: RecordBatchTables, min_records_batch: int
):
    return all(
        [
            len(table) // min_records_batch > 0 and len(table) % min_records_batch == 0
            for table in batched_tables.batched
        ]
    )


if __name__ == "__main__":
    unittest.main()
```
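The new module doubles as usage documentation for the RecordBatchTables helper added to deltacat.utils.pyarrow: append() splits sorted input into batched tables (record counts in multiples of the configured minimum) plus a remainder, and evict() drains the batched side. A condensed sketch of that lifecycle, mirroring the assertions above:

```python
import pyarrow as pa

from deltacat.utils.pyarrow import RecordBatchTables

bt = RecordBatchTables(8)  # batch records in multiples of 8
table = pa.Table.from_arrays(
    [pa.array(list(range(10))), pa.array(["test"] * 10)],
    names=["pk", "sk"],
)

bt.append(table)
assert bt.batched_record_count == 8  # one full batch of 8 records
assert bt.remaining_record_count == 2  # 2 records held for the next append

evicted = bt.evict()  # drains the batched tables; the remainder stays put
assert sum(len(t) for t in evicted) == 8
assert bt.remaining_record_count == 2
```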
deltacat/types/media.py
CHANGED

```diff
@@ -1,6 +1,5 @@
 from enum import Enum
-
-from typing import Set, Dict
+from typing import Dict, Set
 
 
 class ContentType(str, Enum):
@@ -57,7 +56,7 @@ DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
     ContentType.CSV.value,
-    ContentType.PSV.value
+    ContentType.PSV.value,
 }
 
 TABULAR_CONTENT_TYPES: Set[str] = {
@@ -75,7 +74,7 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.TSV.value,
     ContentType.CSV.value,
     ContentType.PSV.value,
-    ContentType.JSON.value
+    ContentType.JSON.value,
 }
 
 CONTENT_TYPE_TO_USER_KWARGS_KEY: Dict[str, str] = {
```
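Only trailing commas and import order change here, so set membership is untouched. Two properties these sets rely on, shown as a small sketch (the assertions mirror what other modules in this diff assume):

```python
from deltacat.types.media import ContentType, DELIMITED_TEXT_CONTENT_TYPES

# ContentType is a str-valued enum, so members round-trip through their
# values; table_version.py above depends on this via ContentType(_).
assert ContentType(ContentType.CSV.value) is ContentType.CSV

# Membership checks use the .value strings collected in these sets.
assert ContentType.PSV.value in DELIMITED_TEXT_CONTENT_TYPES
```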
deltacat/types/tables.py
CHANGED

```diff
@@ -1,28 +1,34 @@
 from enum import Enum
-from typing import
+from typing import Callable, Dict, Type, Union
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import deltacat.storage as dcs
-
 from ray.data.dataset import Dataset
-from ray.data.read_api import \
-    from_arrow, from_arrow_refs, from_numpy, from_pandas, from_pandas_refs
+from ray.data.read_api import (
+    from_arrow,
+    from_arrow_refs,
+    from_numpy,
+    from_pandas,
+    from_pandas_refs,
+)
 
+import deltacat.storage as dcs
 from deltacat.types.media import TableType
-from deltacat.utils import \
-    numpy as np_utils, pandas as pd_utils, pyarrow as pa_utils
+from deltacat.utils import numpy as np_utils
+from deltacat.utils import pandas as pd_utils
+from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
-    TableType.NUMPY.value: np_utils.s3_file_to_ndarray
+    TableType.NUMPY.value: np_utils.s3_file_to_ndarray,
 }
 
 TABLE_CLASS_TO_WRITER_FUNC: Dict[
-        Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
@@ -30,7 +36,8 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 }
 
 TABLE_CLASS_TO_SLICER_FUNC: Dict[
-        Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
     np.ndarray: np_utils.slice_ndarray,
@@ -38,7 +45,8 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 }
 
 TABLE_CLASS_TO_SIZE_FUNC: Dict[
-        Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.table_size,
     pd.DataFrame: pd_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
@@ -77,6 +85,7 @@ class TableWriteMode(str, Enum):
     Updates or inserts records based on the table's primary and sort keys by
     default.
     """
+
     AUTO = "auto"
     CREATE = "create"
     APPEND = "append"
@@ -84,26 +93,27 @@ class TableWriteMode(str, Enum):
     MERGE = "merge"
 
 
-def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> int:
+def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
     return len(table) if not isinstance(table, Dataset) else table.count()
 
 
-def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> Callable:
+def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
     table_writer_func = TABLE_CLASS_TO_WRITER_FUNC.get(type(table))
     if table_writer_func is None:
-        msg = f"No writer found for table type: {type(table)}.\n" \
-              f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
+        msg = (
+            f"No writer found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
+        )
         raise ValueError(msg)
     return table_writer_func
 
 
-def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> Callable:
+def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
     table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
     if table_slicer_func is None:
-        msg = f"No slicer found for table type: {type(table)}.\n" \
-              f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
+        msg = (
+            f"No slicer found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
+        )
         raise ValueError(msg)
     return table_slicer_func
```
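The lookup tables above drive simple type-based dispatch: get_table_length(), get_table_writer(), and get_table_slicer() resolve a handler from the concrete table class and raise ValueError for unknown types. A minimal sketch of that dispatch (local pa.Table only; the resolved writer and slicer functions take more arguments than shown here):

```python
import pyarrow as pa

from deltacat.types.tables import get_table_length, get_table_slicer

table = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["col"])

# len() for local tables; ray Datasets go through Dataset.count() instead.
assert get_table_length(table) == 3

# Resolves pa.Table to deltacat.utils.pyarrow.slice_table via the mapping.
slicer = get_table_slicer(table)

# Anything outside the mapping raises the ValueError built above.
try:
    get_table_slicer({"not": "a table"})
except ValueError:
    pass
```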
deltacat/utils/common.py
CHANGED

```diff
@@ -1,10 +1,9 @@
 import hashlib
-
-import time
 import os
-
+import time
 from typing import Any, Dict
 
+
 def env_bool(key: str, default: bool) -> int:
     if key in os.environ:
         return bool(os.environ[key])
@@ -44,16 +43,11 @@ class ContentTypeKwargsProvider:
     as input, and returns finalized keyword args as output. Useful for merging
     content-type-specific keyword arguments into an existing fixed dictionary
     of keyword arguments."""
-
-    def _get_kwargs(self,
-                    content_type: str,
-                    kwargs: Dict[str, Any]) -> Dict[str, Any]:
+
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         raise NotImplementedError
 
-    def __call__(
-            self,
-            content_type: str,
-            kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def __call__(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         return self._get_kwargs(content_type, kwargs)
 
 
```
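ContentTypeKwargsProvider is a small callable-object hook: subclasses implement _get_kwargs(), and instances are invoked like functions through __call__(). A minimal sketch of a custom provider, assuming only the interface shown in this hunk (the delimiter kwarg is illustrative):

```python
from typing import Any, Dict

from deltacat.types.media import ContentType
from deltacat.utils.common import ContentTypeKwargsProvider


class PipeDelimiterProvider(ContentTypeKwargsProvider):
    """Adds a delimiter kwarg for PSV content; passes everything else through."""

    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        if content_type == ContentType.PSV.value:
            return {**kwargs, "delimiter": "|"}
        return kwargs


# Instances are called like functions; __call__ forwards to _get_kwargs().
provider = PipeDelimiterProvider()
assert provider(ContentType.PSV.value, {}) == {"delimiter": "|"}
```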