deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/storage/model/table_version.py CHANGED
@@ -1,25 +1,27 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Any, Dict, List, Optional, Union
+
 import pyarrow as pa
 
+from deltacat.storage.model.locator import Locator
 from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.table import TableLocator
-from deltacat.storage.model.locator import Locator
 from deltacat.types.media import ContentType
 
-from typing import Any, Dict, List, Optional, Union
-
 
 class TableVersion(dict):
     @staticmethod
-    def of(locator: Optional[TableVersionLocator],
-           schema: Optional[Union[pa.Schema, str, bytes]],
-           partition_keys: Optional[List[Dict[str, Any]]] = None,
-           primary_key_columns: Optional[List[str]] = None,
-           description: Optional[str] = None,
-           properties: Optional[Dict[str, str]] = None,
-           content_types: Optional[List[ContentType]] = None) -> TableVersion:
+    def of(
+        locator: Optional[TableVersionLocator],
+        schema: Optional[Union[pa.Schema, str, bytes]],
+        partition_keys: Optional[List[Dict[str, Any]]] = None,
+        primary_key_columns: Optional[List[str]] = None,
+        description: Optional[str] = None,
+        properties: Optional[Dict[str, str]] = None,
+        content_types: Optional[List[ContentType]] = None,
+    ) -> TableVersion:
         table_version = TableVersion()
         table_version.locator = locator
         table_version.schema = schema
@@ -38,9 +40,7 @@ class TableVersion(dict):
         return val
 
     @locator.setter
-    def locator(
-            self,
-            table_version_locator: Optional[TableVersionLocator]) -> None:
+    def locator(self, table_version_locator: Optional[TableVersionLocator]) -> None:
         self["tableVersionLocator"] = table_version_locator
 
     @property
@@ -56,9 +56,7 @@ class TableVersion(dict):
         return self.get("partitionKeys")
 
     @partition_keys.setter
-    def partition_keys(
-            self,
-            partition_keys: Optional[List[Dict[str, Any]]]) -> None:
+    def partition_keys(self, partition_keys: Optional[List[Dict[str, Any]]]) -> None:
         self["partitionKeys"] = partition_keys
 
     @property
@@ -88,13 +86,14 @@ class TableVersion(dict):
     @property
     def content_types(self) -> Optional[List[ContentType]]:
         content_types = self.get("contentTypes")
-        return None if content_types is None else \
-            [None if _ is None else ContentType(_) for _ in content_types]
+        return (
+            None
+            if content_types is None
+            else [None if _ is None else ContentType(_) for _ in content_types]
+        )
 
     @content_types.setter
-    def content_types(
-            self,
-            content_types: Optional[List[ContentType]]) -> None:
+    def content_types(self, content_types: Optional[List[ContentType]]) -> None:
         self["contentTypes"] = content_types
 
     @property
@@ -132,27 +131,29 @@ class TableVersion(dict):
             return table_version_locator.table_version
         return None
 
-    def is_supported_content_type(
-            self,
-            content_type: ContentType):
+    def is_supported_content_type(self, content_type: ContentType):
         supported_content_types = self.content_types
-        return (not supported_content_types) or \
-               (content_type in supported_content_types)
+        return (not supported_content_types) or (
+            content_type in supported_content_types
+        )
 
 
 class TableVersionLocator(Locator, dict):
     @staticmethod
-    def of(table_locator: Optional[TableLocator],
-           table_version: Optional[str]) -> TableVersionLocator:
+    def of(
+        table_locator: Optional[TableLocator], table_version: Optional[str]
+    ) -> TableVersionLocator:
         table_version_locator = TableVersionLocator()
         table_version_locator.table_locator = table_locator
         table_version_locator.table_version = table_version
         return table_version_locator
 
     @staticmethod
-    def at(namespace: Optional[str],
-           table_name: Optional[str],
-           table_version: Optional[str]) -> TableVersionLocator:
+    def at(
+        namespace: Optional[str],
+        table_name: Optional[str],
+        table_version: Optional[str],
+    ) -> TableVersionLocator:
        table_locator = TableLocator.at(namespace, table_name)
        return TableVersionLocator.of(table_locator, table_version)
 
deltacat/storage/model/types.py CHANGED
@@ -49,6 +49,7 @@ class SchemaConsistencyType(str, Enum):
     VALIDATE: Raise an error for any fields that don't fit the schema. An
     explicit subset of column names to validate may optionally be specified.
     """
+
     NONE = "none"
     COERCE = "coerce"
     VALIDATE = "validate"
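This hunk appears to come from deltacat/storage/model/types.py, the only +1/-0 entry in the file list above. Since SchemaConsistencyType subclasses str, its members compare equal to their raw values, as this small sketch illustrates:

from deltacat.storage.model.types import SchemaConsistencyType

# str-backed enum: members are usable anywhere a plain string is expected.
assert SchemaConsistencyType.COERCE == "coerce"
assert SchemaConsistencyType("validate") is SchemaConsistencyType.VALIDATE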
deltacat/tests/stats/test_intervals.py CHANGED
@@ -1,38 +1,30 @@
 import unittest
 from typing import Tuple
 
-from deltacat.compute.stats.utils.intervals import merge_intervals, DeltaRange
+from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
 
 
 class TestMergeIntervals(unittest.TestCase):
     def test_unbounded_start_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (None, 15), (13, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (None, 15), (13, 30)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], None)
         self.assertEqual(interval[1], 30)
 
     def test_unbounded_end_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (2, None), (13, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (2, None), (13, 30)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], 2)
         self.assertEqual(interval[1], None)
 
     def test_unbounded_start_end_range(self):
-        intervals = sorted(merge_intervals(
-            {(None, None)}
-        ))
+        intervals = sorted(merge_intervals({(None, None)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], None)
         self.assertEqual(interval[1], None)
 
     def test_no_overlap_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (11, 14), (19, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (11, 14), (19, 30)}))
         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
         interval2: Tuple[DeltaRange, DeltaRange] = intervals[1]
         interval3: Tuple[DeltaRange, DeltaRange] = intervals[2]
@@ -41,22 +33,17 @@ class TestMergeIntervals(unittest.TestCase):
         self.assertEqual(interval3, (19, 30))
 
     def test_overlap_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (9, 14), (14, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (9, 14), (14, 30)}))
         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval1, (3, 30))
 
     def test_invalid_range(self):
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), (9, 3)}
-                          )
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), (9, 3)})
 
     def test_invalid_type(self):
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), (1.2, 3)})
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), ("1", 3)})
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), (1.2, 3)})
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), ("1", 3)})
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
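The expectations above pin down merge_intervals semantics: None marks an unbounded endpoint, overlapping or touching ranges coalesce, and non-integer endpoint types raise. A small sketch of the same behavior:

from deltacat.compute.stats.utils.intervals import merge_intervals

# (None, 15) has an unbounded start and overlaps (13, 30), so everything
# coalesces into a single (None, 30) interval that also covers (3, 9).
merged = sorted(merge_intervals({(3, 9), (None, 15), (13, 30)}))
assert merged[0] == (None, 30)

# Non-integer endpoints are rejected with ValueError.
try:
    merge_intervals({(3, 9), (1.2, 3)})
except ValueError:
    pass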
deltacat/tests/utils/__init__.py: File without changes
deltacat/tests/utils/test_record_batch_tables.py ADDED
@@ -0,0 +1,284 @@
+import unittest
+
+import pyarrow as pa
+
+from deltacat.utils.pyarrow import RecordBatchTables
+
+
+class TestRecordBatchTables(unittest.TestCase):
+    def setUp(self) -> None:
+        self.column_names = ["pk", "sk"]
+
+    def test_single_table_with_batches_and_remainder(self):
+        min_records_batch = 8
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(10)])
+        col2 = pa.array(["test"] * 10)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertTrue(bt.has_batches())
+        self.assertEqual(bt.batched_record_count, 8)
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.remaining_record_count, 2)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_single_table_with_no_remainder(self):
+        min_records_batch = 5
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(min_records_batch)])
+        col2 = pa.array(["test"] * min_records_batch)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertFalse(bt.has_remaining())
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_single_table_with_only_batches(self):
+        min_records_batch = 10
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(min_records_batch)])
+        col2 = pa.array(["test"] * min_records_batch)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertFalse(bt.has_remaining())
+        self.assertEqual(bt.batched_record_count, 10)
+        self.assertEqual(bt.remaining_record_count, 0)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_single_table_with_only_remainder(self):
+        min_records_batch = 11
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(10)])
+        col2 = pa.array(["test"] * 10)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertFalse(bt.has_batches())
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.batched_record_count, 0)
+        self.assertEqual(bt.remaining_record_count, 10)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_grouped_tables_with_only_remainder(self):
+        min_records_batch = 600
+        test_table_num_records = 100
+        grouped_tables = [
+            pa.Table.from_arrays(
+                [
+                    pa.array(
+                        [
+                            i
+                            for i in range(
+                                i * test_table_num_records,
+                                (i + 1) * test_table_num_records,
+                            )
+                        ]
+                    ),
+                    pa.array(["foo"] * test_table_num_records),
+                ],
+                names=self.column_names,
+            )
+            for i in range(5)
+        ]
+
+        bt = RecordBatchTables(min_records_batch)
+        for table in grouped_tables:
+            bt.append(table)
+        self.assertFalse(bt.has_batches())
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.remaining_record_count, 500)
+        self.assertLess(bt.remaining_record_count, min_records_batch)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_grouped_tables_with_batches_and_remainder(self):
+        min_records_batch = 450
+        test_table_num_records = 100
+        grouped_tables = [
+            pa.Table.from_arrays(
+                [
+                    pa.array(
+                        [
+                            i
+                            for i in range(
+                                i * test_table_num_records,
+                                (i + 1) * test_table_num_records,
+                            )
+                        ]
+                    ),
+                    pa.array(["foo"] * 100),
+                ],
+                names=self.column_names,
+            )
+            for i in range(5)
+        ]
+
+        bt = RecordBatchTables(min_records_batch)
+        for table in grouped_tables:
+            bt.append(table)
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.batched_record_count, 450)
+        self.assertEqual(bt.remaining_record_count, 50)
+        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
+        self.assertLess(bt.remaining_record_count, min_records_batch)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_grouped_tables_with_smaller_batch_size_than_table_records(self):
+        min_records_batch = 5
+        test_table_num_records = 39
+        grouped_tables = [
+            pa.Table.from_arrays(
+                [
+                    pa.array(
+                        [
+                            i
+                            for i in range(
+                                i * test_table_num_records,
+                                (i + 1) * test_table_num_records,
+                            )
+                        ]
+                    ),
+                    pa.array(["foo"] * test_table_num_records),
+                ],
+                names=self.column_names,
+            )
+            for i in range(3)
+        ]
+
+        bt = RecordBatchTables(min_records_batch)
+        for table in grouped_tables:
+            bt.append(table)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertEqual(bt.batched_record_count, 115)
+        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.remaining_record_count, 2)
+        self.assertLess(bt.remaining_record_count, min_records_batch)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_batched_tables_factory_from_input_tables(self):
+        min_records_batch = 5
+        test_table_num_records = 39
+        grouped_tables = [
+            pa.Table.from_arrays(
+                [
+                    pa.array(
+                        [
+                            i
+                            for i in range(
+                                i * test_table_num_records,
+                                (i + 1) * test_table_num_records,
+                            )
+                        ]
+                    ),
+                    pa.array(["foo"] * test_table_num_records),
+                ],
+                names=self.column_names,
+            )
+            for i in range(3)
+        ]
+        bt = RecordBatchTables.from_tables(grouped_tables, min_records_batch)
+        self.assertTrue(type(bt), RecordBatchTables)
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertEqual(bt.batched_record_count, 115)
+        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.remaining_record_count, 2)
+        self.assertLess(bt.remaining_record_count, min_records_batch)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+    def test_clear(self):
+        min_records_batch = 8
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(10)])
+        col2 = pa.array(["test"] * 10)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertEqual(bt.batched_record_count, 8)
+
+        bt.clear_batches()
+        self.assertFalse(bt.has_batches())
+        self.assertEqual(bt.batched_record_count, 0)
+
+    def test_append_after_clear(self):
+        min_records_batch = 8
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(10)])
+        col2 = pa.array(["test"] * 10)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertEqual(bt.batched_record_count, 8)
+        prev_remainder_records = bt.remaining_record_count
+        self.assertEqual(bt.remaining_record_count, 2)
+
+        bt.clear_batches()
+        self.assertFalse(bt.has_batches())
+        self.assertEqual(bt.batched_record_count, 0)
+
+        col1 = pa.array([i for i in range(10, 20)])
+        col2 = pa.array(["test"] * 10)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+
+        self.assertEqual(bt.batched_record_count, 8)
+        self.assertEqual(bt.remaining_record_count, 4)
+        self.assertNotEquals(prev_remainder_records, bt.remaining_record_count)
+        self.assertTrue(_is_sorted(bt, self.column_names[0]))
+
+        bt.clear_remaining()
+        self.assertFalse(bt.has_remaining())
+        self.assertTrue(bt.remaining_record_count == 0)
+
+    def test_evict(self):
+        min_records_batch = 8
+        bt = RecordBatchTables(min_records_batch)
+        col1 = pa.array([i for i in range(10)])
+        col2 = pa.array(["test"] * 10)
+        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
+        bt.append(test_table)
+        self.assertTrue(bt.has_batches())
+        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.batched_record_count, 8)
+        self.assertEqual(bt.remaining_record_count, 2)
+        prev_batched_records = bt.batched_record_count
+
+        evicted_tables = bt.evict()
+        self.assertFalse(bt.has_batches())
+        self.assertTrue(bt.has_remaining())
+        self.assertEqual(bt.batched_record_count, 0)
+        self.assertEqual(bt.remaining_record_count, 2)
+        self.assertEqual(sum([len(t) for t in evicted_tables]), prev_batched_records)
+
+
+def _is_sorted(batched_tables: RecordBatchTables, sort_key: str):
+    merged_table = pa.concat_tables(
+        [*batched_tables.batched, *batched_tables.remaining]
+    )
+    explicitly_sorted_merged_table = merged_table.sort_by([(sort_key, "ascending")])
+    return explicitly_sorted_merged_table == merged_table
+
+
+def _is_gte_batch_size_and_divisible(
+    batched_tables: RecordBatchTables, min_records_batch: int
+):
+    return all(
+        [
+            len(table) // min_records_batch > 0 and len(table) % min_records_batch == 0
+            for table in batched_tables.batched
+        ]
+    )
+
+
+if __name__ == "__main__":
+    unittest.main()
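Read together, these tests describe the RecordBatchTables contract: appended records are regrouped into tables whose lengths are non-zero multiples of the batch size, record order is preserved across the batched and remaining tables, and any shortfall is held as a remainder until later appends or clear_remaining(). A minimal usage sketch derived from the test expectations above:

import pyarrow as pa

from deltacat.utils.pyarrow import RecordBatchTables

# Batch into multiples of 8 records; the 2 leftover records stay pending.
bt = RecordBatchTables(8)
table = pa.Table.from_arrays(
    [pa.array(range(10)), pa.array(["test"] * 10)], names=["pk", "sk"]
)
bt.append(table)
assert bt.batched_record_count == 8 and bt.remaining_record_count == 2

# evict() hands back the fully batched tables and clears them from the
# container; the remainder is retained for future appends.
evicted = bt.evict()
assert sum(len(t) for t in evicted) == 8
assert bt.remaining_record_count == 2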
deltacat/types/media.py CHANGED
@@ -1,6 +1,5 @@
 from enum import Enum
-
-from typing import Set, Dict
+from typing import Dict, Set
 
 
 class ContentType(str, Enum):
@@ -57,7 +56,7 @@ DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
     ContentType.CSV.value,
-    ContentType.PSV.value
+    ContentType.PSV.value,
 }
 
 TABULAR_CONTENT_TYPES: Set[str] = {
@@ -75,7 +74,7 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.TSV.value,
     ContentType.CSV.value,
     ContentType.PSV.value,
-    ContentType.JSON.value
+    ContentType.JSON.value,
 }
 
 CONTENT_TYPE_TO_USER_KWARGS_KEY: Dict[str, str] = {
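These registries key on the string values of the ContentType enum rather than the enum members themselves, so membership checks go through .value, e.g.:

from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType

# PSV is one of the delimited-text content types listed in the hunk above.
assert ContentType.PSV.value in DELIMITED_TEXT_CONTENT_TYPES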
deltacat/types/tables.py CHANGED
@@ -1,28 +1,34 @@
 from enum import Enum
-from typing import Dict, Callable, Type, Union
+from typing import Callable, Dict, Type, Union
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import deltacat.storage as dcs
-
 from ray.data.dataset import Dataset
-from ray.data.read_api import from_arrow_refs, from_pandas_refs, from_numpy, \
-    from_arrow, from_pandas
+from ray.data.read_api import (
+    from_arrow,
+    from_arrow_refs,
+    from_numpy,
+    from_pandas,
+    from_pandas_refs,
+)
 
+import deltacat.storage as dcs
 from deltacat.types.media import TableType
-from deltacat.utils import pyarrow as pa_utils, pandas as pd_utils, \
-    numpy as np_utils
+from deltacat.utils import numpy as np_utils
+from deltacat.utils import pandas as pd_utils
+from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
-    TableType.NUMPY.value: np_utils.s3_file_to_ndarray
+    TableType.NUMPY.value: np_utils.s3_file_to_ndarray,
 }
 
 TABLE_CLASS_TO_WRITER_FUNC: Dict[
-    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
@@ -30,7 +36,8 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 }
 
 TABLE_CLASS_TO_SLICER_FUNC: Dict[
-    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
     np.ndarray: np_utils.slice_ndarray,
@@ -38,7 +45,8 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 }
 
 TABLE_CLASS_TO_SIZE_FUNC: Dict[
-    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.table_size,
     pd.DataFrame: pd_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
@@ -77,6 +85,7 @@ class TableWriteMode(str, Enum):
     Updates or inserts records based on the table's primary and sort keys by
     default.
     """
+
     AUTO = "auto"
     CREATE = "create"
     APPEND = "append"
@@ -84,26 +93,27 @@
     MERGE = "merge"
 
 
-def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> int:
+def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
     return len(table) if not isinstance(table, Dataset) else table.count()
 
 
-def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> Callable:
+def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
     table_writer_func = TABLE_CLASS_TO_WRITER_FUNC.get(type(table))
     if table_writer_func is None:
-        msg = f"No writer found for table type: {type(table)}.\n" \
-              f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
+        msg = (
+            f"No writer found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
+        )
         raise ValueError(msg)
     return table_writer_func
 
 
-def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> Callable:
+def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
     table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
     if table_slicer_func is None:
-        msg = f"No slicer found for table type: {type(table)}.\n" \
-              f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
+        msg = (
+            f"No slicer found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
+        )
         raise ValueError(msg)
     return table_slicer_func
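The registries above drive simple type-based dispatch: get_table_writer and get_table_slicer look up the concrete table class and fail loudly for unregistered types. A short sketch using only the functions shown in this hunk:

import pyarrow as pa

from deltacat.types.tables import get_table_length, get_table_writer

table = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["pk"])
assert get_table_length(table) == 3

# Resolves to pa_utils.table_to_file for pyarrow Tables; an unregistered
# type would raise ValueError with the known-types message above.
writer = get_table_writer(table)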
deltacat/utils/common.py CHANGED
@@ -1,10 +1,9 @@
 import hashlib
-
-import time
 import os
-
+import time
 from typing import Any, Dict
 
+
 def env_bool(key: str, default: bool) -> int:
     if key in os.environ:
         return bool(os.environ[key])
@@ -44,16 +43,11 @@ class ContentTypeKwargsProvider:
     as input, and returns finalized keyword args as output. Useful for merging
     content-type-specific keyword arguments into an existing fixed dictionary
     of keyword arguments."""
-    def _get_kwargs(
-            self,
-            content_type: str,
-            kwargs: Dict[str, Any]) -> Dict[str, Any]:
+
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         raise NotImplementedError
 
-    def __call__(
-            self,
-            content_type: str,
-            kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def __call__(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         return self._get_kwargs(content_type, kwargs)
 
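ContentTypeKwargsProvider is a callable hook: subclasses implement _get_kwargs and instances are invoked with a content type plus the current kwargs. A hypothetical subclass for illustration (the delimiter logic below is an assumption, not from deltacat):

from typing import Any, Dict

from deltacat.types.media import ContentType
from deltacat.utils.common import ContentTypeKwargsProvider


class PsvDelimiterKwargsProvider(ContentTypeKwargsProvider):
    """Hypothetical provider that injects a '|' delimiter for PSV content."""

    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        if content_type == ContentType.PSV.value:
            kwargs["delimiter"] = "|"
        return kwargs


kwargs = PsvDelimiterKwargsProvider()(ContentType.PSV.value, {})
assert kwargs == {"delimiter": "|"}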