deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/storage/model/table_version.py
CHANGED

```diff
@@ -1,25 +1,27 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Any, Dict, List, Optional, Union
+
 import pyarrow as pa
 
+from deltacat.storage.model.locator import Locator
 from deltacat.storage.model.namespace import NamespaceLocator
 from deltacat.storage.model.table import TableLocator
-from deltacat.storage.model.locator import Locator
 from deltacat.types.media import ContentType
 
-from typing import Any, Dict, List, Optional, Union
-
 
 class TableVersion(dict):
     @staticmethod
-    def of(locator: Optional[TableVersionLocator],
-           schema: Optional[Union[pa.Schema, str, bytes]],
-           partition_keys: Optional[List[Dict[str, Any]]] = None,
-           primary_key_columns: Optional[List[str]] = None,
-           description: Optional[str] = None,
-           properties: Optional[Dict[str, str]] = None,
-           content_types: Optional[List[ContentType]] = None) -> TableVersion:
+    def of(
+        locator: Optional[TableVersionLocator],
+        schema: Optional[Union[pa.Schema, str, bytes]],
+        partition_keys: Optional[List[Dict[str, Any]]] = None,
+        primary_key_columns: Optional[List[str]] = None,
+        description: Optional[str] = None,
+        properties: Optional[Dict[str, str]] = None,
+        content_types: Optional[List[ContentType]] = None,
+    ) -> TableVersion:
         table_version = TableVersion()
         table_version.locator = locator
         table_version.schema = schema
@@ -38,9 +40,7 @@ class TableVersion(dict):
         return val
 
     @locator.setter
-    def locator(
-            self,
-            table_version_locator: Optional[TableVersionLocator]) -> None:
+    def locator(self, table_version_locator: Optional[TableVersionLocator]) -> None:
         self["tableVersionLocator"] = table_version_locator
 
     @property
@@ -56,9 +56,7 @@ class TableVersion(dict):
         return self.get("partitionKeys")
 
     @partition_keys.setter
-    def partition_keys(
-            self,
-            partition_keys: Optional[List[Dict[str, Any]]]) -> None:
+    def partition_keys(self, partition_keys: Optional[List[Dict[str, Any]]]) -> None:
         self["partitionKeys"] = partition_keys
 
     @property
@@ -88,13 +86,14 @@ class TableVersion(dict):
     @property
     def content_types(self) -> Optional[List[ContentType]]:
         content_types = self.get("contentTypes")
-        return None if content_types is None else \
-            [None if _ is None else ContentType(_) for _ in content_types]
+        return (
+            None
+            if content_types is None
+            else [None if _ is None else ContentType(_) for _ in content_types]
+        )
 
     @content_types.setter
-    def content_types(
-            self,
-            content_types: Optional[List[ContentType]]) -> None:
+    def content_types(self, content_types: Optional[List[ContentType]]) -> None:
         self["contentTypes"] = content_types
 
     @property
@@ -132,27 +131,29 @@ class TableVersion(dict):
             return table_version_locator.table_version
         return None
 
-    def is_supported_content_type(
-            self,
-            content_type: ContentType):
+    def is_supported_content_type(self, content_type: ContentType):
         supported_content_types = self.content_types
-        return (not supported_content_types) or \
-            (content_type in supported_content_types)
+        return (not supported_content_types) or (
+            content_type in supported_content_types
+        )
 
 
 class TableVersionLocator(Locator, dict):
     @staticmethod
-    def of(table_locator: Optional[TableLocator],
-           table_version: Optional[str]) -> TableVersionLocator:
+    def of(
+        table_locator: Optional[TableLocator], table_version: Optional[str]
+    ) -> TableVersionLocator:
         table_version_locator = TableVersionLocator()
         table_version_locator.table_locator = table_locator
         table_version_locator.table_version = table_version
         return table_version_locator
 
     @staticmethod
-    def at(namespace: Optional[str],
-           table_name: Optional[str],
-           table_version: Optional[str]) -> TableVersionLocator:
+    def at(
+        namespace: Optional[str],
+        table_name: Optional[str],
+        table_version: Optional[str],
+    ) -> TableVersionLocator:
         table_locator = TableLocator.at(namespace, table_name)
         return TableVersionLocator.of(table_locator, table_version)
 
```
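For orientation, here is a minimal sketch of how the factories above compose, using only the signatures visible in this diff; the namespace, table, and schema values are illustrative, not taken from the package:

```python
import pyarrow as pa

from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
from deltacat.types.media import ContentType

# Locator for version "1" of a hypothetical table (names are examples).
locator = TableVersionLocator.at("my_namespace", "my_table", "1")

# TableVersion is a dict subclass; of() fills it in through the property
# setters shown in the diff above.
table_version = TableVersion.of(
    locator,
    schema=pa.schema([("id", pa.int64()), ("value", pa.string())]),
    primary_key_columns=["id"],
)

# content_types was left unset, so every content type counts as supported.
assert table_version.is_supported_content_type(ContentType.CSV)
```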
deltacat/storage/model/types.py
CHANGED
deltacat/tests/stats/test_intervals.py
CHANGED

```diff
@@ -1,38 +1,30 @@
 import unittest
 from typing import Tuple
 
-from deltacat.compute.stats.utils.intervals import merge_intervals, DeltaRange
+from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
 
 
 class TestMergeIntervals(unittest.TestCase):
     def test_unbounded_start_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (None, 15), (13, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (None, 15), (13, 30)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], None)
         self.assertEqual(interval[1], 30)
 
     def test_unbounded_end_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (2, None), (13, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (2, None), (13, 30)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], 2)
         self.assertEqual(interval[1], None)
 
     def test_unbounded_start_end_range(self):
-        intervals = sorted(merge_intervals(
-            {(None, None)}
-        ))
+        intervals = sorted(merge_intervals({(None, None)}))
         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval[0], None)
         self.assertEqual(interval[1], None)
 
     def test_no_overlap_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (11, 14), (19, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (11, 14), (19, 30)}))
         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
         interval2: Tuple[DeltaRange, DeltaRange] = intervals[1]
         interval3: Tuple[DeltaRange, DeltaRange] = intervals[2]
@@ -41,22 +33,17 @@ class TestMergeIntervals(unittest.TestCase):
         self.assertEqual(interval3, (19, 30))
 
     def test_overlap_range(self):
-        intervals = sorted(merge_intervals(
-            {(3, 9), (9, 14), (14, 30)}
-        ))
+        intervals = sorted(merge_intervals({(3, 9), (9, 14), (14, 30)}))
         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
         self.assertEqual(interval1, (3, 30))
 
     def test_invalid_range(self):
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), (9, 3)}
-                          )
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), (9, 3)})
 
     def test_invalid_type(self):
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), (1.2, 3)})
-        self.assertRaises(ValueError, merge_intervals,
-                          {(3, 9), ("1", 3)})
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), (1.2, 3)})
+        self.assertRaises(ValueError, merge_intervals, {(3, 9), ("1", 3)})
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
```
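Taken together, these tests pin down the merge_intervals contract: overlapping or touching (start, end) tuples coalesce, None is an open bound, and inverted or non-integer bounds raise ValueError. A condensed sketch of the calls the tests exercise:

```python
from deltacat.compute.stats.utils.intervals import merge_intervals

# Touching ranges coalesce into a single interval.
assert sorted(merge_intervals({(3, 9), (9, 14), (14, 30)})) == [(3, 30)]

# None acts as an unbounded endpoint and absorbs overlapping ranges.
assert sorted(merge_intervals({(3, 9), (None, 15), (13, 30)})) == [(None, 30)]

# Inverted bounds are rejected.
try:
    merge_intervals({(3, 9), (9, 3)})
except ValueError:
    pass  # expected, per test_invalid_range above
```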
deltacat/tests/utils/__init__.py
File without changes
deltacat/tests/utils/test_record_batch_tables.py
ADDED

The entire 284-line test module is new (@@ -0,0 +1,284 @@):

```python
import unittest

import pyarrow as pa

from deltacat.utils.pyarrow import RecordBatchTables


class TestRecordBatchTables(unittest.TestCase):
    def setUp(self) -> None:
        self.column_names = ["pk", "sk"]

    def test_single_table_with_batches_and_remainder(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertEqual(bt.batched_record_count, 8)
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_single_table_with_no_remainder(self):
        min_records_batch = 5
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(min_records_batch)])
        col2 = pa.array(["test"] * min_records_batch)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertFalse(bt.has_remaining())
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_single_table_with_only_batches(self):
        min_records_batch = 10
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(min_records_batch)])
        col2 = pa.array(["test"] * min_records_batch)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertFalse(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 10)
        self.assertEqual(bt.remaining_record_count, 0)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_single_table_with_only_remainder(self):
        min_records_batch = 11
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertFalse(bt.has_batches())
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 0)
        self.assertEqual(bt.remaining_record_count, 10)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_grouped_tables_with_only_remainder(self):
        min_records_batch = 600
        test_table_num_records = 100
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * test_table_num_records),
                ],
                names=self.column_names,
            )
            for i in range(5)
        ]

        bt = RecordBatchTables(min_records_batch)
        for table in grouped_tables:
            bt.append(table)
        self.assertFalse(bt.has_batches())
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 500)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_grouped_tables_with_batches_and_remainder(self):
        min_records_batch = 450
        test_table_num_records = 100
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * 100),
                ],
                names=self.column_names,
            )
            for i in range(5)
        ]

        bt = RecordBatchTables(min_records_batch)
        for table in grouped_tables:
            bt.append(table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 450)
        self.assertEqual(bt.remaining_record_count, 50)
        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_grouped_tables_with_smaller_batch_size_than_table_records(self):
        min_records_batch = 5
        test_table_num_records = 39
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * test_table_num_records),
                ],
                names=self.column_names,
            )
            for i in range(3)
        ]

        bt = RecordBatchTables(min_records_batch)
        for table in grouped_tables:
            bt.append(table)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 115)
        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_batched_tables_factory_from_input_tables(self):
        min_records_batch = 5
        test_table_num_records = 39
        grouped_tables = [
            pa.Table.from_arrays(
                [
                    pa.array(
                        [
                            i
                            for i in range(
                                i * test_table_num_records,
                                (i + 1) * test_table_num_records,
                            )
                        ]
                    ),
                    pa.array(["foo"] * test_table_num_records),
                ],
                names=self.column_names,
            )
            for i in range(3)
        ]
        bt = RecordBatchTables.from_tables(grouped_tables, min_records_batch)
        self.assertTrue(type(bt), RecordBatchTables)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 115)
        self.assertTrue(bt.batched_record_count % min_records_batch == 0)
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertLess(bt.remaining_record_count, min_records_batch)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

    def test_clear(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 8)

        bt.clear_batches()
        self.assertFalse(bt.has_batches())
        self.assertEqual(bt.batched_record_count, 0)

    def test_append_after_clear(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertEqual(bt.batched_record_count, 8)
        prev_remainder_records = bt.remaining_record_count
        self.assertEqual(bt.remaining_record_count, 2)

        bt.clear_batches()
        self.assertFalse(bt.has_batches())
        self.assertEqual(bt.batched_record_count, 0)

        col1 = pa.array([i for i in range(10, 20)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)

        self.assertEqual(bt.batched_record_count, 8)
        self.assertEqual(bt.remaining_record_count, 4)
        self.assertNotEquals(prev_remainder_records, bt.remaining_record_count)
        self.assertTrue(_is_sorted(bt, self.column_names[0]))

        bt.clear_remaining()
        self.assertFalse(bt.has_remaining())
        self.assertTrue(bt.remaining_record_count == 0)

    def test_evict(self):
        min_records_batch = 8
        bt = RecordBatchTables(min_records_batch)
        col1 = pa.array([i for i in range(10)])
        col2 = pa.array(["test"] * 10)
        test_table = pa.Table.from_arrays([col1, col2], names=self.column_names)
        bt.append(test_table)
        self.assertTrue(bt.has_batches())
        self.assertTrue(_is_gte_batch_size_and_divisible(bt, min_records_batch))
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 8)
        self.assertEqual(bt.remaining_record_count, 2)
        prev_batched_records = bt.batched_record_count

        evicted_tables = bt.evict()
        self.assertFalse(bt.has_batches())
        self.assertTrue(bt.has_remaining())
        self.assertEqual(bt.batched_record_count, 0)
        self.assertEqual(bt.remaining_record_count, 2)
        self.assertEqual(sum([len(t) for t in evicted_tables]), prev_batched_records)


def _is_sorted(batched_tables: RecordBatchTables, sort_key: str):
    merged_table = pa.concat_tables(
        [*batched_tables.batched, *batched_tables.remaining]
    )
    explicitly_sorted_merged_table = merged_table.sort_by([(sort_key, "ascending")])
    return explicitly_sorted_merged_table == merged_table


def _is_gte_batch_size_and_divisible(
    batched_tables: RecordBatchTables, min_records_batch: int
):
    return all(
        [
            len(table) // min_records_batch > 0 and len(table) % min_records_batch == 0
            for table in batched_tables.batched
        ]
    )


if __name__ == "__main__":
    unittest.main()
```
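The new module doubles as usage documentation for the RecordBatchTables helper added to deltacat.utils.pyarrow: append() splits sorted input into batched tables (record counts in multiples of the configured minimum) plus a remainder, and evict() drains the batched side. A condensed sketch of that lifecycle, mirroring the assertions above:

```python
import pyarrow as pa

from deltacat.utils.pyarrow import RecordBatchTables

bt = RecordBatchTables(8)  # batch records in multiples of 8
table = pa.Table.from_arrays(
    [pa.array(list(range(10))), pa.array(["test"] * 10)],
    names=["pk", "sk"],
)

bt.append(table)
assert bt.batched_record_count == 8  # one full batch of 8 records
assert bt.remaining_record_count == 2  # 2 records held for the next append

evicted = bt.evict()  # drains the batched tables; the remainder stays put
assert sum(len(t) for t in evicted) == 8
assert bt.remaining_record_count == 2
```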
deltacat/types/media.py
CHANGED

```diff
@@ -1,6 +1,5 @@
 from enum import Enum
-
-from typing import Set, Dict
+from typing import Dict, Set
 
 
 class ContentType(str, Enum):
@@ -57,7 +56,7 @@ DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
     ContentType.CSV.value,
-    ContentType.PSV.value
+    ContentType.PSV.value,
 }
 
 TABULAR_CONTENT_TYPES: Set[str] = {
@@ -75,7 +74,7 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.TSV.value,
     ContentType.CSV.value,
     ContentType.PSV.value,
-    ContentType.JSON.value
+    ContentType.JSON.value,
 }
 
 CONTENT_TYPE_TO_USER_KWARGS_KEY: Dict[str, str] = {
```
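Only trailing commas and import order change here, so set membership is untouched. Two properties these sets rely on, shown as a small sketch (the assertions mirror what other modules in this diff assume):

```python
from deltacat.types.media import ContentType, DELIMITED_TEXT_CONTENT_TYPES

# ContentType is a str-valued enum, so members round-trip through their
# values; table_version.py above depends on this via ContentType(_).
assert ContentType(ContentType.CSV.value) is ContentType.CSV

# Membership checks use the .value strings collected in these sets.
assert ContentType.PSV.value in DELIMITED_TEXT_CONTENT_TYPES
```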
deltacat/types/tables.py
CHANGED

```diff
@@ -1,28 +1,34 @@
 from enum import Enum
-from typing import
+from typing import Callable, Dict, Type, Union
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import deltacat.storage as dcs
-
 from ray.data.dataset import Dataset
-from ray.data.read_api import \
-    from_arrow, from_arrow_refs, from_numpy, from_pandas, from_pandas_refs
+from ray.data.read_api import (
+    from_arrow,
+    from_arrow_refs,
+    from_numpy,
+    from_pandas,
+    from_pandas_refs,
+)
 
+import deltacat.storage as dcs
 from deltacat.types.media import TableType
-from deltacat.utils import \
-    numpy as np_utils, pandas as pd_utils, pyarrow as pa_utils
+from deltacat.utils import numpy as np_utils
+from deltacat.utils import pandas as pd_utils
+from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
-    TableType.NUMPY.value: np_utils.s3_file_to_ndarray
+    TableType.NUMPY.value: np_utils.s3_file_to_ndarray,
 }
 
 TABLE_CLASS_TO_WRITER_FUNC: Dict[
-        Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
@@ -30,7 +36,8 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 }
 
 TABLE_CLASS_TO_SLICER_FUNC: Dict[
-        Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
     np.ndarray: np_utils.slice_ndarray,
@@ -38,7 +45,8 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 }
 
 TABLE_CLASS_TO_SIZE_FUNC: Dict[
-        Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable] = {
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
     pa.Table: pa_utils.table_size,
     pd.DataFrame: pd_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
@@ -77,6 +85,7 @@ class TableWriteMode(str, Enum):
     Updates or inserts records based on the table's primary and sort keys by
     default.
     """
+
     AUTO = "auto"
     CREATE = "create"
     APPEND = "append"
@@ -84,26 +93,27 @@ class TableWriteMode(str, Enum):
     MERGE = "merge"
 
 
-def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> int:
+def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
     return len(table) if not isinstance(table, Dataset) else table.count()
 
 
-def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> Callable:
+def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
     table_writer_func = TABLE_CLASS_TO_WRITER_FUNC.get(type(table))
     if table_writer_func is None:
-        msg = f"No writer found for table type: {type(table)}.\n" \
-              f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
+        msg = (
+            f"No writer found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_WRITER_FUNC.keys}"
+        )
         raise ValueError(msg)
     return table_writer_func
 
 
-def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) \
-        -> Callable:
+def get_table_slicer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
     table_slicer_func = TABLE_CLASS_TO_SLICER_FUNC.get(type(table))
     if table_slicer_func is None:
-        msg = f"No slicer found for table type: {type(table)}.\n" \
-              f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
+        msg = (
+            f"No slicer found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SLICER_FUNC.keys}"
+        )
         raise ValueError(msg)
     return table_slicer_func
```
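The lookup tables above drive simple type-based dispatch: get_table_length(), get_table_writer(), and get_table_slicer() resolve a handler from the concrete table class and raise ValueError for unknown types. A minimal sketch of that dispatch (local pa.Table only; the resolved writer and slicer functions take more arguments than shown here):

```python
import pyarrow as pa

from deltacat.types.tables import get_table_length, get_table_slicer

table = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["col"])

# len() for local tables; ray Datasets go through Dataset.count() instead.
assert get_table_length(table) == 3

# Resolves pa.Table to deltacat.utils.pyarrow.slice_table via the mapping.
slicer = get_table_slicer(table)

# Anything outside the mapping raises the ValueError built above.
try:
    get_table_slicer({"not": "a table"})
except ValueError:
    pass
```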
deltacat/utils/common.py
CHANGED

```diff
@@ -1,10 +1,9 @@
 import hashlib
-
-import time
 import os
-
+import time
 from typing import Any, Dict
 
+
 def env_bool(key: str, default: bool) -> int:
     if key in os.environ:
         return bool(os.environ[key])
@@ -44,16 +43,11 @@ class ContentTypeKwargsProvider:
     as input, and returns finalized keyword args as output. Useful for merging
     content-type-specific keyword arguments into an existing fixed dictionary
     of keyword arguments."""
-
-    def _get_kwargs(self,
-                    content_type: str,
-                    kwargs: Dict[str, Any]) -> Dict[str, Any]:
+
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         raise NotImplementedError
 
-    def __call__(
-            self,
-            content_type: str,
-            kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def __call__(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         return self._get_kwargs(content_type, kwargs)
 
 
```
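ContentTypeKwargsProvider is a small callable-object hook: subclasses implement _get_kwargs(), and instances are invoked like functions through __call__(). A minimal sketch of a custom provider, assuming only the interface shown in this hunk (the delimiter kwarg is illustrative):

```python
from typing import Any, Dict

from deltacat.types.media import ContentType
from deltacat.utils.common import ContentTypeKwargsProvider


class PipeDelimiterProvider(ContentTypeKwargsProvider):
    """Adds a delimiter kwarg for PSV content; passes everything else through."""

    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        if content_type == ContentType.PSV.value:
            return {**kwargs, "delimiter": "|"}
        return kwargs


# Instances are called like functions; __call__ forwards to _get_kwargs().
provider = PipeDelimiterProvider()
assert provider(ContentType.PSV.value, {}) == {"delimiter": "|"}
```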