deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,32 @@
1
- import pyarrow as pa
2
- from deltacat import SortKey, TableWriteMode, ContentType, ListResult, \
3
- Namespace, LifecycleState, SchemaConsistencyType, LocalTable, \
4
- LocalDataset, DistributedDataset, TableDefinition
5
1
  from typing import Any, Dict, List, Optional, Set, Union
6
2
 
3
+ import pyarrow as pa
4
+
5
+ from deltacat.catalog.model.table_definition import TableDefinition
6
+ from deltacat.compute.compactor.model.sort_key import SortKey
7
+ from deltacat.storage.model.list_result import ListResult
8
+ from deltacat.storage.model.namespace import Namespace
9
+ from deltacat.storage.model.types import (
10
+ DistributedDataset,
11
+ LifecycleState,
12
+ LocalDataset,
13
+ LocalTable,
14
+ SchemaConsistencyType,
15
+ )
16
+ from deltacat.types.media import ContentType
17
+ from deltacat.types.tables import TableWriteMode
18
+
7
19
 
8
20
  # table functions
9
21
  def write_to_table(
10
- data: Union[LocalTable, LocalDataset, DistributedDataset],
11
- table: str,
12
- namespace: Optional[str] = None,
13
- mode: TableWriteMode = TableWriteMode.AUTO,
14
- content_type: ContentType = ContentType.PARQUET,
15
- *args,
16
- **kwargs) -> None:
22
+ data: Union[LocalTable, LocalDataset, DistributedDataset],
23
+ table: str,
24
+ namespace: Optional[str] = None,
25
+ mode: TableWriteMode = TableWriteMode.AUTO,
26
+ content_type: ContentType = ContentType.PARQUET,
27
+ *args,
28
+ **kwargs
29
+ ) -> None:
17
30
  """Write local or distributed data to a table. Raises an error if the
18
31
  table does not exist and the table write mode is not CREATE or AUTO.
19
32
 
@@ -25,168 +38,136 @@ def write_to_table(
25
38
 
26
39
 
27
40
  def read_table(
28
- table: str,
29
- namespace: Optional[str] = None,
30
- *args,
31
- **kwargs) -> DistributedDataset:
41
+ table: str, namespace: Optional[str] = None, *args, **kwargs
42
+ ) -> DistributedDataset:
32
43
  """Read a table into a distributed dataset."""
33
44
  raise NotImplementedError("read_table not implemented")
34
45
 
35
46
 
36
47
  def alter_table(
37
- table: str,
38
- namespace: Optional[str] = None,
39
- lifecycle_state: Optional[LifecycleState] = None,
40
- schema_updates: Optional[Dict[str, Any]] = None,
41
- partition_updates: Optional[Dict[str, Any]] = None,
42
- primary_keys: Optional[Set[str]] = None,
43
- sort_keys: Optional[List[SortKey]] = None,
44
- description: Optional[str] = None,
45
- properties: Optional[Dict[str, str]] = None,
46
- *args,
47
- **kwargs) -> None:
48
+ table: str,
49
+ namespace: Optional[str] = None,
50
+ lifecycle_state: Optional[LifecycleState] = None,
51
+ schema_updates: Optional[Dict[str, Any]] = None,
52
+ partition_updates: Optional[Dict[str, Any]] = None,
53
+ primary_keys: Optional[Set[str]] = None,
54
+ sort_keys: Optional[List[SortKey]] = None,
55
+ description: Optional[str] = None,
56
+ properties: Optional[Dict[str, str]] = None,
57
+ *args,
58
+ **kwargs
59
+ ) -> None:
48
60
  """Alter table definition."""
49
61
  raise NotImplementedError("alter_table not implemented")
50
62
 
51
63
 
52
64
  def create_table(
53
- table: str,
54
- namespace: Optional[str] = None,
55
- lifecycle_state: Optional[LifecycleState] = None,
56
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
57
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
58
- partition_keys: Optional[List[Dict[str, Any]]] = None,
59
- primary_keys: Optional[Set[str]] = None,
60
- sort_keys: Optional[List[SortKey]] = None,
61
- description: Optional[str] = None,
62
- properties: Optional[Dict[str, str]] = None,
63
- permissions: Optional[Dict[str, Any]] = None,
64
- content_types: Optional[List[ContentType]] = None,
65
- replace_existing_table: bool = False,
66
- *args,
67
- **kwargs) -> TableDefinition:
65
+ table: str,
66
+ namespace: Optional[str] = None,
67
+ lifecycle_state: Optional[LifecycleState] = None,
68
+ schema: Optional[Union[pa.Schema, str, bytes]] = None,
69
+ schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
70
+ partition_keys: Optional[List[Dict[str, Any]]] = None,
71
+ primary_keys: Optional[Set[str]] = None,
72
+ sort_keys: Optional[List[SortKey]] = None,
73
+ description: Optional[str] = None,
74
+ properties: Optional[Dict[str, str]] = None,
75
+ permissions: Optional[Dict[str, Any]] = None,
76
+ content_types: Optional[List[ContentType]] = None,
77
+ replace_existing_table: bool = False,
78
+ *args,
79
+ **kwargs
80
+ ) -> TableDefinition:
68
81
  """Create an empty table. Raises an error if the table already exists and
69
82
  `replace_existing_table` is False."""
70
83
  raise NotImplementedError("create_table not implemented")
71
84
 
72
85
 
73
86
  def drop_table(
74
- table: str,
75
- namespace: Optional[str] = None,
76
- purge: bool = False,
77
- *args,
78
- **kwargs) -> None:
87
+ table: str, namespace: Optional[str] = None, purge: bool = False, *args, **kwargs
88
+ ) -> None:
79
89
  """Drop a table from the catalog and optionally purge it. Raises an error
80
90
  if the table does not exist."""
81
91
  raise NotImplementedError("drop_table not implemented")
82
92
 
83
93
 
84
- def refresh_table(
85
- table: str,
86
- namespace: Optional[str] = None,
87
- *args,
88
- **kwargs) -> None:
94
+ def refresh_table(table: str, namespace: Optional[str] = None, *args, **kwargs) -> None:
89
95
  """Refresh metadata cached on the Ray cluster for the given table."""
90
96
  raise NotImplementedError("refresh_table not implemented")
91
97
 
92
98
 
93
99
  def list_tables(
94
- namespace: Optional[str] = None,
95
- *args,
96
- **kwargs) -> ListResult[TableDefinition]:
100
+ namespace: Optional[str] = None, *args, **kwargs
101
+ ) -> ListResult[TableDefinition]:
97
102
  """List a page of table definitions. Raises an error if the given namespace
98
103
  does not exist."""
99
104
  raise NotImplementedError("list_tables not implemented")
100
105
 
101
106
 
102
107
  def get_table(
103
- table: str,
104
- namespace: Optional[str] = None,
105
- *args,
106
- **kwargs) -> Optional[TableDefinition]:
108
+ table: str, namespace: Optional[str] = None, *args, **kwargs
109
+ ) -> Optional[TableDefinition]:
107
110
  """Get table definition metadata. Returns None if the given table does not
108
111
  exist."""
109
112
  raise NotImplementedError("get_table not implemented")
110
113
 
111
114
 
112
115
  def truncate_table(
113
- table: str,
114
- namespace: Optional[str] = None,
115
- *args,
116
- **kwargs) -> None:
116
+ table: str, namespace: Optional[str] = None, *args, **kwargs
117
+ ) -> None:
117
118
  """Truncate table data. Raises an error if the table does not exist."""
118
119
  raise NotImplementedError("truncate_table not implemented")
119
120
 
120
121
 
121
122
  def rename_table(
122
- table: str,
123
- new_name: str,
124
- namespace: Optional[str] = None,
125
- *args,
126
- **kwargs) -> None:
123
+ table: str, new_name: str, namespace: Optional[str] = None, *args, **kwargs
124
+ ) -> None:
127
125
  """Rename a table."""
128
126
  raise NotImplementedError("rename_table not implemented")
129
127
 
130
128
 
131
- def table_exists(
132
- table: str,
133
- namespace: Optional[str] = None,
134
- *args,
135
- **kwargs) -> bool:
129
+ def table_exists(table: str, namespace: Optional[str] = None, *args, **kwargs) -> bool:
136
130
  """Returns True if the given table exists, False if not."""
137
131
  raise NotImplementedError("table_exists not implemented")
138
132
 
139
133
 
140
134
  # namespace functions
141
- def list_namespaces(
142
- *args,
143
- **kwargs) -> ListResult[Namespace]:
135
+ def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
144
136
  """List a page of table namespaces."""
145
137
  raise NotImplementedError("list_namespaces not implemented")
146
138
 
147
139
 
148
- def get_namespace(
149
- namespace: str,
150
- *args,
151
- **kwargs) -> Optional[Namespace]:
140
+ def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
152
141
  """Gets table namespace metadata for the specified table namespace. Returns
153
142
  None if the given namespace does not exist."""
154
143
  raise NotImplementedError("get_namespace not implemented")
155
144
 
156
145
 
157
- def namespace_exists(
158
- namespace: str,
159
- *args,
160
- **kwargs) -> bool:
146
+ def namespace_exists(namespace: str, *args, **kwargs) -> bool:
161
147
  """Returns True if the given table namespace exists, False if not."""
162
148
  raise NotImplementedError("namespace_exists not implemented")
163
149
 
164
150
 
165
151
  def create_namespace(
166
- namespace: str,
167
- permissions: Dict[str, Any],
168
- *args,
169
- **kwargs) -> Namespace:
152
+ namespace: str, permissions: Dict[str, Any], *args, **kwargs
153
+ ) -> Namespace:
170
154
  """Creates a table namespace with the given name and permissions. Returns
171
155
  the created namespace. Raises an error if the namespace already exists."""
172
156
  raise NotImplementedError("create_namespace not implemented")
173
157
 
174
158
 
175
159
  def alter_namespace(
176
- namespace: str,
177
- permissions: Optional[Dict[str, Any]] = None,
178
- new_namespace: Optional[str] = None,
179
- *args,
180
- **kwargs) -> None:
160
+ namespace: str,
161
+ permissions: Optional[Dict[str, Any]] = None,
162
+ new_namespace: Optional[str] = None,
163
+ *args,
164
+ **kwargs
165
+ ) -> None:
181
166
  """Alter table namespace definition."""
182
167
  raise NotImplementedError("alter_namespace not implemented")
183
168
 
184
169
 
185
- def drop_namespace(
186
- namespace: str,
187
- purge: bool = False,
188
- *args,
189
- **kwargs) -> None:
170
+ def drop_namespace(namespace: str, purge: bool = False, *args, **kwargs) -> None:
190
171
  """Drop the given namespace and all of its tables from the catalog,
191
172
  optionally purging them."""
192
173
  raise NotImplementedError("drop_namespace not implemented")
@@ -1,21 +1,17 @@
1
1
  # Allow self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
+ from typing import Any, Dict, List, Optional
5
+
4
6
  import ray
5
7
 
6
8
  from deltacat.catalog import interface as catalog_interface
7
- from typing import Any, Dict, List, Optional
8
-
9
9
 
10
10
  all_catalogs: Optional[Catalogs] = None
11
11
 
12
12
 
13
13
  class Catalog:
14
- def __init__(
15
- self,
16
- impl=catalog_interface,
17
- *args,
18
- **kwargs):
14
+ def __init__(self, impl=catalog_interface, *args, **kwargs):
19
15
  self._impl = impl
20
16
  self._impl.initialize(*args, **kwargs)
21
17
 
@@ -27,19 +23,22 @@ class Catalog:
27
23
  @ray.remote
28
24
  class Catalogs:
29
25
  def __init__(
30
- self,
31
- catalogs: Dict[str, Catalog],
32
- default_catalog_name: str = None,
33
- *args,
34
- **kwargs):
26
+ self,
27
+ catalogs: Dict[str, Catalog],
28
+ default_catalog_name: str = None,
29
+ *args,
30
+ **kwargs,
31
+ ):
35
32
  if default_catalog_name and default_catalog_name not in catalogs:
36
33
  raise ValueError(
37
34
  f"Catalog {default_catalog_name} not found "
38
- f"in catalogs to register: {catalogs}")
35
+ f"in catalogs to register: {catalogs}"
36
+ )
39
37
  if not catalogs:
40
38
  raise ValueError(
41
39
  f"No catalogs given to register. "
42
- f"Please specify one or more catalogs.")
40
+ f"Please specify one or more catalogs."
41
+ )
43
42
  self.catalogs: Dict[str, Catalog] = catalogs
44
43
  if default_catalog_name:
45
44
  self.default_catalog = self.catalogs[default_catalog_name]
@@ -65,11 +64,12 @@ class Catalogs:
65
64
 
66
65
 
67
66
  def init(
68
- catalogs: Dict[str, Catalog],
69
- default_catalog_name: str = None,
70
- ray_init_args: Dict[str, Any] = None,
71
- *args,
72
- **kwargs) -> None:
67
+ catalogs: Dict[str, Catalog],
68
+ default_catalog_name: str = None,
69
+ ray_init_args: Dict[str, Any] = None,
70
+ *args,
71
+ **kwargs,
72
+ ) -> None:
73
73
 
74
74
  if not ray.is_initialized():
75
75
  if ray_init_args:
@@ -79,5 +79,5 @@ def init(
79
79
 
80
80
  global all_catalogs
81
81
  all_catalogs = Catalogs.remote(
82
- catalogs=catalogs,
83
- default_catalog_name=default_catalog_name)
82
+ catalogs=catalogs, default_catalog_name=default_catalog_name
83
+ )
@@ -1,19 +1,21 @@
1
1
  # Allow self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from deltacat.storage import Table, TableVersion, Stream
4
+ from deltacat.storage import Stream, Table, TableVersion
5
5
 
6
6
 
7
7
  class TableDefinition(dict):
8
8
  @staticmethod
9
- def of(table: Table,
10
- table_version: TableVersion,
11
- stream: Stream) -> TableDefinition:
12
- return TableDefinition({
13
- "table": table,
14
- "tableVersion": table_version,
15
- "stream": stream,
16
- })
9
+ def of(
10
+ table: Table, table_version: TableVersion, stream: Stream
11
+ ) -> TableDefinition:
12
+ return TableDefinition(
13
+ {
14
+ "table": table,
15
+ "tableVersion": table_version,
16
+ "stream": stream,
17
+ }
18
+ )
17
19
 
18
20
  @property
19
21
  def table(self) -> Table:
@@ -1,20 +1,16 @@
1
1
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
2
- from deltacat.compute.compactor.model.delta_file_envelope import \
3
- DeltaFileEnvelope
4
- from deltacat.compute.compactor.model.delta_file_locator import \
5
- DeltaFileLocator
6
- from deltacat.compute.compactor.model.materialize_result import \
7
- MaterializeResult
8
- from deltacat.compute.compactor.model.primary_key_index import \
9
- PrimaryKeyIndexLocator, PrimaryKeyIndexMeta, \
10
- PrimaryKeyIndexVersionLocator, PrimaryKeyIndexVersionMeta
11
- from deltacat.compute.compactor.model.pyarrow_write_result import \
12
- PyArrowWriteResult
13
- from deltacat.compute.compactor.model.round_completion_info import \
14
- RoundCompletionInfo
15
- from deltacat.compute.compactor.model.sort_key import \
16
- SortKey, SortOrder
17
-
2
+ from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
3
+ from deltacat.compute.compactor.model.delta_file_locator import DeltaFileLocator
4
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
5
+ from deltacat.compute.compactor.model.primary_key_index import (
6
+ PrimaryKeyIndexLocator,
7
+ PrimaryKeyIndexMeta,
8
+ PrimaryKeyIndexVersionLocator,
9
+ PrimaryKeyIndexVersionMeta,
10
+ )
11
+ from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
12
+ from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
13
+ from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder
18
14
 
19
15
  __all__ = [
20
16
  "DeltaAnnotated",