deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff compares the contents of two publicly available package versions, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,22 @@
1
- import pyarrow as pa
1
+ from typing import Any, Dict, List, Optional, Set, Union
2
2
 
3
+ import pyarrow as pa
3
4
  import ray
4
5
 
5
- from deltacat import SortKey, TableWriteMode, ContentType, all_catalogs, \
6
- ListResult, Namespace, LifecycleState, SchemaConsistencyType, LocalTable, \
7
- LocalDataset, DistributedDataset, Catalog, TableDefinition
8
- from typing import Any, Dict, List, Optional, Set, Union
6
+ from deltacat.catalog.model.catalog import Catalog, all_catalogs
7
+ from deltacat.catalog.model.table_definition import TableDefinition
8
+ from deltacat.compute.compactor.model.sort_key import SortKey
9
+ from deltacat.storage.model.list_result import ListResult
10
+ from deltacat.storage.model.namespace import Namespace
11
+ from deltacat.storage.model.types import (
12
+ DistributedDataset,
13
+ LifecycleState,
14
+ LocalDataset,
15
+ LocalTable,
16
+ SchemaConsistencyType,
17
+ )
18
+ from deltacat.types.media import ContentType
19
+ from deltacat.types.tables import TableWriteMode
9
20
 
10
21
 
11
22
  def _get_catalog(name: Optional[str] = None) -> Catalog:
@@ -13,27 +24,32 @@ def _get_catalog(name: Optional[str] = None) -> Catalog:
13
24
  raise ValueError(
14
25
  "No catalogs available! Call "
15
26
  "`deltacat.init(catalogs={...})` to register one or more "
16
- "catalogs then retry.")
17
- catalog = ray.get(all_catalogs.get.remote(name)) if name \
27
+ "catalogs then retry."
28
+ )
29
+ catalog = (
30
+ ray.get(all_catalogs.get.remote(name))
31
+ if name
18
32
  else ray.get(all_catalogs.default.remote())
33
+ )
19
34
  if not catalog:
20
35
  available_catalogs = ray.get(all_catalogs.all.remote()).values()
21
36
  raise ValueError(
22
- f"Catalog '{name}' not found. Available catalogs: "
23
- f"{available_catalogs}.")
37
+ f"Catalog '{name}' not found. Available catalogs: " f"{available_catalogs}."
38
+ )
24
39
  return catalog
25
40
 
26
41
 
27
42
  # table functions
28
43
  def write_to_table(
29
- data: Union[LocalTable, LocalDataset, DistributedDataset],
30
- table: str,
31
- namespace: Optional[str] = None,
32
- catalog: Optional[str] = None,
33
- mode: TableWriteMode = TableWriteMode.AUTO,
34
- content_type: ContentType = ContentType.PARQUET,
35
- *args,
36
- **kwargs) -> None:
44
+ data: Union[LocalTable, LocalDataset, DistributedDataset],
45
+ table: str,
46
+ namespace: Optional[str] = None,
47
+ catalog: Optional[str] = None,
48
+ mode: TableWriteMode = TableWriteMode.AUTO,
49
+ content_type: ContentType = ContentType.PARQUET,
50
+ *args,
51
+ **kwargs,
52
+ ) -> None:
37
53
  """Write local or distributed data to a table. Raises an error if the
38
54
  table does not exist and the table write mode is not CREATE or AUTO.
39
55
 
@@ -42,42 +58,35 @@ def write_to_table(
42
58
  an existing table, all `alter_table` parameters may be optionally specified
43
59
  as additional keyword arguments."""
44
60
  _get_catalog(catalog).impl.write_to_table(
45
- data,
46
- table,
47
- namespace,
48
- mode,
49
- content_type,
50
- *args,
51
- **kwargs)
61
+ data, table, namespace, mode, content_type, *args, **kwargs
62
+ )
52
63
 
53
64
 
54
65
  def read_table(
55
- table: str,
56
- namespace: Optional[str] = None,
57
- catalog: Optional[str] = None,
58
- *args,
59
- **kwargs) -> DistributedDataset:
66
+ table: str,
67
+ namespace: Optional[str] = None,
68
+ catalog: Optional[str] = None,
69
+ *args,
70
+ **kwargs,
71
+ ) -> DistributedDataset:
60
72
  """Read a table into a distributed dataset."""
61
- return _get_catalog(catalog).impl.read_table(
62
- table,
63
- namespace,
64
- *args,
65
- **kwargs)
73
+ return _get_catalog(catalog).impl.read_table(table, namespace, *args, **kwargs)
66
74
 
67
75
 
68
76
  def alter_table(
69
- table: str,
70
- namespace: Optional[str] = None,
71
- catalog: Optional[str] = None,
72
- lifecycle_state: Optional[LifecycleState] = None,
73
- schema_updates: Optional[Dict[str, Any]] = None,
74
- partition_updates: Optional[Dict[str, Any]] = None,
75
- primary_keys: Optional[Set[str]] = None,
76
- sort_keys: Optional[List[SortKey]] = None,
77
- description: Optional[str] = None,
78
- properties: Optional[Dict[str, str]] = None,
79
- *args,
80
- **kwargs) -> None:
77
+ table: str,
78
+ namespace: Optional[str] = None,
79
+ catalog: Optional[str] = None,
80
+ lifecycle_state: Optional[LifecycleState] = None,
81
+ schema_updates: Optional[Dict[str, Any]] = None,
82
+ partition_updates: Optional[Dict[str, Any]] = None,
83
+ primary_keys: Optional[Set[str]] = None,
84
+ sort_keys: Optional[List[SortKey]] = None,
85
+ description: Optional[str] = None,
86
+ properties: Optional[Dict[str, str]] = None,
87
+ *args,
88
+ **kwargs,
89
+ ) -> None:
81
90
  """Alter table definition."""
82
91
  _get_catalog(catalog).impl.alter_table(
83
92
  table,
@@ -90,26 +99,28 @@ def alter_table(
90
99
  description,
91
100
  properties,
92
101
  *args,
93
- **kwargs)
102
+ **kwargs,
103
+ )
94
104
 
95
105
 
96
106
  def create_table(
97
- table: str,
98
- namespace: Optional[str] = None,
99
- catalog: Optional[str] = None,
100
- lifecycle_state: Optional[LifecycleState] = None,
101
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
102
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
103
- partition_keys: Optional[List[Dict[str, Any]]] = None,
104
- primary_keys: Optional[Set[str]] = None,
105
- sort_keys: Optional[List[SortKey]] = None,
106
- description: Optional[str] = None,
107
- properties: Optional[Dict[str, str]] = None,
108
- permissions: Optional[Dict[str, Any]] = None,
109
- content_types: Optional[List[ContentType]] = None,
110
- replace_existing_table: bool = False,
111
- *args,
112
- **kwargs) -> TableDefinition:
107
+ table: str,
108
+ namespace: Optional[str] = None,
109
+ catalog: Optional[str] = None,
110
+ lifecycle_state: Optional[LifecycleState] = None,
111
+ schema: Optional[Union[pa.Schema, str, bytes]] = None,
112
+ schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
113
+ partition_keys: Optional[List[Dict[str, Any]]] = None,
114
+ primary_keys: Optional[Set[str]] = None,
115
+ sort_keys: Optional[List[SortKey]] = None,
116
+ description: Optional[str] = None,
117
+ properties: Optional[Dict[str, str]] = None,
118
+ permissions: Optional[Dict[str, Any]] = None,
119
+ content_types: Optional[List[ContentType]] = None,
120
+ replace_existing_table: bool = False,
121
+ *args,
122
+ **kwargs,
123
+ ) -> TableDefinition:
113
124
  """Create an empty table. Raises an error if the table already exists and
114
125
  `replace_existing_table` is False."""
115
126
  return _get_catalog(catalog).impl.create_table(
@@ -127,190 +138,145 @@ def create_table(
127
138
  content_types,
128
139
  replace_existing_table,
129
140
  *args,
130
- **kwargs)
141
+ **kwargs,
142
+ )
131
143
 
132
144
 
133
145
  def drop_table(
134
- table: str,
135
- namespace: Optional[str] = None,
136
- catalog: Optional[str] = None,
137
- purge: bool = False,
138
- *args,
139
- **kwargs) -> None:
146
+ table: str,
147
+ namespace: Optional[str] = None,
148
+ catalog: Optional[str] = None,
149
+ purge: bool = False,
150
+ *args,
151
+ **kwargs,
152
+ ) -> None:
140
153
  """Drop a table from the catalog and optionally purge it. Raises an error
141
154
  if the table does not exist."""
142
- _get_catalog(catalog).impl.drop_table(
143
- table,
144
- namespace,
145
- purge,
146
- *args,
147
- **kwargs)
155
+ _get_catalog(catalog).impl.drop_table(table, namespace, purge, *args, **kwargs)
148
156
 
149
157
 
150
158
  def refresh_table(
151
- table: str,
152
- namespace: Optional[str] = None,
153
- catalog: Optional[str] = None,
154
- *args,
155
- **kwargs) -> None:
159
+ table: str,
160
+ namespace: Optional[str] = None,
161
+ catalog: Optional[str] = None,
162
+ *args,
163
+ **kwargs,
164
+ ) -> None:
156
165
  """Refresh metadata cached on the Ray cluster for the given table."""
157
- _get_catalog(catalog).impl.refresh_table(
158
- table,
159
- namespace,
160
- *args,
161
- **kwargs)
166
+ _get_catalog(catalog).impl.refresh_table(table, namespace, *args, **kwargs)
162
167
 
163
168
 
164
169
  def list_tables(
165
- namespace: Optional[str] = None,
166
- catalog: Optional[str] = None,
167
- *args,
168
- **kwargs) -> ListResult[TableDefinition]:
170
+ namespace: Optional[str] = None, catalog: Optional[str] = None, *args, **kwargs
171
+ ) -> ListResult[TableDefinition]:
169
172
  """List a page of table definitions. Raises an error if the given namespace
170
173
  does not exist."""
171
- return _get_catalog(catalog).impl.list_tables(
172
- namespace,
173
- *args,
174
- **kwargs)
174
+ return _get_catalog(catalog).impl.list_tables(namespace, *args, **kwargs)
175
175
 
176
176
 
177
177
  def get_table(
178
- table: str,
179
- namespace: Optional[str] = None,
180
- catalog: Optional[str] = None,
181
- *args,
182
- **kwargs) -> Optional[TableDefinition]:
178
+ table: str,
179
+ namespace: Optional[str] = None,
180
+ catalog: Optional[str] = None,
181
+ *args,
182
+ **kwargs,
183
+ ) -> Optional[TableDefinition]:
183
184
  """Get table definition metadata. Returns None if the given table does not
184
185
  exist."""
185
- return _get_catalog(catalog).impl.get_table(
186
- table,
187
- namespace,
188
- *args,
189
- **kwargs)
186
+ return _get_catalog(catalog).impl.get_table(table, namespace, *args, **kwargs)
190
187
 
191
188
 
192
189
  def truncate_table(
193
- table: str,
194
- namespace: Optional[str] = None,
195
- catalog: Optional[str] = None,
196
- *args,
197
- **kwargs) -> None:
190
+ table: str,
191
+ namespace: Optional[str] = None,
192
+ catalog: Optional[str] = None,
193
+ *args,
194
+ **kwargs,
195
+ ) -> None:
198
196
  """Truncate table data. Raises an error if the table does not exist."""
199
- _get_catalog(catalog).impl.truncate_table(
200
- table,
201
- namespace,
202
- *args,
203
- **kwargs)
197
+ _get_catalog(catalog).impl.truncate_table(table, namespace, *args, **kwargs)
204
198
 
205
199
 
206
200
  def rename_table(
207
- table: str,
208
- new_name: str,
209
- namespace: Optional[str] = None,
210
- catalog: Optional[str] = None,
211
- *args,
212
- **kwargs) -> None:
201
+ table: str,
202
+ new_name: str,
203
+ namespace: Optional[str] = None,
204
+ catalog: Optional[str] = None,
205
+ *args,
206
+ **kwargs,
207
+ ) -> None:
213
208
  """Rename a table."""
214
- _get_catalog(catalog).impl.rename_table(
215
- table,
216
- new_name,
217
- namespace,
218
- *args,
219
- **kwargs)
209
+ _get_catalog(catalog).impl.rename_table(table, new_name, namespace, *args, **kwargs)
220
210
 
221
211
 
222
212
  def table_exists(
223
- table: str,
224
- namespace: Optional[str] = None,
225
- catalog: Optional[str] = None,
226
- *args,
227
- **kwargs) -> bool:
213
+ table: str,
214
+ namespace: Optional[str] = None,
215
+ catalog: Optional[str] = None,
216
+ *args,
217
+ **kwargs,
218
+ ) -> bool:
228
219
  """Returns True if the given table exists, False if not."""
229
- return _get_catalog(catalog).impl.table_exists(
230
- table,
231
- namespace,
232
- *args,
233
- **kwargs)
220
+ return _get_catalog(catalog).impl.table_exists(table, namespace, *args, **kwargs)
234
221
 
235
222
 
236
223
  # namespace functions
237
224
  def list_namespaces(
238
- catalog: Optional[str] = None,
239
- *args,
240
- **kwargs) -> ListResult[Namespace]:
225
+ catalog: Optional[str] = None, *args, **kwargs
226
+ ) -> ListResult[Namespace]:
241
227
  """List a page of table namespaces."""
242
228
  return _get_catalog(catalog).impl.list_namespaces(*args, **kwargs)
243
229
 
244
230
 
245
231
  def get_namespace(
246
- namespace: str,
247
- catalog: Optional[str] = None,
248
- *args,
249
- **kwargs) -> Optional[Namespace]:
232
+ namespace: str, catalog: Optional[str] = None, *args, **kwargs
233
+ ) -> Optional[Namespace]:
250
234
  """Get table namespace metadata for the specified table namespace. Returns
251
235
  None if the given namespace does not exist."""
252
- return _get_catalog(catalog).impl.get_namespace(
253
- namespace,
254
- *args,
255
- **kwargs)
236
+ return _get_catalog(catalog).impl.get_namespace(namespace, *args, **kwargs)
256
237
 
257
238
 
258
239
  def namespace_exists(
259
- namespace: str,
260
- catalog: Optional[str] = None,
261
- *args,
262
- **kwargs) -> bool:
240
+ namespace: str, catalog: Optional[str] = None, *args, **kwargs
241
+ ) -> bool:
263
242
  """Returns True if the given table namespace exists, False if not."""
264
- return _get_catalog(catalog).impl.namespace_exists(
265
- namespace,
266
- *args,
267
- **kwargs)
243
+ return _get_catalog(catalog).impl.namespace_exists(namespace, *args, **kwargs)
268
244
 
269
245
 
270
246
  def create_namespace(
271
- namespace: str,
272
- permissions: Dict[str, Any],
273
- catalog: Optional[str] = None,
274
- *args,
275
- **kwargs) -> Namespace:
247
+ namespace: str,
248
+ permissions: Dict[str, Any],
249
+ catalog: Optional[str] = None,
250
+ *args,
251
+ **kwargs,
252
+ ) -> Namespace:
276
253
  """Creates a table namespace with the given name and permissions. Returns
277
254
  the created namespace. Raises an error if the namespace already exists."""
278
255
  return _get_catalog(catalog).impl.create_namespace(
279
- namespace,
280
- permissions,
281
- *args,
282
- **kwargs)
256
+ namespace, permissions, *args, **kwargs
257
+ )
283
258
 
284
259
 
285
260
  def alter_namespace(
286
- namespace: str,
287
- catalog: Optional[str] = None,
288
- permissions: Optional[Dict[str, Any]] = None,
289
- new_namespace: Optional[str] = None,
290
- *args,
291
- **kwargs) -> None:
261
+ namespace: str,
262
+ catalog: Optional[str] = None,
263
+ permissions: Optional[Dict[str, Any]] = None,
264
+ new_namespace: Optional[str] = None,
265
+ *args,
266
+ **kwargs,
267
+ ) -> None:
292
268
  """Alter table namespace definition."""
293
269
  _get_catalog(catalog).impl.alter_namespace(
294
- namespace,
295
- permissions,
296
- new_namespace,
297
- *args,
298
- **kwargs)
270
+ namespace, permissions, new_namespace, *args, **kwargs
271
+ )
299
272
 
300
273
 
301
274
  def drop_namespace(
302
- namespace: str,
303
- catalog: Optional[str] = None,
304
- purge: bool = False,
305
- *args,
306
- **kwargs) -> None:
275
+ namespace: str, catalog: Optional[str] = None, purge: bool = False, *args, **kwargs
276
+ ) -> None:
307
277
  """Drop the given namespace and all of its tables from the catalog,
308
278
  optionally purging them."""
309
- _get_catalog(catalog).impl.drop_namespace(
310
- namespace,
311
- purge,
312
- *args,
313
- **kwargs)
279
+ _get_catalog(catalog).impl.drop_namespace(namespace, purge, *args, **kwargs)
314
280
 
315
281
 
316
282
  def default_namespace(catalog: Optional[str] = None) -> str: