deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in that registry.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/utils/numpy.py CHANGED
@@ -1,21 +1,17 @@
+ from typing import List, Optional
+
  import numpy as np
  import pyarrow as pa
-
  from fsspec import AbstractFileSystem
-
  from ray.data.datasource import BlockWritePathProvider

  from deltacat.types.media import ContentType
- from deltacat.utils import pyarrow as pa_utils
  from deltacat.utils import pandas as pd_utils
+ from deltacat.utils import pyarrow as pa_utils
  from deltacat.utils.common import ReadKwargsProvider

- from typing import List, Optional
-

- def slice_ndarray(
-         np_array: np.ndarray,
-         max_len: Optional[int]) -> List[np.ndarray]:
+ def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarray]:
      """
      Iteratively creates max_len slices from the first dimension of an ndarray.
      """
@@ -23,17 +19,18 @@ def slice_ndarray(
          return [np_array]

      # Slice along the first dimension of the ndarray.
-     return [np_array[i:i + max_len] for i in range(0, len(np_array), max_len)]
+     return [np_array[i : i + max_len] for i in range(0, len(np_array), max_len)]


  def s3_file_to_ndarray(
-         s3_url: str,
-         content_type: str,
-         content_encoding: str,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-         **s3_client_kwargs) -> np.ndarray:
+     s3_url: str,
+     content_type: str,
+     content_encoding: str,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     **s3_client_kwargs
+ ) -> np.ndarray:
      # TODO: Compare perf to s3 -> pyarrow -> pandas [Series/DataFrame] -> numpy
      dataframe = pd_utils.s3_file_to_dataframe(
          s3_url,
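For reference, the reformatted `slice_ndarray` above simply chunks an array along its first dimension into at most `max_len` rows per slice. A minimal usage sketch (the array shape and `max_len` below are illustrative):

```python
import numpy as np

from deltacat.utils.numpy import slice_ndarray

# A 10-row array sliced into chunks of at most 4 rows along the first dimension.
arr = np.arange(20).reshape(10, 2)
slices = slice_ndarray(arr, max_len=4)
assert [len(s) for s in slices] == [4, 4, 2]

# Passing max_len=None returns the original array as a single slice.
assert len(slice_ndarray(arr, None)) == 1
```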
@@ -52,12 +49,13 @@ def ndarray_size(np_array: np.ndarray) -> int:


  def ndarray_to_file(
-         np_array: np.ndarray,
-         path: str,
-         file_system: AbstractFileSystem,
-         block_path_provider: BlockWritePathProvider,
-         content_type: str = ContentType.PARQUET.value,
-         **kwargs) -> None:
+     np_array: np.ndarray,
+     path: str,
+     file_system: AbstractFileSystem,
+     block_path_provider: BlockWritePathProvider,
+     content_type: str = ContentType.PARQUET.value,
+     **kwargs
+ ) -> None:
      """
      Writes the given Numpy ndarray to a file.
      """
deltacat/utils/pandas.py CHANGED
@@ -1,24 +1,25 @@
- import pandas as pd
  import csv
- import math
  import io
  import logging
- import pyarrow as pa
+ import math
+ from typing import Any, Callable, Dict, Iterable, List, Optional

+ import pandas as pd
+ import pyarrow as pa
  from fsspec import AbstractFileSystem
-
  from ray.data.datasource import BlockWritePathProvider

- from deltacat.types.media import ContentType, ContentEncoding, \
-     EXPLICIT_COMPRESSION_CONTENT_TYPES
- from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, \
-     TABULAR_CONTENT_TYPES
  from deltacat import logs
- from deltacat.utils.common import ReadKwargsProvider, ContentTypeKwargsProvider
+ from deltacat.types.media import (
+     DELIMITED_TEXT_CONTENT_TYPES,
+     EXPLICIT_COMPRESSION_CONTENT_TYPES,
+     TABULAR_CONTENT_TYPES,
+     ContentEncoding,
+     ContentType,
+ )
+ from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
  from deltacat.utils.performance import timed_invocation

- from typing import Any, Callable, Dict, Iterable, List, Optional
-
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


@@ -30,7 +31,7 @@ CONTENT_TYPE_TO_PD_READ_FUNC: Dict[str, Callable] = {
      ContentType.PARQUET.value: pd.read_parquet,
      ContentType.FEATHER.value: pd.read_feather,
      ContentType.ORC.value: pd.read_orc,
-     ContentType.JSON.value: pd.read_json
+     ContentType.JSON.value: pd.read_json,
  }


@@ -39,24 +40,21 @@ class ReadKwargsProviderPandasCsvPureUtf8(ContentTypeKwargsProvider):
      as UTF-8 strings (i.e. disables type inference). Useful for ensuring
      lossless reads of UTF-8 delimited text datasets and improving read
      performance in cases where type casting is not required."""
+
      def __init__(self, include_columns: Optional[Iterable[str]] = None):
          self.include_columns = include_columns

-     def _get_kwargs(
-             self,
-             content_type: str,
-             kwargs: Dict[str, Any]) -> Dict[str, Any]:
+     def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
          if content_type in DELIMITED_TEXT_CONTENT_TYPES:
-             include_columns = self.include_columns \
-                 if self.include_columns else kwargs.get("usecols")
+             include_columns = (
+                 self.include_columns if self.include_columns else kwargs.get("usecols")
+             )
              if not include_columns:
                  # read all columns as strings
                  kwargs["dtype"] = str
              else:
                  # read only the included columns as strings
-                 kwargs["dtype"] = {
-                     column_name: str for column_name in include_columns
-                 }
+                 kwargs["dtype"] = {column_name: str for column_name in include_columns}
              # use the fastest available engine for pure utf-8 reads
              kwargs["engine"] = "pyarrow"
          return kwargs
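As a usage sketch: the provider is invoked with `(content_type, kwargs)` (exactly how `s3_file_to_dataframe` calls it further down), and for delimited text it forces string dtypes and the pyarrow engine. The column names below are illustrative:

```python
from deltacat.types.media import ContentType
from deltacat.utils.pandas import ReadKwargsProviderPandasCsvPureUtf8

# Restrict string typing to two illustrative columns of a CSV read.
provider = ReadKwargsProviderPandasCsvPureUtf8(include_columns=["pk", "payload"])
kwargs = provider(ContentType.CSV.value, {})

assert kwargs["dtype"] == {"pk": str, "payload": str}
assert kwargs["engine"] == "pyarrow"
```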
@@ -71,24 +69,17 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
          "keep_default_na": False,
      }
      if content_type == ContentType.TSV.value:
-         return {
-             "sep": "\t",
-             "header": None
-         }
+         return {"sep": "\t", "header": None}
      if content_type == ContentType.CSV.value:
-         return {
-             "sep": ",",
-             "header": None
-         }
+         return {"sep": ",", "header": None}
      if content_type == ContentType.PSV.value:
-         return {
-             "sep": "|",
-             "header": None
-         }
-     if content_type in {ContentType.PARQUET.value,
-                         ContentType.FEATHER.value,
-                         ContentType.ORC.value,
-                         ContentType.JSON.value}:
+         return {"sep": "|", "header": None}
+     if content_type in {
+         ContentType.PARQUET.value,
+         ContentType.FEATHER.value,
+         ContentType.ORC.value,
+         ContentType.JSON.value,
+     }:
          return {}
      raise ValueError(f"Unsupported content type: {content_type}")

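The branch consolidation above does not change behavior; a small sketch of the reader-kwargs lookup (content types chosen for illustration):

```python
from deltacat.types.media import ContentType
from deltacat.utils.pandas import content_type_to_reader_kwargs

# Delimited text types carry explicit separator/header kwargs...
assert content_type_to_reader_kwargs(ContentType.TSV.value) == {
    "sep": "\t",
    "header": None,
}
# ...while self-describing formats like Parquet need none.
assert content_type_to_reader_kwargs(ContentType.PARQUET.value) == {}
```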
@@ -96,13 +87,13 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
  ENCODING_TO_PD_COMPRESSION: Dict[str, str] = {
      ContentEncoding.GZIP.value: "gzip",
      ContentEncoding.BZIP2.value: "bz2",
-     ContentEncoding.IDENTITY.value: "none"
+     ContentEncoding.IDENTITY.value: "none",
  }


  def slice_dataframe(
-         dataframe: pd.DataFrame,
-         max_len: Optional[int]) -> List[pd.DataFrame]:
+     dataframe: pd.DataFrame, max_len: Optional[int]
+ ) -> List[pd.DataFrame]:
      """
      Iteratively create dataframe slices.
      """
@@ -111,12 +102,11 @@ def slice_dataframe(
      dataframes = []
      num_slices = math.ceil(len(dataframe) / max_len)
      for i in range(num_slices):
-         dataframes.append(dataframe[i * max_len: (i + 1) * max_len])
+         dataframes.append(dataframe[i * max_len : (i + 1) * max_len])
      return dataframes


- def concat_dataframes(dataframes: List[pd.DataFrame]) \
-         -> Optional[pd.DataFrame]:
+ def concat_dataframes(dataframes: List[pd.DataFrame]) -> Optional[pd.DataFrame]:
      if dataframes is None or not len(dataframes):
          return None
      if len(dataframes) == 1:
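A minimal sketch of the slice/concat round trip shown above (frame contents and `max_len` are illustrative):

```python
import pandas as pd

from deltacat.utils.pandas import concat_dataframes, slice_dataframe

# ceil(10 / 4) = 3 slices of lengths 4, 4, and 2, then stitched back together.
df = pd.DataFrame({"pk": range(10), "value": range(10, 20)})
slices = slice_dataframe(df, max_len=4)
assert [len(s) for s in slices] == [4, 4, 2]
assert len(concat_dataframes(slices)) == len(df)
```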
@@ -125,10 +115,11 @@ def concat_dataframes(dataframes: List[pd.DataFrame]) \


  def _add_column_kwargs(
-         content_type: str,
-         column_names: Optional[List[str]],
-         include_columns: Optional[List[str]],
-         kwargs: Dict[str, Any]):
+     content_type: str,
+     column_names: Optional[List[str]],
+     include_columns: Optional[List[str]],
+     kwargs: Dict[str, Any],
+ ):

      if content_type in DELIMITED_TEXT_CONTENT_TYPES:
          kwargs["names"] = column_names
@@ -140,25 +131,27 @@ def _add_column_kwargs(
          if include_columns:
              logger.warning(
                  f"Ignoring request to include columns {include_columns} "
-                 f"for non-tabular content type {content_type}")
+                 f"for non-tabular content type {content_type}"
+             )


  def s3_file_to_dataframe(
-         s3_url: str,
-         content_type: str,
-         content_encoding: str,
-         column_names: Optional[List[str]] = None,
-         include_columns: Optional[List[str]] = None,
-         pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-         **s3_client_kwargs) -> pd.DataFrame:
+     s3_url: str,
+     content_type: str,
+     content_encoding: str,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     **s3_client_kwargs,
+ ) -> pd.DataFrame:

      from deltacat.aws import s3u as s3_utils
-     logger.debug(f"Reading {s3_url} to Pandas. Content type: {content_type}. "
-                  f"Encoding: {content_encoding}")
-     s3_obj = s3_utils.get_object_at_url(
-         s3_url,
-         **s3_client_kwargs
+
+     logger.debug(
+         f"Reading {s3_url} to Pandas. Content type: {content_type}. "
+         f"Encoding: {content_encoding}"
      )
+     s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
      logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
      pd_read_func = CONTENT_TYPE_TO_PD_READ_FUNC[content_type]
      args = [io.BytesIO(s3_obj["Body"].read())]
@@ -167,17 +160,12 @@ def s3_file_to_dataframe(

      if content_type in EXPLICIT_COMPRESSION_CONTENT_TYPES:
          kwargs["compression"] = ENCODING_TO_PD_COMPRESSION.get(
-             content_encoding,
-             "infer"
+             content_encoding, "infer"
          )
      if pd_read_func_kwargs_provider:
          kwargs = pd_read_func_kwargs_provider(content_type, kwargs)
      logger.debug(f"Reading {s3_url} via {pd_read_func} with kwargs: {kwargs}")
-     dataframe, latency = timed_invocation(
-         pd_read_func,
-         *args,
-         **kwargs
-     )
+     dataframe, latency = timed_invocation(pd_read_func, *args, **kwargs)
      logger.debug(f"Time to read {s3_url} into Pandas Dataframe: {latency}s")
      return dataframe

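A hedged sketch of calling `s3_file_to_dataframe` as reshaped above; the S3 URL and column names are placeholders, the call needs S3 access at runtime, and any extra keyword arguments are forwarded to deltacat's S3 object fetch (`get_object_at_url`):

```python
from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pandas import (
    ReadKwargsProviderPandasCsvPureUtf8,
    s3_file_to_dataframe,
)

# Hypothetical object and schema; the kwargs provider keeps every column as str.
df = s3_file_to_dataframe(
    "s3://example-bucket/path/to/part-0.csv.gz",
    content_type=ContentType.CSV.value,
    content_encoding=ContentEncoding.GZIP.value,
    column_names=["pk", "payload"],
    pd_read_func_kwargs_provider=ReadKwargsProviderPandasCsvPureUtf8(),
)
print(df.dtypes)
```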
@@ -188,11 +176,8 @@ def dataframe_size(dataframe: pd.DataFrame) -> int:


  def write_csv(
-         dataframe: pd.DataFrame,
-         path: str,
-         *,
-         filesystem: AbstractFileSystem,
-         **kwargs) -> None:
+     dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+ ) -> None:
      with filesystem.open(path, "wb") as f:
          # TODO (pdames): Add support for client-specified compression types.
          with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
@@ -200,31 +185,22 @@ def write_csv(


  def write_parquet(
-         dataframe: pd.DataFrame,
-         path: str,
-         *,
-         filesystem: AbstractFileSystem,
-         **kwargs) -> None:
+     dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+ ) -> None:
      with filesystem.open(path, "wb") as f:
          dataframe.to_parquet(f, **kwargs)


  def write_feather(
-         dataframe: pd.DataFrame,
-         path: str,
-         *,
-         filesystem: AbstractFileSystem,
-         **kwargs) -> None:
+     dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+ ) -> None:
      with filesystem.open(path, "wb") as f:
          dataframe.to_feather(f, **kwargs)


  def write_json(
-         dataframe: pd.DataFrame,
-         path: str,
-         *,
-         filesystem: AbstractFileSystem,
-         **kwargs) -> None:
+     dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+ ) -> None:
      with filesystem.open(path, "wb") as f:
          # TODO (pdames): Add support for client-specified compression types.
          with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
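The writer signatures above are keyword-only past `path`; a minimal local sketch using `write_parquet` through an fsspec filesystem (the output path is illustrative):

```python
import fsspec
import pandas as pd

from deltacat.utils.pandas import write_parquet

df = pd.DataFrame({"pk": [1, 2, 3], "value": ["a", "b", "c"]})
fs = fsspec.filesystem("file")  # local filesystem stand-in for any AbstractFileSystem

# Extra kwargs are passed straight through to DataFrame.to_parquet.
write_parquet(df, "/tmp/deltacat_example.parquet", filesystem=fs, index=False)
```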
@@ -283,12 +259,13 @@ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:


  def dataframe_to_file(
-         dataframe: pd.DataFrame,
-         base_path: str,
-         file_system: AbstractFileSystem,
-         block_path_provider: BlockWritePathProvider,
-         content_type: str = ContentType.PARQUET.value,
-         **kwargs) -> None:
+     dataframe: pd.DataFrame,
+     base_path: str,
+     file_system: AbstractFileSystem,
+     block_path_provider: BlockWritePathProvider,
+     content_type: str = ContentType.PARQUET.value,
+     **kwargs,
+ ) -> None:
      """
      Writes the given Pandas Dataframe to a file.
      """
@@ -299,11 +276,7 @@ def dataframe_to_file(
          raise NotImplementedError(
              f"Pandas writer for content type '{content_type}' not "
              f"implemented. Known content types: "
-             f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}")
+             f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}"
+         )
      path = block_path_provider(base_path)
-     writer(
-         dataframe,
-         path,
-         filesystem=file_system,
-         **writer_kwargs
-     )
+     writer(dataframe, path, filesystem=file_system, **writer_kwargs)
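A hedged sketch of `dataframe_to_file`: the real call sites hand in a Ray `BlockWritePathProvider`, but the function only calls `block_path_provider(base_path)` to derive the output path, so a plain callable stand-in (hypothetical, shown here purely for illustration) is enough for a local smoke test:

```python
import fsspec
import pandas as pd

from deltacat.types.media import ContentType
from deltacat.utils.pandas import dataframe_to_file

# Hypothetical stand-in for Ray's BlockWritePathProvider.
def block_path_provider(base_path: str) -> str:
    return f"{base_path}/block-0.parquet"

df = pd.DataFrame({"pk": [1, 2], "value": ["a", "b"]})
dataframe_to_file(
    df,
    "/tmp/deltacat-example",  # illustrative base path; must already exist
    file_system=fsspec.filesystem("file"),
    block_path_provider=block_path_provider,
    content_type=ContentType.PARQUET.value,
)
```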
deltacat/utils/performance.py CHANGED
@@ -4,11 +4,8 @@ from typing import Any, Callable, Tuple


  def invoke_with_perf_counter(
-         counter: Counter,
-         counter_key: Any,
-         func: Callable,
-         *args,
-         **kwargs) -> Tuple[Any, float]:
+     counter: Counter, counter_key: Any, func: Callable, *args, **kwargs
+ ) -> Tuple[Any, float]:

      start = time.perf_counter()
      result = func(*args, **kwargs)
@@ -18,10 +15,7 @@ def invoke_with_perf_counter(
      return result, latency


- def timed_invocation(
-         func: Callable,
-         *args,
-         **kwargs) -> Tuple[Any, float]:
+ def timed_invocation(func: Callable, *args, **kwargs) -> Tuple[Any, float]:

      start = time.perf_counter()
      result = func(*args, **kwargs)
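Both helpers return `(result, latency_in_seconds)`; `invoke_with_perf_counter` additionally appears to accumulate the measured latency under the given counter key, as its name suggests. A minimal sketch (the counter key below is illustrative):

```python
import time
from collections import Counter

from deltacat.utils.performance import invoke_with_perf_counter, timed_invocation

# Time a single call.
result, latency = timed_invocation(time.sleep, 0.05)
print(f"slept for ~{latency:.3f}s")

# Accumulate latencies across repeated calls under one (illustrative) counter key.
counter = Counter()
for _ in range(3):
    invoke_with_perf_counter(counter, "sleep_total_s", time.sleep, 0.01)
print(counter["sleep_total_s"])
```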