deltacat 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/compute/compactor_v2/utils/task_options.py +31 -0
- {deltacat-0.2.9.dist-info → deltacat-0.2.11.dist-info}/METADATA +1 -1
- {deltacat-0.2.9.dist-info → deltacat-0.2.11.dist-info}/RECORD +9 -9
- {deltacat-0.2.9.dist-info → deltacat-0.2.11.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-0.2.11.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-0.2.11.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
from deltacat.aws.redshift.model.manifest import (
|
2
|
+
EntryFileParams,
|
3
|
+
EntryType,
|
2
4
|
Manifest,
|
3
5
|
ManifestAuthor,
|
4
6
|
ManifestEntry,
|
@@ -7,6 +9,8 @@ from deltacat.aws.redshift.model.manifest import (
|
|
7
9
|
)
|
8
10
|
|
9
11
|
__all__ = [
|
12
|
+
"EntryFileParams",
|
13
|
+
"EntryType",
|
10
14
|
"Manifest",
|
11
15
|
"ManifestAuthor",
|
12
16
|
"ManifestEntry",
|
@@ -5,12 +5,63 @@ import itertools
|
|
5
5
|
import logging
|
6
6
|
from typing import Any, Dict, List, Optional
|
7
7
|
from uuid import uuid4
|
8
|
+
from enum import Enum
|
8
9
|
|
9
10
|
from deltacat import logs
|
10
11
|
|
11
12
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
12
13
|
|
13
14
|
|
15
|
+
class EntryType(str, Enum):
|
16
|
+
"""
|
17
|
+
Enum representing all possible content categories of a manifest entry file
|
18
|
+
"""
|
19
|
+
|
20
|
+
DATA = "data"
|
21
|
+
POSITIONAL_DELETE = "positional_delete"
|
22
|
+
EQUALITY_DELETE = "equality_delete"
|
23
|
+
|
24
|
+
@classmethod
|
25
|
+
def get_default(cls):
|
26
|
+
return EntryType.DATA
|
27
|
+
|
28
|
+
@classmethod
|
29
|
+
def list(cls):
|
30
|
+
return [c.value for c in EntryType]
|
31
|
+
|
32
|
+
|
33
|
+
class EntryFileParams(dict):
|
34
|
+
"""
|
35
|
+
Represents parameters relevant to the underlying contents of a manifest entry. Contains all parameters required to support DELETEs.
|
36
|
+
equality_column_names: List of column names that would be used to determine row equality for equality deletes. Relevant only to equality deletes
|
37
|
+
position: Ordinal position of a deleted row in the target data file identified by uri, starting at 0. Relevant only to positional deletes
|
38
|
+
"""
|
39
|
+
|
40
|
+
@staticmethod
|
41
|
+
def of(
|
42
|
+
equality_column_names: Optional[List[str]] = None,
|
43
|
+
position: Optional[int] = None,
|
44
|
+
) -> EntryFileParams:
|
45
|
+
entry_file_params = EntryFileParams()
|
46
|
+
if equality_column_names is not None:
|
47
|
+
entry_file_params["equality_column_names"] = equality_column_names
|
48
|
+
if position is not None:
|
49
|
+
entry_file_params["position"] = position
|
50
|
+
return entry_file_params
|
51
|
+
|
52
|
+
@property
|
53
|
+
def equality_column_names(self) -> Optional[List[str]]:
|
54
|
+
return self.get("equality_column_names")
|
55
|
+
|
56
|
+
@property
|
57
|
+
def url(self) -> Optional[str]:
|
58
|
+
return self.get("url")
|
59
|
+
|
60
|
+
@property
|
61
|
+
def position(self) -> Optional[int]:
|
62
|
+
return self.get("position")
|
63
|
+
|
64
|
+
|
14
65
|
class Manifest(dict):
|
15
66
|
@staticmethod
|
16
67
|
def _build_manifest(
|
@@ -18,6 +69,7 @@ class Manifest(dict):
|
|
18
69
|
entries: Optional[ManifestEntryList],
|
19
70
|
author: Optional[ManifestAuthor] = None,
|
20
71
|
uuid: str = None,
|
72
|
+
entry_type: Optional[EntryType] = None,
|
21
73
|
) -> Manifest:
|
22
74
|
if not uuid:
|
23
75
|
uuid = str(uuid4())
|
@@ -29,6 +81,8 @@ class Manifest(dict):
|
|
29
81
|
manifest["entries"] = entries
|
30
82
|
if author is not None:
|
31
83
|
manifest["author"] = author
|
84
|
+
if entry_type is not None:
|
85
|
+
manifest["entry_type"] = entry_type.value
|
32
86
|
return manifest
|
33
87
|
|
34
88
|
@staticmethod
|
@@ -36,6 +90,7 @@ class Manifest(dict):
|
|
36
90
|
entries: ManifestEntryList,
|
37
91
|
author: Optional[ManifestAuthor] = None,
|
38
92
|
uuid: str = None,
|
93
|
+
entry_type: Optional[EntryType] = None,
|
39
94
|
) -> Manifest:
|
40
95
|
if not uuid:
|
41
96
|
uuid = str(uuid4())
|
@@ -78,8 +133,9 @@ class Manifest(dict):
|
|
78
133
|
content_type,
|
79
134
|
content_encoding,
|
80
135
|
total_source_content_length,
|
136
|
+
entry_type=entry_type,
|
81
137
|
)
|
82
|
-
manifest = Manifest._build_manifest(meta, entries, author, uuid)
|
138
|
+
manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
|
83
139
|
return manifest
|
84
140
|
|
85
141
|
@staticmethod
|
@@ -128,6 +184,7 @@ class ManifestMeta(dict):
|
|
128
184
|
source_content_length: Optional[int] = None,
|
129
185
|
credentials: Optional[Dict[str, str]] = None,
|
130
186
|
content_type_parameters: Optional[List[Dict[str, str]]] = None,
|
187
|
+
entry_type: Optional[EntryType] = None,
|
131
188
|
) -> ManifestMeta:
|
132
189
|
manifest_meta = ManifestMeta()
|
133
190
|
if record_count is not None:
|
@@ -144,6 +201,8 @@ class ManifestMeta(dict):
|
|
144
201
|
manifest_meta["content_encoding"] = content_encoding
|
145
202
|
if credentials is not None:
|
146
203
|
manifest_meta["credentials"] = credentials
|
204
|
+
if entry_type is not None:
|
205
|
+
manifest_meta["entry_type"] = entry_type.value
|
147
206
|
return manifest_meta
|
148
207
|
|
149
208
|
@property
|
@@ -178,6 +237,13 @@ class ManifestMeta(dict):
|
|
178
237
|
def credentials(self) -> Optional[Dict[str, str]]:
|
179
238
|
return self.get("credentials")
|
180
239
|
|
240
|
+
@property
|
241
|
+
def entry_type(self) -> Optional[EntryType]:
|
242
|
+
val = self.get("entry_type")
|
243
|
+
if val is not None:
|
244
|
+
return EntryType(self["entry_type"])
|
245
|
+
return val
|
246
|
+
|
181
247
|
|
182
248
|
class ManifestAuthor(dict):
|
183
249
|
@staticmethod
|
@@ -206,6 +272,8 @@ class ManifestEntry(dict):
|
|
206
272
|
mandatory: bool = True,
|
207
273
|
uri: Optional[str] = None,
|
208
274
|
uuid: Optional[str] = None,
|
275
|
+
entry_type: Optional[EntryType] = None,
|
276
|
+
entry_file_params: Optional[EntryFileParams] = None,
|
209
277
|
) -> ManifestEntry:
|
210
278
|
manifest_entry = ManifestEntry()
|
211
279
|
if not (uri or url):
|
@@ -222,6 +290,16 @@ class ManifestEntry(dict):
|
|
222
290
|
manifest_entry["mandatory"] = mandatory
|
223
291
|
if uuid is not None:
|
224
292
|
manifest_entry["id"] = uuid
|
293
|
+
if entry_type is not None:
|
294
|
+
manifest_entry["entry_type"] = entry_type.value
|
295
|
+
if entry_file_params is not None:
|
296
|
+
if entry_file_params.get("url") != manifest_entry.get("url"):
|
297
|
+
msg = (
|
298
|
+
f"Expected manifest entry url: {manifest_entry.url}"
|
299
|
+
f" and entry_file_params: '{entry_file_params.url}' to match"
|
300
|
+
)
|
301
|
+
raise ValueError(msg)
|
302
|
+
manifest_entry["entry_file_params"] = entry_file_params
|
225
303
|
return manifest_entry
|
226
304
|
|
227
305
|
@staticmethod
|
@@ -268,6 +346,20 @@ class ManifestEntry(dict):
|
|
268
346
|
def id(self) -> Optional[str]:
|
269
347
|
return self.get("id")
|
270
348
|
|
349
|
+
@property
|
350
|
+
def entry_type(self) -> Optional[EntryType]:
|
351
|
+
val = self.get("entry_type")
|
352
|
+
if val is not None:
|
353
|
+
return EntryType(self["entry_type"])
|
354
|
+
return val
|
355
|
+
|
356
|
+
@property
|
357
|
+
def entry_file_params(self) -> Optional[EntryFileParams]:
|
358
|
+
val: Dict[str, Any] = self.get("entry_file_params")
|
359
|
+
if val is not None and not isinstance(val, EntryFileParams):
|
360
|
+
self["entry_file_params"] = val = EntryFileParams(val)
|
361
|
+
return val
|
362
|
+
|
271
363
|
|
272
364
|
class ManifestEntryList(List[ManifestEntry]):
|
273
365
|
@staticmethod
|
@@ -1,5 +1,7 @@
|
|
1
1
|
import botocore
|
2
|
+
import logging
|
2
3
|
from typing import Dict, Optional, List, Tuple
|
4
|
+
from deltacat import logs
|
3
5
|
from deltacat.types.media import ContentEncoding, ContentType
|
4
6
|
from deltacat.types.partial_download import PartialParquetParameters
|
5
7
|
from deltacat.storage import (
|
@@ -17,6 +19,8 @@ from deltacat.compute.compactor_v2.constants import (
|
|
17
19
|
PARQUET_TO_PYARROW_INFLATION,
|
18
20
|
)
|
19
21
|
|
22
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
23
|
+
|
20
24
|
|
21
25
|
def _get_parquet_type_params_if_exist(
|
22
26
|
entry: ManifestEntry,
|
@@ -133,11 +137,15 @@ def hash_bucket_resource_options_provider(
|
|
133
137
|
ray_custom_resources: Optional[Dict] = None,
|
134
138
|
**kwargs,
|
135
139
|
) -> Dict:
|
140
|
+
debug_memory_params = {"hash_bucket_task_index": index}
|
136
141
|
size_bytes = 0.0
|
137
142
|
num_rows = 0
|
138
143
|
total_pk_size = 0
|
139
144
|
|
140
145
|
if not item.manifest or not item.manifest.entries:
|
146
|
+
logger.debug(
|
147
|
+
f"[Hash bucket task {index}]: No manifest entries, skipping memory allocation calculation"
|
148
|
+
)
|
141
149
|
return {"CPU": 0.01}
|
142
150
|
|
143
151
|
for entry in item.manifest.entries:
|
@@ -165,9 +173,17 @@ def hash_bucket_resource_options_provider(
|
|
165
173
|
# total size + pk size + pk hash column + hash bucket index column
|
166
174
|
# Refer to hash_bucket step for more details.
|
167
175
|
total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
|
176
|
+
debug_memory_params["size_bytes"] = size_bytes
|
177
|
+
debug_memory_params["num_rows"] = num_rows
|
178
|
+
debug_memory_params["total_pk_size"] = total_pk_size
|
179
|
+
debug_memory_params["total_memory"] = total_memory
|
168
180
|
|
169
181
|
# Consider buffer
|
170
182
|
total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
|
183
|
+
debug_memory_params["total_memory_with_buffer"] = total_memory
|
184
|
+
logger.debug(
|
185
|
+
f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
|
186
|
+
)
|
171
187
|
|
172
188
|
return get_task_options(0.01, total_memory, ray_custom_resources)
|
173
189
|
|
@@ -186,10 +202,13 @@ def merge_resource_options_provider(
|
|
186
202
|
deltacat_storage_kwargs: Optional[Dict] = {},
|
187
203
|
**kwargs,
|
188
204
|
) -> Dict:
|
205
|
+
debug_memory_params = {"merge_task_index": index}
|
189
206
|
hb_group_idx = item[0]
|
190
207
|
|
191
208
|
data_size = hash_group_size_bytes.get(hb_group_idx, 0)
|
192
209
|
num_rows = hash_group_num_rows.get(hb_group_idx, 0)
|
210
|
+
debug_memory_params["data_size_from_hash_group"] = data_size
|
211
|
+
debug_memory_params["num_rows_from_hash_group"] = num_rows
|
193
212
|
|
194
213
|
# upper bound for pk size of incremental
|
195
214
|
pk_size_bytes = data_size
|
@@ -205,10 +224,13 @@ def merge_resource_options_provider(
|
|
205
224
|
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
206
225
|
/ round_completion_info.compacted_pyarrow_write_result.file_bytes
|
207
226
|
)
|
227
|
+
debug_memory_params["previous_inflation"] = previous_inflation
|
228
|
+
|
208
229
|
average_record_size = (
|
209
230
|
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
210
231
|
/ round_completion_info.compacted_pyarrow_write_result.records
|
211
232
|
)
|
233
|
+
debug_memory_params["average_record_size"] = average_record_size
|
212
234
|
|
213
235
|
iterable = hash_group_index_to_hash_bucket_indices(
|
214
236
|
hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
|
@@ -256,7 +278,16 @@ def merge_resource_options_provider(
|
|
256
278
|
+ num_rows * 20
|
257
279
|
+ incremental_index_array_size
|
258
280
|
)
|
281
|
+
debug_memory_params["data_size"] = data_size
|
282
|
+
debug_memory_params["num_rows"] = num_rows
|
283
|
+
debug_memory_params["pk_size_bytes"] = pk_size_bytes
|
284
|
+
debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
|
285
|
+
debug_memory_params["total_memory"] = total_memory
|
259
286
|
|
260
287
|
total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
|
288
|
+
debug_memory_params["total_memory_with_buffer"] = total_memory
|
289
|
+
logger.debug(
|
290
|
+
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
|
291
|
+
)
|
261
292
|
|
262
293
|
return get_task_options(0.01, total_memory, ray_custom_resources)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=35fh2bJ52CtfEDElc4OAwCeiXNY07uCDVhpQZzewvp8,1778
|
2
2
|
deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
|
3
3
|
deltacat/exceptions.py,sha256=xqZf8CwysNYP2d39pf27OnXGStPREgBgIM-e2Tts-TI,199
|
4
4
|
deltacat/logs.py,sha256=9XWuTBoWhhAF9rAL6t9veXmnAlJHsaqk0lTxteVPqyQ,5674
|
@@ -6,9 +6,9 @@ deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
deltacat/aws/clients.py,sha256=wWiqXyZPWXezdEbhQ7DLwEVnYV6KiitqzBc5B4UAwc0,6184
|
7
7
|
deltacat/aws/constants.py,sha256=luXWMO_8eatq8f9NlFjNM7q362j77JwzTM2BEVS_8-8,353
|
8
8
|
deltacat/aws/s3u.py,sha256=s2On5X3IQiCsCMKw4lpfV1GfKQVWOXNsdAmIJK5PEM0,18610
|
9
|
-
deltacat/aws/redshift/__init__.py,sha256=
|
9
|
+
deltacat/aws/redshift/__init__.py,sha256=7SvjG-dqox8zZUhFicTsUvpG5vXYDl_QQ3ohlHOgTKc,342
|
10
10
|
deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
deltacat/aws/redshift/model/manifest.py,sha256=
|
11
|
+
deltacat/aws/redshift/model/manifest.py,sha256=ThgpdwzaWz493Zz9e8HSWwuxEheA1nDuypM3pe4vozk,12987
|
12
12
|
deltacat/benchmarking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
deltacat/benchmarking/benchmark_parquet_reads.py,sha256=2BctkvXAYcAxokLwMSTu4TQ6-HGqzkgYcVEAzPN2QQo,1709
|
14
14
|
deltacat/benchmarking/conftest.py,sha256=6M9NJ71vnOpeMxG-Ly9UWRsgZmky5-1GTuoRD-OElng,1604
|
@@ -63,7 +63,7 @@ deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=rNKZisxGrLQOkw
|
|
63
63
|
deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQl8C5lBEr8gbNfbsw,1932
|
64
64
|
deltacat/compute/compactor_v2/utils/io.py,sha256=jgIfwrfH2mTFUx1M0TgwZGGfrS4IXjP1PmqwaQmNAJM,5092
|
65
65
|
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=MAscmL35WfwN7Is72aFlD_cGhxtZgjRwwR5kS9Yn2uU,11393
|
66
|
-
deltacat/compute/compactor_v2/utils/task_options.py,sha256=
|
66
|
+
deltacat/compute/compactor_v2/utils/task_options.py,sha256=LA1QbiDv3f9LJQwjKz3-YH3TpK3exL1c5acaGAOF57E,10210
|
67
67
|
deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
68
68
|
deltacat/compute/metastats/meta_stats.py,sha256=78hN3aN5wLHUFJsZXuv2JLeqA35HZ8mLUWJDMslMj5Q,18731
|
69
69
|
deltacat/compute/metastats/stats.py,sha256=8iUiSXOAjqiEeNP5RIb5gvhykBgpNHD5IKkB8zsPR0E,7363
|
@@ -181,8 +181,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
181
181
|
deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
|
182
182
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
183
183
|
deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
|
184
|
-
deltacat-0.2.
|
185
|
-
deltacat-0.2.
|
186
|
-
deltacat-0.2.
|
187
|
-
deltacat-0.2.
|
188
|
-
deltacat-0.2.
|
184
|
+
deltacat-0.2.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
185
|
+
deltacat-0.2.11.dist-info/METADATA,sha256=paqGLkpxBHgQpjHWyXxTRSDC1WiiTkFCeVA64SHJg_Y,1780
|
186
|
+
deltacat-0.2.11.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
187
|
+
deltacat-0.2.11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
188
|
+
deltacat-0.2.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|