deltacat 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "0.2.9"
47
+ __version__ = "0.2.11"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -1,4 +1,6 @@
1
1
  from deltacat.aws.redshift.model.manifest import (
2
+ EntryFileParams,
3
+ EntryType,
2
4
  Manifest,
3
5
  ManifestAuthor,
4
6
  ManifestEntry,
@@ -7,6 +9,8 @@ from deltacat.aws.redshift.model.manifest import (
7
9
  )
8
10
 
9
11
  __all__ = [
12
+ "EntryFileParams",
13
+ "EntryType",
10
14
  "Manifest",
11
15
  "ManifestAuthor",
12
16
  "ManifestEntry",
@@ -5,12 +5,63 @@ import itertools
5
5
  import logging
6
6
  from typing import Any, Dict, List, Optional
7
7
  from uuid import uuid4
8
+ from enum import Enum
8
9
 
9
10
  from deltacat import logs
10
11
 
11
12
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
12
13
 
13
14
 
15
+ class EntryType(str, Enum):
16
+ """
17
+ Enum representing all possible content categories of a manifest entry file
18
+ """
19
+
20
+ DATA = "data"
21
+ POSITIONAL_DELETE = "positional_delete"
22
+ EQUALITY_DELETE = "equality_delete"
23
+
24
+ @classmethod
25
+ def get_default(cls):
26
+ return EntryType.DATA
27
+
28
+ @classmethod
29
+ def list(cls):
30
+ return [c.value for c in EntryType]
31
+
32
+
33
+ class EntryFileParams(dict):
34
+ """
35
+ Represents parameters relevant to the underlying contents of a manifest entry. Contains all parameters required to support DELETEs
36
+ equality_column_names: List of column names that would be used to determine row equality for equality deletes. Relevant only to equality deletes
37
+ position: Ordinal position of a deleted row in the target data file identified by uri, starting at 0. Relevant only to positional deletes
38
+ """
39
+
40
+ @staticmethod
41
+ def of(
42
+ equality_column_names: Optional[List[str]] = None,
43
+ position: Optional[int] = None,
44
+ ) -> EntryFileParams:
45
+ entry_file_params = EntryFileParams()
46
+ if equality_column_names is not None:
47
+ entry_file_params["equality_column_names"] = equality_column_names
48
+ if position is not None:
49
+ entry_file_params["position"] = position
50
+ return entry_file_params
51
+
52
+ @property
53
+ def equality_column_names(self) -> Optional[List[str]]:
54
+ return self.get("equality_column_names")
55
+
56
+ @property
57
+ def url(self) -> Optional[str]:
58
+ return self.get("url")
59
+
60
+ @property
61
+ def position(self) -> Optional[int]:
62
+ return self.get("position")
63
+
64
+
14
65
  class Manifest(dict):
15
66
  @staticmethod
16
67
  def _build_manifest(
@@ -18,6 +69,7 @@ class Manifest(dict):
18
69
  entries: Optional[ManifestEntryList],
19
70
  author: Optional[ManifestAuthor] = None,
20
71
  uuid: str = None,
72
+ entry_type: Optional[EntryType] = None,
21
73
  ) -> Manifest:
22
74
  if not uuid:
23
75
  uuid = str(uuid4())
@@ -29,6 +81,8 @@ class Manifest(dict):
29
81
  manifest["entries"] = entries
30
82
  if author is not None:
31
83
  manifest["author"] = author
84
+ if entry_type is not None:
85
+ manifest["entry_type"] = entry_type.value
32
86
  return manifest
33
87
 
34
88
  @staticmethod
@@ -36,6 +90,7 @@ class Manifest(dict):
36
90
  entries: ManifestEntryList,
37
91
  author: Optional[ManifestAuthor] = None,
38
92
  uuid: str = None,
93
+ entry_type: Optional[EntryType] = None,
39
94
  ) -> Manifest:
40
95
  if not uuid:
41
96
  uuid = str(uuid4())
@@ -78,8 +133,9 @@ class Manifest(dict):
78
133
  content_type,
79
134
  content_encoding,
80
135
  total_source_content_length,
136
+ entry_type=entry_type,
81
137
  )
82
- manifest = Manifest._build_manifest(meta, entries, author, uuid)
138
+ manifest = Manifest._build_manifest(meta, entries, author, uuid, entry_type)
83
139
  return manifest
84
140
 
85
141
  @staticmethod
@@ -128,6 +184,7 @@ class ManifestMeta(dict):
128
184
  source_content_length: Optional[int] = None,
129
185
  credentials: Optional[Dict[str, str]] = None,
130
186
  content_type_parameters: Optional[List[Dict[str, str]]] = None,
187
+ entry_type: Optional[EntryType] = None,
131
188
  ) -> ManifestMeta:
132
189
  manifest_meta = ManifestMeta()
133
190
  if record_count is not None:
@@ -144,6 +201,8 @@ class ManifestMeta(dict):
144
201
  manifest_meta["content_encoding"] = content_encoding
145
202
  if credentials is not None:
146
203
  manifest_meta["credentials"] = credentials
204
+ if entry_type is not None:
205
+ manifest_meta["entry_type"] = entry_type.value
147
206
  return manifest_meta
148
207
 
149
208
  @property
@@ -178,6 +237,13 @@ class ManifestMeta(dict):
178
237
  def credentials(self) -> Optional[Dict[str, str]]:
179
238
  return self.get("credentials")
180
239
 
240
+ @property
241
+ def entry_type(self) -> Optional[EntryType]:
242
+ val = self.get("entry_type")
243
+ if val is not None:
244
+ return EntryType(self["entry_type"])
245
+ return val
246
+
181
247
 
182
248
  class ManifestAuthor(dict):
183
249
  @staticmethod
@@ -206,6 +272,8 @@ class ManifestEntry(dict):
206
272
  mandatory: bool = True,
207
273
  uri: Optional[str] = None,
208
274
  uuid: Optional[str] = None,
275
+ entry_type: Optional[EntryType] = None,
276
+ entry_file_params: Optional[EntryFileParams] = None,
209
277
  ) -> ManifestEntry:
210
278
  manifest_entry = ManifestEntry()
211
279
  if not (uri or url):
@@ -222,6 +290,16 @@ class ManifestEntry(dict):
222
290
  manifest_entry["mandatory"] = mandatory
223
291
  if uuid is not None:
224
292
  manifest_entry["id"] = uuid
293
+ if entry_type is not None:
294
+ manifest_entry["entry_type"] = entry_type.value
295
+ if entry_file_params is not None:
296
+ if entry_file_params.get("url") != manifest_entry.get("url"):
297
+ msg = (
298
+ f"Expected manifest entry url: {manifest_entry.url}"
299
+ f" and entry_file_params: '{entry_file_params.url}' to match"
300
+ )
301
+ raise ValueError(msg)
302
+ manifest_entry["entry_file_params"] = entry_file_params
225
303
  return manifest_entry
226
304
 
227
305
  @staticmethod
@@ -268,6 +346,20 @@ class ManifestEntry(dict):
268
346
  def id(self) -> Optional[str]:
269
347
  return self.get("id")
270
348
 
349
+ @property
350
+ def entry_type(self) -> Optional[EntryType]:
351
+ val = self.get("entry_type")
352
+ if val is not None:
353
+ return EntryType(self["entry_type"])
354
+ return val
355
+
356
+ @property
357
+ def entry_file_params(self) -> Optional[EntryFileParams]:
358
+ val: Dict[str, Any] = self.get("entry_file_params")
359
+ if val is not None and not isinstance(val, EntryFileParams):
360
+ self["entry_file_params"] = val = EntryFileParams(val)
361
+ return val
362
+
271
363
 
272
364
  class ManifestEntryList(List[ManifestEntry]):
273
365
  @staticmethod
@@ -1,5 +1,7 @@
1
1
  import botocore
2
+ import logging
2
3
  from typing import Dict, Optional, List, Tuple
4
+ from deltacat import logs
3
5
  from deltacat.types.media import ContentEncoding, ContentType
4
6
  from deltacat.types.partial_download import PartialParquetParameters
5
7
  from deltacat.storage import (
@@ -17,6 +19,8 @@ from deltacat.compute.compactor_v2.constants import (
17
19
  PARQUET_TO_PYARROW_INFLATION,
18
20
  )
19
21
 
22
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
23
+
20
24
 
21
25
  def _get_parquet_type_params_if_exist(
22
26
  entry: ManifestEntry,
@@ -133,11 +137,15 @@ def hash_bucket_resource_options_provider(
133
137
  ray_custom_resources: Optional[Dict] = None,
134
138
  **kwargs,
135
139
  ) -> Dict:
140
+ debug_memory_params = {"hash_bucket_task_index": index}
136
141
  size_bytes = 0.0
137
142
  num_rows = 0
138
143
  total_pk_size = 0
139
144
 
140
145
  if not item.manifest or not item.manifest.entries:
146
+ logger.debug(
147
+ f"[Hash bucket task {index}]: No manifest entries, skipping memory allocation calculation"
148
+ )
141
149
  return {"CPU": 0.01}
142
150
 
143
151
  for entry in item.manifest.entries:
@@ -165,9 +173,17 @@ def hash_bucket_resource_options_provider(
165
173
  # total size + pk size + pk hash column + hash bucket index column
166
174
  # Refer to hash_bucket step for more details.
167
175
  total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
176
+ debug_memory_params["size_bytes"] = size_bytes
177
+ debug_memory_params["num_rows"] = num_rows
178
+ debug_memory_params["total_pk_size"] = total_pk_size
179
+ debug_memory_params["total_memory"] = total_memory
168
180
 
169
181
  # Consider buffer
170
182
  total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
183
+ debug_memory_params["total_memory_with_buffer"] = total_memory
184
+ logger.debug(
185
+ f"[Hash bucket task {index}]: Params used for calculating hash bucketing memory: {debug_memory_params}"
186
+ )
171
187
 
172
188
  return get_task_options(0.01, total_memory, ray_custom_resources)
173
189
 
@@ -186,10 +202,13 @@ def merge_resource_options_provider(
186
202
  deltacat_storage_kwargs: Optional[Dict] = {},
187
203
  **kwargs,
188
204
  ) -> Dict:
205
+ debug_memory_params = {"merge_task_index": index}
189
206
  hb_group_idx = item[0]
190
207
 
191
208
  data_size = hash_group_size_bytes.get(hb_group_idx, 0)
192
209
  num_rows = hash_group_num_rows.get(hb_group_idx, 0)
210
+ debug_memory_params["data_size_from_hash_group"] = data_size
211
+ debug_memory_params["num_rows_from_hash_group"] = num_rows
193
212
 
194
213
  # upper bound for pk size of incremental
195
214
  pk_size_bytes = data_size
@@ -205,10 +224,13 @@ def merge_resource_options_provider(
205
224
  round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
206
225
  / round_completion_info.compacted_pyarrow_write_result.file_bytes
207
226
  )
227
+ debug_memory_params["previous_inflation"] = previous_inflation
228
+
208
229
  average_record_size = (
209
230
  round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
210
231
  / round_completion_info.compacted_pyarrow_write_result.records
211
232
  )
233
+ debug_memory_params["average_record_size"] = average_record_size
212
234
 
213
235
  iterable = hash_group_index_to_hash_bucket_indices(
214
236
  hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
@@ -256,7 +278,16 @@ def merge_resource_options_provider(
256
278
  + num_rows * 20
257
279
  + incremental_index_array_size
258
280
  )
281
+ debug_memory_params["data_size"] = data_size
282
+ debug_memory_params["num_rows"] = num_rows
283
+ debug_memory_params["pk_size_bytes"] = pk_size_bytes
284
+ debug_memory_params["incremental_index_array_size"] = incremental_index_array_size
285
+ debug_memory_params["total_memory"] = total_memory
259
286
 
260
287
  total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
288
+ debug_memory_params["total_memory_with_buffer"] = total_memory
289
+ logger.debug(
290
+ f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}"
291
+ )
261
292
 
262
293
  return get_task_options(0.01, total_memory, ray_custom_resources)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=9x12tKzGJVcmgVKVWjPCgZHxla7VH_PQf3HUvflyJZc,1777
1
+ deltacat/__init__.py,sha256=35fh2bJ52CtfEDElc4OAwCeiXNY07uCDVhpQZzewvp8,1778
2
2
  deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
3
3
  deltacat/exceptions.py,sha256=xqZf8CwysNYP2d39pf27OnXGStPREgBgIM-e2Tts-TI,199
4
4
  deltacat/logs.py,sha256=9XWuTBoWhhAF9rAL6t9veXmnAlJHsaqk0lTxteVPqyQ,5674
@@ -6,9 +6,9 @@ deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  deltacat/aws/clients.py,sha256=wWiqXyZPWXezdEbhQ7DLwEVnYV6KiitqzBc5B4UAwc0,6184
7
7
  deltacat/aws/constants.py,sha256=luXWMO_8eatq8f9NlFjNM7q362j77JwzTM2BEVS_8-8,353
8
8
  deltacat/aws/s3u.py,sha256=s2On5X3IQiCsCMKw4lpfV1GfKQVWOXNsdAmIJK5PEM0,18610
9
- deltacat/aws/redshift/__init__.py,sha256=fjuv3jWdPE8IgF4uSrL0YEqV3XUfqDULX3xV27ICceo,266
9
+ deltacat/aws/redshift/__init__.py,sha256=7SvjG-dqox8zZUhFicTsUvpG5vXYDl_QQ3ohlHOgTKc,342
10
10
  deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- deltacat/aws/redshift/model/manifest.py,sha256=N1RRGi1Rbou_9HQieoRCI_wE7eAf5eU_FTZ7dNPvUyY,9682
11
+ deltacat/aws/redshift/model/manifest.py,sha256=ThgpdwzaWz493Zz9e8HSWwuxEheA1nDuypM3pe4vozk,12987
12
12
  deltacat/benchmarking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  deltacat/benchmarking/benchmark_parquet_reads.py,sha256=2BctkvXAYcAxokLwMSTu4TQ6-HGqzkgYcVEAzPN2QQo,1709
14
14
  deltacat/benchmarking/conftest.py,sha256=6M9NJ71vnOpeMxG-Ly9UWRsgZmky5-1GTuoRD-OElng,1604
@@ -63,7 +63,7 @@ deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=rNKZisxGrLQOkw
63
63
  deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQl8C5lBEr8gbNfbsw,1932
64
64
  deltacat/compute/compactor_v2/utils/io.py,sha256=jgIfwrfH2mTFUx1M0TgwZGGfrS4IXjP1PmqwaQmNAJM,5092
65
65
  deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=MAscmL35WfwN7Is72aFlD_cGhxtZgjRwwR5kS9Yn2uU,11393
66
- deltacat/compute/compactor_v2/utils/task_options.py,sha256=1-wIIXP0gDUJGdl8omMF5Q9kZs2oeu5WddgCnwBh3RE,8681
66
+ deltacat/compute/compactor_v2/utils/task_options.py,sha256=LA1QbiDv3f9LJQwjKz3-YH3TpK3exL1c5acaGAOF57E,10210
67
67
  deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
68
  deltacat/compute/metastats/meta_stats.py,sha256=78hN3aN5wLHUFJsZXuv2JLeqA35HZ8mLUWJDMslMj5Q,18731
69
69
  deltacat/compute/metastats/stats.py,sha256=8iUiSXOAjqiEeNP5RIb5gvhykBgpNHD5IKkB8zsPR0E,7363
@@ -181,8 +181,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
181
181
  deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
182
182
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
183
183
  deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
184
- deltacat-0.2.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
185
- deltacat-0.2.9.dist-info/METADATA,sha256=XnXwpmM03bCIv-C-znj2rwE_6FDmI68H6zFL4icWMII,1779
186
- deltacat-0.2.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
187
- deltacat-0.2.9.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
188
- deltacat-0.2.9.dist-info/RECORD,,
184
+ deltacat-0.2.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
185
+ deltacat-0.2.11.dist-info/METADATA,sha256=paqGLkpxBHgQpjHWyXxTRSDC1WiiTkFCeVA64SHJg_Y,1780
186
+ deltacat-0.2.11.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
187
+ deltacat-0.2.11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
188
+ deltacat-0.2.11.dist-info/RECORD,,