deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/compute/resource_estimation/model.py ADDED
@@ -0,0 +1,165 @@
+from __future__ import annotations
+from enum import Enum
+from typing import Optional
+
+
+class ResourceEstimationMethod(str, Enum):
+    """
+    The default approach executes certain methods in a specific order until the size
+    is estimated by any. The order is as follows:
+    1. CONTENT_TYPE_META
+    2. PREVIOUS_INFLATION
+    This method expects previous inflation and average record bytes to be passed.
+    """
+
+    DEFAULT = "DEFAULT"
+
+    """
+    This approach combines intelligent estimation and inflation based methods
+    and runs them in the order specified below:
+    1. INTELLIGENT_ESTIMATION
+    2. FILE_SAMPLING
+    3. PREVIOUS_INFLATION
+    """
+    DEFAULT_V2 = "DEFAULT_V2"
+
+    """
+    This approach strictly uses previous inflation and average record size to arrive
+    at a resource estimate. It requires users to pass in previous inflation and average
+    record sizes.
+    """
+    PREVIOUS_INFLATION = "PREVIOUS_INFLATION"
+
+    """
+    This approach is similar to PREVIOUS_INFLATION, but it determines average record size
+    and previous inflation by sampling few files in the given set of files.
+    """
+    FILE_SAMPLING = "FILE_SAMPLING"
+
+    """
+    This approach leverages metadata present in content type params.
+    """
+    CONTENT_TYPE_META = "CONTENT_TYPE_META"
+
+    """
+    This approach leverages parquet metadata and granularly estimate resources for each column and
+    then aggregate to arrive at most accurate estimation.
+    """
+    INTELLIGENT_ESTIMATION = "INTELLIGENT_ESTIMATION"
+
+
+class EstimateResourcesParams(dict):
+    """
+    This class represents the parameters required for estimating resources.
+    """
+
+    @staticmethod
+    def of(
+        resource_estimation_method: ResourceEstimationMethod = ResourceEstimationMethod.DEFAULT,
+        previous_inflation: Optional[float] = None,
+        parquet_to_pyarrow_inflation: Optional[float] = None,
+        average_record_size_bytes: Optional[float] = None,
+        max_files_to_sample: Optional[int] = None,
+    ) -> EstimateResourcesParams:
+        result = EstimateResourcesParams()
+        result["previous_inflation"] = previous_inflation
+        result["parquet_to_pyarrow_inflation"] = parquet_to_pyarrow_inflation
+        result["resource_estimation_method"] = resource_estimation_method
+        result["max_files_to_sample"] = max_files_to_sample
+        result["average_record_size_bytes"] = average_record_size_bytes
+        return result
+
+    @property
+    def resource_estimation_method(self) -> ResourceEstimationMethod:
+        return self["resource_estimation_method"]
+
+    @property
+    def max_files_to_sample(self) -> Optional[int]:
+        """
+        Applicable only for FILE_SAMPLING method. This parameter controls the
+        number of files to sample to arrive at average record sizes and previous inflation.
+        """
+        return self.get("max_files_to_sample")
+
+    @property
+    def previous_inflation(self) -> Optional[float]:
+        """
+        This parameter is required for PREVIOUS_INFLATION method. The inflation factor determines
+        a ratio of in-memory size to the on-disk size.
+        """
+        return self.get("previous_inflation")
+
+    @property
+    def parquet_to_pyarrow_inflation(self) -> Optional[float]:
+        """
+        This parameter is required for INTELLIGENT_ESTIMATION or CONTENT_TYPE_META method.
+        This determines inflation factor for parquet estimated size to pyarrow in-memory table size.
+        """
+        return self.get("parquet_to_pyarrow_inflation")
+
+    @property
+    def average_record_size_bytes(self) -> Optional[float]:
+        """
+        This parameter is required for PREVIOUS_INFLATION method. This determines average size of
+        records in bytes in a given file or entity.
+        """
+        return self.get("average_record_size_bytes")
+
+
+class OperationType(str, Enum):
+    """
+    This operation type is used when user would download the given entities using pyarrow library.
+    """
+
+    PYARROW_DOWNLOAD = "DOWNLOAD"
+
+
+class EstimatedResources(dict):
+    """
+    This class represents the resource requirements for a certain type of operation.
+    For example, downloading a delta requires certain amount of memory.
+    """
+
+    @staticmethod
+    def of(memory_bytes: float, statistics: Statistics = None) -> EstimatedResources:
+        result = EstimatedResources()
+        result["memory_bytes"] = memory_bytes
+        result["statistics"] = statistics
+        return result
+
+    @property
+    def memory_bytes(self) -> float:
+        return self["memory_bytes"]
+
+    @property
+    def statistics(self) -> Optional[Statistics]:
+        return self.get("statistics")
+
+
+class Statistics(dict):
+    """
+    This class represents the statistics of underlying objects that was used
+    to estimate the resource required.
+    """
+
+    @staticmethod
+    def of(
+        in_memory_size_bytes: float, record_count: int, on_disk_size_bytes: float
+    ) -> Statistics:
+        result = Statistics()
+        result["in_memory_size_bytes"] = in_memory_size_bytes
+        result["record_count"] = record_count
+        result["on_disk_size_bytes"] = on_disk_size_bytes
+        return result
+
+    @property
+    def in_memory_size_bytes(self) -> float:
+        return self["in_memory_size_bytes"]
+
+    @property
+    def record_count(self) -> int:
+        return self["record_count"]
+
+    @property
+    def on_disk_size_bytes(self) -> float:
+        return self["on_disk_size_bytes"]
deltacat/compute/resource_estimation/parquet.py ADDED
@@ -0,0 +1,108 @@
+import logging
+from typing import Optional
+from deltacat import logs
+from pyarrow.parquet import ColumnChunkMetaData
+from deltacat.constants import NULL_SIZE_BYTES
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def _observed_string_size(min_value: str, max_value: str) -> float:
+    """
+    Pyarrow uses few additional bytes to store each string.
+    """
+    return (len(min_value) + len(max_value)) / 2 + 4
+
+
+def _int96_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 12
+
+
+def _int64_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 8
+
+
+def _int32_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 4
+
+
+def _boolean_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values
+
+
+def _double_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 8
+
+
+def _float_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return column_chunk_metadata.num_values * 4
+
+
+def _byte_array_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    uncompressed_size = column_chunk_metadata.total_uncompressed_size
+    if column_chunk_metadata.is_stats_set:
+        statistics = column_chunk_metadata.statistics
+        if (
+            statistics.has_min_max
+            and isinstance(statistics.min, str)
+            and isinstance(statistics.max, str)
+        ):
+            return max(
+                uncompressed_size,
+                (
+                    statistics.num_values
+                    * _observed_string_size(statistics.min, statistics.max)
+                    + statistics.null_count * NULL_SIZE_BYTES
+                ),
+            )
+        else:
+            # A case of decimal
+            return max(column_chunk_metadata.num_values * 16, uncompressed_size)
+    else:
+        return uncompressed_size
+
+
+def _fixed_len_byte_array_size_estimator(
+    column_chunk_metadata: ColumnChunkMetaData,
+) -> float:
+    return _byte_array_size_estimator(column_chunk_metadata)
+
+
+_PHYSICAL_TYPE_TO_SIZE_ESTIMATOR = {
+    "INT96": _int96_size_estimator,
+    "INT64": _int64_size_estimator,
+    "INT32": _int32_size_estimator,
+    "BOOLEAN": _boolean_size_estimator,
+    "DOUBLE": _double_size_estimator,
+    "FLOAT": _float_size_estimator,
+    "BYTE_ARRAY": _byte_array_size_estimator,
+    "FIXED_LEN_BYTE_ARRAY": _fixed_len_byte_array_size_estimator,
+}
+
+
+def parquet_column_chunk_size_estimator(
+    column_meta: ColumnChunkMetaData,
+) -> Optional[float]:
+    physical_type = column_meta.physical_type
+    if physical_type in _PHYSICAL_TYPE_TO_SIZE_ESTIMATOR:
+        return _PHYSICAL_TYPE_TO_SIZE_ESTIMATOR[physical_type](column_meta)
+    else:
+        logger.warning(
+            f"Unsupported physical type: {physical_type}. "
+            "Returning total_uncompressed_size."
+        )
+        return column_meta.total_uncompressed_size
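A hedged sketch of driving this estimator over a local file with pyarrow's standard metadata API; estimate_file_size is a hypothetical helper for illustration, not part of the diff:

```python
import pyarrow.parquet as pq

from deltacat.compute.resource_estimation.parquet import (
    parquet_column_chunk_size_estimator,
)


def estimate_file_size(path: str) -> float:
    """Sum per-column-chunk estimates across all row groups of a parquet file."""
    metadata = pq.ParquetFile(path).metadata
    total = 0.0
    for rg in range(metadata.num_row_groups):
        row_group = metadata.row_group(rg)
        for col in range(row_group.num_columns):
            # Dispatches on ColumnChunkMetaData.physical_type; unsupported
            # types fall back to total_uncompressed_size with a warning.
            total += parquet_column_chunk_size_estimator(row_group.column(col))
    return total
```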
deltacat/constants.py CHANGED
@@ -28,6 +28,8 @@ DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME = env_string(
     "DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME",
     "application.debug.log",
 )
+# A json context which will be logged along with other context args.
+DELTACAT_LOGGER_CONTEXT = env_string("DELTACAT_LOGGER_CONTEXT", None)
 
 # Byte Units
 BYTES_PER_KIBIBYTE = 2**10
@@ -53,3 +55,6 @@ PYARROW_INFLATION_MULTIPLIER = 2.5
 PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6
 
 MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
+
+# The number of bytes allocated to null values in string physical type in parquet
+NULL_SIZE_BYTES = 4
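NULL_SIZE_BYTES is consumed by the _byte_array_size_estimator added above. A worked example of that estimate, with invented statistics:

```python
# Illustrative values: statistics.min = "a" (len 1), statistics.max = "abc"
# (len 3), statistics.num_values = 100, null_count = 10, and a
# total_uncompressed_size of 500 bytes.
observed = (1 + 3) / 2 + 4          # 6.0 bytes/string incl. pyarrow overhead
estimate = 100 * observed + 10 * 4  # 640.0, nulls costed at NULL_SIZE_BYTES
result = max(500, estimate)         # the estimator returns the larger: 640.0
```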
deltacat/exceptions.py CHANGED
@@ -299,7 +299,7 @@ def _categorize_tenacity_error(e: tenacity.RetryError):
 def _categorize_dependency_pyarrow_error(e: ArrowException):
     if isinstance(e, ArrowInvalid):
         raise DependencyPyarrowInvalidError(
-            f"Pyarrow Invalid error occurred. Reason: {e}"
+            f"Pyarrow Invalid error occurred. {e}"
         ) from e
     elif isinstance(e, ArrowCapacityError):
         raise DependencyPyarrowCapacityError("Pyarrow Capacity error occurred.") from e
@@ -308,9 +308,7 @@ def _categorize_dependency_pyarrow_error(e: ArrowException):
 
 
 def _categorize_assertion_error(e: BaseException):
-    raise ValidationError(
-        f"One of the assertions in DeltaCAT has failed. Reason: {e}"
-    ) from e
+    raise ValidationError(f"One of the assertions in DeltaCAT has failed. {e}") from e
 
 
 def _categorize_daft_error(e: DaftCoreException):
deltacat/logs.py CHANGED
@@ -17,6 +17,7 @@ from deltacat.constants import (
     DELTACAT_SYS_INFO_LOG_BASE_FILE_NAME,
     DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
     DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
+    DELTACAT_LOGGER_CONTEXT,
 )
 
 DEFAULT_LOG_LEVEL = "INFO"
@@ -66,6 +67,13 @@ class JsonFormatter(logging.Formatter):
             self.ray_runtime_ctx = None
             self.context = {}
 
+        if DELTACAT_LOGGER_CONTEXT is not None:
+            try:
+                env_context = json.loads(DELTACAT_LOGGER_CONTEXT)
+                self.additional_context.update(env_context)
+            except Exception:
+                pass
+
     def usesTime(self) -> bool:
         """
         Overwritten to look for the attribute in the format dict values instead of the fmt string.
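A sketch of the new hook (the context key names are hypothetical). Since DELTACAT_LOGGER_CONTEXT is read once when deltacat.constants is imported, the variable must be set before deltacat is imported; malformed JSON is silently ignored by the except clause above:

```python
import json
import os

os.environ["DELTACAT_LOGGER_CONTEXT"] = json.dumps(
    {"job_id": "abc-123", "stage": "compaction"}
)

import deltacat  # noqa: E402 - deliberate import after env setup

# JsonFormatter merges the parsed keys into additional_context, so every
# JSON-formatted log record now carries job_id and stage.
```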
deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py CHANGED
@@ -848,6 +848,83 @@ MULTIPLE_ROUNDS_TEST_CASES = {
         assert_compaction_audit=None,
         num_rounds=3,
     ),
+    # 4 input deltas (3 upsert, 1 delete delta), 2 rounds requested
+    # Expect to see a table that aggregates 10 records total
+    # (12 upserts - 2 deletes (null PK) = 10 records)
+    # (dropDuplicates = False)
+    "9-multiple-rounds-delete-deltas-with-null-pk": MultipleRoundsTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=ZERO_VALUED_SORT_KEY,
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=[
+            (
+                pa.Table.from_arrays(
+                    [
+                        pa.array([None, 11, 12, 13]),
+                        pa.array(["a", "b", "c", "d"]),
+                    ],
+                    names=["pk_col_1", "col_1"],
+                ),
+                DeltaType.UPSERT,
+                None,
+            ),
+            (
+                pa.Table.from_arrays(
+                    [
+                        pa.array([14, 15, 16, 17]),
+                        pa.array(["e", "f", "g", "h"]),
+                    ],
+                    names=["pk_col_1", "col_1"],
+                ),
+                DeltaType.UPSERT,
+                None,
+            ),
+            (
+                pa.Table.from_arrays(
+                    [
+                        pa.array([18, 19, 20, 21]),
+                        pa.array(["i", "j", "k", "l"]),
+                    ],
+                    names=["pk_col_1", "col_1"],
+                ),
+                DeltaType.UPSERT,
+                None,
+            ),
+            (
+                pa.Table.from_arrays(
+                    [pa.array([None, 11]), pa.array(["a", "b"])],
+                    names=["pk_col_1", "col_1"],
+                ),
+                DeltaType.DELETE,
+                DeleteParameters.of(["pk_col_1", "col_1"]),
+            ),
+        ],
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([i for i in range(12, 22)]),
+                pa.array(["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]),
+            ],
+            names=["pk_col_1", "col_1"],
+        ),
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([i for i in range(12, 22)]),
+                pa.array(["c", "d", "e", "f", "g", "h", "i", "j", "k", "l"]),
+            ],
+            names=["pk_col_1", "col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=DEFAULT_HASH_BUCKET_COUNT,
+        read_kwargs_provider=None,
+        drop_duplicates=False,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=None,
+        num_rounds=2,
+    ),
 }
 
 MULTIPLE_ROUNDS_TEST_CASES = with_compactor_version_func_test_param(
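A quick sanity check of the arithmetic in the test case's comment (illustrative only, not part of the test file):

```python
# 3 upsert deltas of 4 records each = 12 upserts; the delete delta removes
# the rows with pk None and 11, leaving pk 12..21 ("c".."l").
upserts = [None, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]  # 12 records
deleted = {None, 11}                                          # 2 deletes
survivors = [pk for pk in upserts if pk not in deleted]
assert survivors == list(range(12, 22))  # 10 records, matching the expected result
```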