deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
1
1
  import uuid
2
2
  import logging
3
+ from collections import defaultdict
4
+
3
5
  from pyiceberg.exceptions import NoSuchTableError
6
+ from pyiceberg.manifest import DataFileContent
7
+ from deltacat.compute.converter.pyiceberg.overrides import (
8
+ parquet_files_dict_to_iceberg_data_files,
9
+ )
4
10
  from deltacat import logs
5
11
 
6
12
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -14,9 +20,6 @@ def get_s3_file_system():
14
20
  secret_key="password",
15
21
  endpoint_override="http://localhost:9000",
16
22
  )
17
- # 'region="us-east-1", proxy_options={'scheme': 'http', 'host': 'localhost',
18
- # 'port': 9000, 'username': 'admin',
19
- # 'password': 'password'})
20
23
 
21
24
 
22
25
  def write_equality_data_table(
@@ -110,10 +113,16 @@ def commit_equality_delete_to_table(
110
113
  )
111
114
  ]
112
115
 
113
- add_equality_data_files(
114
- file_paths=data_files, partition_value=partition_value, table=table
116
+ equality_delete_dict_list = defaultdict()
117
+ equality_delete_dict_list[partition_value] = data_files
118
+ equality_file_list = parquet_files_dict_to_iceberg_data_files(
119
+ io=table.io,
120
+ table_metadata=table.metadata,
121
+ files_dict=equality_delete_dict_list,
122
+ file_content_type=DataFileContent.EQUALITY_DELETES,
115
123
  )
116
- return data_files
124
+
125
+ return equality_file_list
117
126
 
118
127
 
119
128
  def drop_table_if_exists(table, catalog):
@@ -1,4 +1,4 @@
1
- import deltacat.tests.local_deltacat_storage as ds
1
+ from deltacat.storage import metastore
2
2
  from deltacat.types.media import ContentType
3
3
  import pytest
4
4
  from deltacat.storage import Delta
@@ -21,7 +21,7 @@ Function scoped fixtures
21
21
 
22
22
 
23
23
  @pytest.fixture(scope="function")
24
- def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
24
+ def parquet_delta_with_manifest(main_deltacat_storage_kwargs):
25
25
  """
26
26
  These fixtures are function scoped as functions can modify the delta.
27
27
  """
@@ -31,7 +31,7 @@ def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
31
31
  "test_namespace",
32
32
  file_paths=[DELTA_CSV_FILE_PATH],
33
33
  content_type=ContentType.PARQUET,
34
- **local_deltacat_storage_kwargs
34
+ **main_deltacat_storage_kwargs
35
35
  )
36
36
 
37
37
  result.meta["source_content_length"] = 0
@@ -44,14 +44,14 @@ def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
44
44
 
45
45
 
46
46
  @pytest.fixture(scope="function")
47
- def utsv_delta_with_manifest(local_deltacat_storage_kwargs):
47
+ def utsv_delta_with_manifest(main_deltacat_storage_kwargs):
48
48
  from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
49
49
 
50
50
  result = create_delta_from_csv_file(
51
51
  "test_namespace",
52
52
  file_paths=[DELTA_CSV_FILE_PATH],
53
53
  content_type=ContentType.UNESCAPED_TSV,
54
- **local_deltacat_storage_kwargs
54
+ **main_deltacat_storage_kwargs
55
55
  )
56
56
 
57
57
  result.meta["source_content_length"] = 0
@@ -64,14 +64,14 @@ def utsv_delta_with_manifest(local_deltacat_storage_kwargs):
64
64
 
65
65
 
66
66
  @pytest.fixture(scope="function")
67
- def delta_without_manifest(local_deltacat_storage_kwargs):
67
+ def delta_without_manifest(main_deltacat_storage_kwargs):
68
68
  from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
69
69
 
70
70
  delta = create_delta_from_csv_file(
71
71
  "test_namespace",
72
72
  file_paths=[DELTA_CSV_FILE_PATH],
73
73
  content_type=ContentType.PARQUET,
74
- **local_deltacat_storage_kwargs
74
+ **main_deltacat_storage_kwargs
75
75
  )
76
76
 
77
77
  # now we intentionally remove manifest
@@ -83,14 +83,14 @@ def delta_without_manifest(local_deltacat_storage_kwargs):
83
83
 
84
84
 
85
85
  @pytest.fixture(scope="function")
86
- def delta_with_populated_meta(local_deltacat_storage_kwargs):
86
+ def delta_with_populated_meta(main_deltacat_storage_kwargs):
87
87
  from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
88
88
 
89
89
  delta = create_delta_from_csv_file(
90
90
  "test_namespace",
91
91
  file_paths=[DELTA_CSV_FILE_PATH],
92
92
  content_type=ContentType.PARQUET,
93
- **local_deltacat_storage_kwargs
93
+ **main_deltacat_storage_kwargs
94
94
  )
95
95
 
96
96
  return delta
@@ -98,14 +98,14 @@ def delta_with_populated_meta(local_deltacat_storage_kwargs):
98
98
 
99
99
  class TestEstimateResourcesRequiredToProcessDelta:
100
100
  def test_delta_with_prepopulated_meta_returns_directly(
101
- self, local_deltacat_storage_kwargs, delta_with_populated_meta: Delta
101
+ self, main_deltacat_storage_kwargs, delta_with_populated_meta: Delta
102
102
  ):
103
103
 
104
104
  result = estimate_resources_required_to_process_delta(
105
105
  delta=delta_with_populated_meta,
106
106
  operation_type=OperationType.PYARROW_DOWNLOAD,
107
- deltacat_storage=ds,
108
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
107
+ deltacat_storage=metastore,
108
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
109
109
  )
110
110
 
111
111
  assert (
@@ -125,7 +125,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
125
125
  )
126
126
 
127
127
  def test_delta_manifest_empty_when_default_method(
128
- self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
128
+ self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
129
129
  ):
130
130
  params = EstimateResourcesParams.of(
131
131
  resource_estimation_method=ResourceEstimationMethod.DEFAULT,
@@ -136,8 +136,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
136
136
  result = estimate_resources_required_to_process_delta(
137
137
  delta=delta_without_manifest,
138
138
  operation_type=OperationType.PYARROW_DOWNLOAD,
139
- deltacat_storage=ds,
140
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
139
+ deltacat_storage=metastore,
140
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
141
141
  estimate_resources_params=params,
142
142
  )
143
143
 
@@ -156,7 +156,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
156
156
  )
157
157
 
158
158
  def test_delta_manifest_exists_when_default_method(
159
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
159
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
160
160
  ):
161
161
  params = EstimateResourcesParams.of(
162
162
  resource_estimation_method=ResourceEstimationMethod.DEFAULT,
@@ -167,8 +167,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
167
167
  result = estimate_resources_required_to_process_delta(
168
168
  delta=parquet_delta_with_manifest,
169
169
  operation_type=OperationType.PYARROW_DOWNLOAD,
170
- deltacat_storage=ds,
171
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
170
+ deltacat_storage=metastore,
171
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
172
172
  estimate_resources_params=params,
173
173
  )
174
174
 
@@ -191,7 +191,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
191
191
  )
192
192
 
193
193
  def test_previous_inflation_arg_not_passed_when_default_method(
194
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
194
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
195
195
  ):
196
196
  with pytest.raises(AssertionError):
197
197
  params = EstimateResourcesParams.of(
@@ -202,13 +202,13 @@ class TestEstimateResourcesRequiredToProcessDelta:
202
202
  estimate_resources_required_to_process_delta(
203
203
  delta=parquet_delta_with_manifest,
204
204
  operation_type=OperationType.PYARROW_DOWNLOAD,
205
- deltacat_storage=ds,
206
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
205
+ deltacat_storage=metastore,
206
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
207
207
  estimate_resources_params=params,
208
208
  )
209
209
 
210
210
  def test_estimate_resources_params_not_passed_assumes_default(
211
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
211
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
212
212
  ):
213
213
  params = EstimateResourcesParams.of(
214
214
  previous_inflation=7,
@@ -218,8 +218,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
218
218
  result = estimate_resources_required_to_process_delta(
219
219
  delta=parquet_delta_with_manifest,
220
220
  operation_type=OperationType.PYARROW_DOWNLOAD,
221
- deltacat_storage=ds,
222
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
221
+ deltacat_storage=metastore,
222
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
223
223
  estimate_resources_params=params,
224
224
  )
225
225
 
@@ -242,7 +242,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
242
242
  )
243
243
 
244
244
  def test_delta_manifest_empty_when_content_type_meta(
245
- self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
245
+ self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
246
246
  ):
247
247
  params = EstimateResourcesParams.of(
248
248
  resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
@@ -252,8 +252,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
252
252
  result = estimate_resources_required_to_process_delta(
253
253
  delta=delta_without_manifest,
254
254
  operation_type=OperationType.PYARROW_DOWNLOAD,
255
- deltacat_storage=ds,
256
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
255
+ deltacat_storage=metastore,
256
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
257
257
  estimate_resources_params=params,
258
258
  )
259
259
 
@@ -267,7 +267,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
267
267
  assert result.statistics.record_count == 7
268
268
 
269
269
  def test_delta_manifest_exists_when_content_type_meta(
270
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
270
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
271
271
  ):
272
272
  params = EstimateResourcesParams.of(
273
273
  resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
@@ -277,8 +277,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
277
277
  result = estimate_resources_required_to_process_delta(
278
278
  delta=parquet_delta_with_manifest,
279
279
  operation_type=OperationType.PYARROW_DOWNLOAD,
280
- deltacat_storage=ds,
281
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
280
+ deltacat_storage=metastore,
281
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
282
282
  estimate_resources_params=params,
283
283
  )
284
284
 
@@ -292,7 +292,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
292
292
  assert result.statistics.record_count == 7
293
293
 
294
294
  def test_delta_manifest_empty_when_intelligent_estimation(
295
- self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
295
+ self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
296
296
  ):
297
297
  params = EstimateResourcesParams.of(
298
298
  resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
@@ -302,8 +302,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
302
302
  result = estimate_resources_required_to_process_delta(
303
303
  delta=delta_without_manifest,
304
304
  operation_type=OperationType.PYARROW_DOWNLOAD,
305
- deltacat_storage=ds,
306
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
305
+ deltacat_storage=metastore,
306
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
307
307
  estimate_resources_params=params,
308
308
  )
309
309
 
@@ -317,7 +317,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
317
317
  assert result.statistics.record_count == 7
318
318
 
319
319
  def test_delta_manifest_exists_when_intelligent_estimation(
320
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
320
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
321
321
  ):
322
322
  params = EstimateResourcesParams.of(
323
323
  resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
@@ -327,8 +327,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
327
327
  result = estimate_resources_required_to_process_delta(
328
328
  delta=parquet_delta_with_manifest,
329
329
  operation_type=OperationType.PYARROW_DOWNLOAD,
330
- deltacat_storage=ds,
331
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
330
+ deltacat_storage=metastore,
331
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
332
332
  estimate_resources_params=params,
333
333
  )
334
334
 
@@ -342,7 +342,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
342
342
  assert result.statistics.record_count == 7
343
343
 
344
344
  def test_delta_manifest_exists_inflation_absent_when_intelligent_estimation(
345
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
345
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
346
346
  ):
347
347
  params = EstimateResourcesParams.of(
348
348
  resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
@@ -352,15 +352,15 @@ class TestEstimateResourcesRequiredToProcessDelta:
352
352
  result = estimate_resources_required_to_process_delta(
353
353
  delta=parquet_delta_with_manifest,
354
354
  operation_type=OperationType.PYARROW_DOWNLOAD,
355
- deltacat_storage=ds,
356
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
355
+ deltacat_storage=metastore,
356
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
357
357
  estimate_resources_params=params,
358
358
  )
359
359
 
360
360
  assert result is None
361
361
 
362
362
  def test_delta_utsv_data_when_intelligent_estimation(
363
- self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
363
+ self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
364
364
  ):
365
365
  params = EstimateResourcesParams.of(
366
366
  resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
@@ -370,15 +370,15 @@ class TestEstimateResourcesRequiredToProcessDelta:
370
370
  result = estimate_resources_required_to_process_delta(
371
371
  delta=utsv_delta_with_manifest,
372
372
  operation_type=OperationType.PYARROW_DOWNLOAD,
373
- deltacat_storage=ds,
374
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
373
+ deltacat_storage=metastore,
374
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
375
375
  estimate_resources_params=params,
376
376
  )
377
377
 
378
378
  assert result is None
379
379
 
380
380
  def test_empty_delta_sampled_when_file_sampling(
381
- self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
381
+ self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
382
382
  ):
383
383
  params = EstimateResourcesParams.of(
384
384
  resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
@@ -388,8 +388,31 @@ class TestEstimateResourcesRequiredToProcessDelta:
388
388
  result = estimate_resources_required_to_process_delta(
389
389
  delta=delta_without_manifest,
390
390
  operation_type=OperationType.PYARROW_DOWNLOAD,
391
- deltacat_storage=ds,
392
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
391
+ deltacat_storage=metastore,
392
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
393
+ estimate_resources_params=params,
394
+ )
395
+
396
+ assert delta_without_manifest.manifest is not None
397
+ assert result.memory_bytes is not None
398
+ assert (
399
+ result.statistics.on_disk_size_bytes
400
+ == delta_without_manifest.meta.content_length
401
+ )
402
+
403
+ def test_empty_delta_sampled_when_file_sampling_with_previous_inflation(
404
+ self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
405
+ ):
406
+ params = EstimateResourcesParams.of(
407
+ resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
408
+ max_files_to_sample=2,
409
+ )
410
+
411
+ result = estimate_resources_required_to_process_delta(
412
+ delta=delta_without_manifest,
413
+ operation_type=OperationType.PYARROW_DOWNLOAD,
414
+ deltacat_storage=metastore,
415
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
393
416
  estimate_resources_params=params,
394
417
  )
395
418
 
@@ -401,7 +424,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
401
424
  )
402
425
 
403
426
  def test_delta_manifest_parquet_when_file_sampling(
404
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
427
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
405
428
  ):
406
429
  params = EstimateResourcesParams.of(
407
430
  resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
@@ -411,8 +434,29 @@ class TestEstimateResourcesRequiredToProcessDelta:
411
434
  result = estimate_resources_required_to_process_delta(
412
435
  delta=parquet_delta_with_manifest,
413
436
  operation_type=OperationType.PYARROW_DOWNLOAD,
414
- deltacat_storage=ds,
415
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
437
+ deltacat_storage=metastore,
438
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
439
+ estimate_resources_params=params,
440
+ )
441
+ assert result.memory_bytes is not None
442
+ assert (
443
+ result.statistics.on_disk_size_bytes
444
+ == parquet_delta_with_manifest.meta.content_length
445
+ )
446
+
447
+ def test_delta_manifest_parquet_when_file_sampling_with_previous_inflation(
448
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
449
+ ):
450
+ params = EstimateResourcesParams.of(
451
+ resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
452
+ max_files_to_sample=2,
453
+ )
454
+
455
+ result = estimate_resources_required_to_process_delta(
456
+ delta=parquet_delta_with_manifest,
457
+ operation_type=OperationType.PYARROW_DOWNLOAD,
458
+ deltacat_storage=metastore,
459
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
416
460
  estimate_resources_params=params,
417
461
  )
418
462
  assert result.memory_bytes is not None
@@ -423,7 +467,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
423
467
 
424
468
  def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
425
469
  self,
426
- local_deltacat_storage_kwargs,
470
+ main_deltacat_storage_kwargs,
427
471
  parquet_delta_with_manifest: Delta,
428
472
  monkeypatch,
429
473
  ):
@@ -441,13 +485,13 @@ class TestEstimateResourcesRequiredToProcessDelta:
441
485
 
442
486
  return MockedValue()
443
487
 
444
- monkeypatch.setattr(ds, "download_delta_manifest_entry", mock_func)
488
+ monkeypatch.setattr(metastore, "download_delta_manifest_entry", mock_func)
445
489
 
446
490
  result = estimate_resources_required_to_process_delta(
447
491
  delta=parquet_delta_with_manifest,
448
492
  operation_type=OperationType.PYARROW_DOWNLOAD,
449
- deltacat_storage=ds,
450
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
493
+ deltacat_storage=metastore,
494
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
451
495
  estimate_resources_params=params,
452
496
  )
453
497
 
@@ -459,7 +503,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
459
503
  )
460
504
 
461
505
  def test_delta_manifest_utsv_when_file_sampling(
462
- self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
506
+ self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
463
507
  ):
464
508
  params = EstimateResourcesParams.of(
465
509
  resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
@@ -469,8 +513,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
469
513
  result = estimate_resources_required_to_process_delta(
470
514
  delta=utsv_delta_with_manifest,
471
515
  operation_type=OperationType.PYARROW_DOWNLOAD,
472
- deltacat_storage=ds,
473
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
516
+ deltacat_storage=metastore,
517
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
474
518
  estimate_resources_params=params,
475
519
  )
476
520
  assert result.memory_bytes is not None
@@ -480,7 +524,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
480
524
  )
481
525
 
482
526
  def test_delta_manifest_utsv_when_file_sampling_zero_files_to_sample(
483
- self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
527
+ self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
484
528
  ):
485
529
  params = EstimateResourcesParams.of(
486
530
  resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
@@ -490,14 +534,36 @@ class TestEstimateResourcesRequiredToProcessDelta:
490
534
  result = estimate_resources_required_to_process_delta(
491
535
  delta=utsv_delta_with_manifest,
492
536
  operation_type=OperationType.PYARROW_DOWNLOAD,
493
- deltacat_storage=ds,
494
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
537
+ deltacat_storage=metastore,
538
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
495
539
  estimate_resources_params=params,
496
540
  )
497
541
  assert result is None
498
542
 
543
+ def test_delta_manifest_utsv_when_file_sampling_with_previous_inflation_zero_files_to_sample(
544
+ self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
545
+ ):
546
+ previous_inflation = 7
547
+ params = EstimateResourcesParams.of(
548
+ resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
549
+ max_files_to_sample=None,
550
+ previous_inflation=previous_inflation,
551
+ )
552
+
553
+ result = estimate_resources_required_to_process_delta(
554
+ delta=utsv_delta_with_manifest,
555
+ operation_type=OperationType.PYARROW_DOWNLOAD,
556
+ deltacat_storage=metastore,
557
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
558
+ estimate_resources_params=params,
559
+ )
560
+ assert result is not None
561
+ assert result.memory_bytes == (
562
+ utsv_delta_with_manifest.meta.content_length * previous_inflation
563
+ )
564
+
499
565
  def test_empty_delta_when_default_v2(
500
- self, local_deltacat_storage_kwargs, delta_without_manifest: Delta
566
+ self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
501
567
  ):
502
568
  params = EstimateResourcesParams.of(
503
569
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -509,8 +575,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
509
575
  result = estimate_resources_required_to_process_delta(
510
576
  delta=delta_without_manifest,
511
577
  operation_type=OperationType.PYARROW_DOWNLOAD,
512
- deltacat_storage=ds,
513
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
578
+ deltacat_storage=metastore,
579
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
514
580
  estimate_resources_params=params,
515
581
  )
516
582
 
@@ -522,7 +588,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
522
588
  )
523
589
 
524
590
  def test_parquet_delta_when_default_v2(
525
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
591
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
526
592
  ):
527
593
  params = EstimateResourcesParams.of(
528
594
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -535,8 +601,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
535
601
  result = estimate_resources_required_to_process_delta(
536
602
  delta=parquet_delta_with_manifest,
537
603
  operation_type=OperationType.PYARROW_DOWNLOAD,
538
- deltacat_storage=ds,
539
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
604
+ deltacat_storage=metastore,
605
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
540
606
  estimate_resources_params=params,
541
607
  )
542
608
 
@@ -548,7 +614,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
548
614
  )
549
615
 
550
616
  def test_parquet_delta_when_default_v2_without_avg_record_size_and_sampling(
551
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
617
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
552
618
  ):
553
619
  params = EstimateResourcesParams.of(
554
620
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -559,8 +625,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
559
625
  result = estimate_resources_required_to_process_delta(
560
626
  delta=parquet_delta_with_manifest,
561
627
  operation_type=OperationType.PYARROW_DOWNLOAD,
562
- deltacat_storage=ds,
563
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
628
+ deltacat_storage=metastore,
629
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
564
630
  estimate_resources_params=params,
565
631
  )
566
632
 
@@ -572,7 +638,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
572
638
  )
573
639
 
574
640
  def test_parquet_delta_when_default_v2_and_files_to_sample_zero(
575
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
641
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
576
642
  ):
577
643
  params = EstimateResourcesParams.of(
578
644
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -585,8 +651,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
585
651
  result = estimate_resources_required_to_process_delta(
586
652
  delta=parquet_delta_with_manifest,
587
653
  operation_type=OperationType.PYARROW_DOWNLOAD,
588
- deltacat_storage=ds,
589
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
654
+ deltacat_storage=metastore,
655
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
590
656
  estimate_resources_params=params,
591
657
  )
592
658
 
@@ -598,7 +664,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
598
664
  )
599
665
 
600
666
  def test_utsv_delta_when_default_v2(
601
- self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
667
+ self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
602
668
  ):
603
669
  params = EstimateResourcesParams.of(
604
670
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -611,8 +677,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
611
677
  result = estimate_resources_required_to_process_delta(
612
678
  delta=utsv_delta_with_manifest,
613
679
  operation_type=OperationType.PYARROW_DOWNLOAD,
614
- deltacat_storage=ds,
615
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
680
+ deltacat_storage=metastore,
681
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
616
682
  estimate_resources_params=params,
617
683
  )
618
684
 
@@ -624,7 +690,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
624
690
  )
625
691
 
626
692
  def test_utsv_delta_when_default_v2_without_avg_record_size(
627
- self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
693
+ self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
628
694
  ):
629
695
  params = EstimateResourcesParams.of(
630
696
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -636,8 +702,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
636
702
  result = estimate_resources_required_to_process_delta(
637
703
  delta=utsv_delta_with_manifest,
638
704
  operation_type=OperationType.PYARROW_DOWNLOAD,
639
- deltacat_storage=ds,
640
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
705
+ deltacat_storage=metastore,
706
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
641
707
  estimate_resources_params=params,
642
708
  )
643
709
 
@@ -650,7 +716,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
650
716
  )
651
717
 
652
718
  def test_parquet_delta_without_inflation_when_default_v2(
653
- self, local_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
719
+ self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
654
720
  ):
655
721
  params = EstimateResourcesParams.of(
656
722
  resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
@@ -663,8 +729,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
663
729
  result = estimate_resources_required_to_process_delta(
664
730
  delta=parquet_delta_with_manifest,
665
731
  operation_type=OperationType.PYARROW_DOWNLOAD,
666
- deltacat_storage=ds,
667
- deltacat_storage_kwargs=local_deltacat_storage_kwargs,
732
+ deltacat_storage=metastore,
733
+ deltacat_storage_kwargs=main_deltacat_storage_kwargs,
668
734
  estimate_resources_params=params,
669
735
  )
670
736