deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,9 @@ from deltacat.storage.model.types import (
12
12
  from deltacat.storage.model.schema import FieldLocator
13
13
  from deltacat.storage.model.transform import Transform
14
14
 
15
+ UNSORTED_SCHEME_NAME = "unsorted_scheme"
16
+ UNSORTED_SCHEME_ID = "deadbeef-7277-49a4-a195-fdc8ed235d42"
17
+
15
18
 
16
19
  class SortKey(tuple):
17
20
  @staticmethod
@@ -103,6 +106,10 @@ class SortKeyList(List[SortKey]):
103
106
  self[item] = val = SortKey(val)
104
107
  return val
105
108
 
109
+ def __iter__(self):
110
+ for i in range(len(self)):
111
+ yield self[i] # This triggers __getitem__ conversion
112
+
106
113
 
107
114
  class SortScheme(dict):
108
115
  @staticmethod
@@ -112,6 +119,19 @@ class SortScheme(dict):
112
119
  scheme_id: Optional[str] = None,
113
120
  native_object: Optional[Any] = None,
114
121
  ) -> SortScheme:
122
+ # Validate keys if provided
123
+ if keys is not None:
124
+ # Check for empty keys list
125
+ if len(keys) == 0:
126
+ raise ValueError("Sort scheme cannot have empty keys list")
127
+
128
+ # Check for duplicate keys
129
+ key_names = []
130
+ for key in keys:
131
+ if key.key[0] in key_names:
132
+ raise ValueError(f"Duplicate sort key found: {key.key[0]}")
133
+ key_names.append(key.key[0])
134
+
115
135
  return SortScheme(
116
136
  {
117
137
  "keys": keys,
@@ -132,6 +152,15 @@ class SortScheme(dict):
132
152
  return False
133
153
  if not isinstance(other, SortScheme):
134
154
  other = SortScheme(other)
155
+ # If both have None keys, they are equivalent (for unsorted schemes)
156
+ if self.keys is None and other.keys is None:
157
+ return not check_identifiers or (
158
+ self.name == other.name and self.id == other.id
159
+ )
160
+ # If only one has None keys, they are not equivalent
161
+ if self.keys is None or other.keys is None:
162
+ return False
163
+ # Compare keys if both have them
135
164
  for i in range(len(self.keys)):
136
165
  if not self.keys[i].equivalent_to(other.keys[i]):
137
166
  return False
@@ -173,6 +202,13 @@ class SortScheme(dict):
173
202
  return self.get("nativeObject")
174
203
 
175
204
 
205
+ UNSORTED_SCHEME = SortScheme.of(
206
+ keys=None,
207
+ name=UNSORTED_SCHEME_NAME,
208
+ scheme_id=UNSORTED_SCHEME_ID,
209
+ )
210
+
211
+
176
212
  class SortSchemeList(List[SortScheme]):
177
213
  @staticmethod
178
214
  def of(items: List[SortScheme]) -> SortSchemeList:
@@ -188,3 +224,7 @@ class SortSchemeList(List[SortScheme]):
188
224
  if val is not None and not isinstance(val, SortScheme):
189
225
  self[item] = val = SortScheme(val)
190
226
  return val
227
+
228
+ def __iter__(self):
229
+ for i in range(len(self)):
230
+ yield self[i] # This triggers __getitem__ conversion
@@ -178,6 +178,13 @@ class Stream(Metafile):
178
178
  return stream_locator.table_version
179
179
  return None
180
180
 
181
+ def url(self, catalog_name: Optional[str] = None) -> str:
182
+ return (
183
+ f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/"
184
+ if catalog_name
185
+ else f"table://{self.namespace}/{self.table_name}/{self.table_version}/{self.stream_format}/"
186
+ )
187
+
181
188
  def to_serializable(self) -> Stream:
182
189
  serializable = self
183
190
  if serializable.table_locator:
@@ -382,8 +389,8 @@ class StreamLocatorAlias(Locator, dict):
382
389
  ),
383
390
  }
384
391
  )
385
- if parent_stream.state == CommitState.COMMITTED
386
- else None # only committed streams can be resolved by alias
392
+ if parent_stream.state != CommitState.STAGED
393
+ else None # staged streams cannot be resolved by alias
387
394
  )
388
395
 
389
396
  @property
@@ -13,8 +13,9 @@ from deltacat.storage.model.namespace import (
13
13
  )
14
14
  from deltacat.storage.model.metafile import Metafile, MetafileRevisionInfo
15
15
  from deltacat.constants import TXN_DIR_NAME
16
+ from deltacat.types.tables import TableProperty
16
17
 
17
- TableProperties = dict[str, Any]
18
+ TableProperties = Dict[str, Any]
18
19
 
19
20
 
20
21
  class Table(Metafile):
@@ -126,6 +127,16 @@ class Table(Metafile):
126
127
  if table_locator:
127
128
  table_locator.table_name = table_name
128
129
 
130
+ def url(self, catalog_name: Optional[str] = None) -> str:
131
+ return (
132
+ f"dc://{catalog_name}/{self.namespace}/{self.table_name}/"
133
+ if catalog_name
134
+ else f"table://{self.namespace}/{self.table_name}/"
135
+ )
136
+
137
+ def read_table_property(self, property: TableProperty) -> Any:
138
+ return TableProperty.read_table_property(self, property)
139
+
129
140
  def to_serializable(self) -> Table:
130
141
  serializable = self
131
142
  if serializable.namespace_locator:
@@ -34,6 +34,7 @@ from deltacat.storage.model.table import (
34
34
  from deltacat.types.media import ContentType
35
35
  from deltacat.storage.model.sort_key import SortScheme, SortSchemeList
36
36
  from deltacat.storage.model.types import LifecycleState
37
+ from deltacat.types.tables import TableProperty
37
38
 
38
39
  TableVersionProperties = Dict[str, Any]
39
40
 
@@ -251,6 +252,13 @@ class TableVersion(Metafile):
251
252
  return table_version_locator.table_version
252
253
  return None
253
254
 
255
+ def url(self, catalog_name: Optional[str] = None) -> str:
256
+ return (
257
+ f"dc://{catalog_name}/{self.namespace}/{self.table_name}/{self.table_version}/"
258
+ if catalog_name
259
+ else f"table://{self.namespace}/{self.table_name}/{self.table_version}/"
260
+ )
261
+
254
262
  def is_supported_content_type(self, content_type: ContentType):
255
263
  supported_content_types = self.content_types
256
264
  return (not supported_content_types) or (
@@ -355,6 +363,9 @@ class TableVersion(Metafile):
355
363
  )
356
364
  return int(version_number) if version_number is not None else None
357
365
 
366
+ def read_table_property(self, property: TableProperty) -> Any:
367
+ return TableProperty.read_table_property(self, property)
368
+
358
369
  @staticmethod
359
370
  def next_version(previous_version: Optional[str] = None) -> str:
360
371
  """