deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/utils/url.py CHANGED
@@ -43,6 +43,28 @@ from deltacat.storage import (
43
43
  TableVersionLocator,
44
44
  )
45
45
 
46
+
47
+ def _normalize_partition_values_from_json(partition_values):
48
+ """
49
+ Normalize partition values parsed from JSON URLs.
50
+
51
+ Both None and empty list [] represent unpartitioned data, but they should be
52
+ normalized to None for consistent lookup and validation.
53
+
54
+ Args:
55
+ partition_values: Partition values parsed from JSON
56
+
57
+ Returns:
58
+ None for unpartitioned data (both None and [] inputs),
59
+ original value for partitioned data
60
+ """
61
+ if partition_values is None or (
62
+ isinstance(partition_values, list) and len(partition_values) == 0
63
+ ):
64
+ return None
65
+ return partition_values
66
+
67
+
46
68
  RAY_DATASTORE_TYPE_TO_READER = {
47
69
  DatastoreType.AUDIO: lambda url: functools.partial(
48
70
  ray.data.read_audio,
@@ -753,7 +775,7 @@ class DeltaCatUrl:
753
775
  ):
754
776
  self.stream = StreamFormat.DELTACAT
755
777
  else:
756
- self.stream = StreamFormat(self.stream)
778
+ self.stream = StreamFormat(self.unresolved_stream)
757
779
 
758
780
  def __str__(self):
759
781
  return self.url
@@ -762,6 +784,23 @@ class DeltaCatUrl:
762
784
  return self.url
763
785
 
764
786
 
787
+ def _list_table_versions(table: Table, catalog: CatalogProperties):
788
+ return metastore.list_table_versions(
789
+ namespace=table.namespace,
790
+ table_name=table.table_name,
791
+ catalog=catalog,
792
+ )
793
+
794
+
795
+ def _list_streams(table_version: TableVersion, catalog: CatalogProperties):
796
+ return metastore.list_streams(
797
+ namespace=table_version.namespace,
798
+ table_name=table_version.table_name,
799
+ table_version=table_version.table_version,
800
+ catalog=catalog,
801
+ )
802
+
803
+
765
804
  class DeltaCatUrlReader:
766
805
  def __init__(
767
806
  self,
@@ -1004,14 +1043,11 @@ class DeltaCatUrlReader:
1004
1043
  catalog=url.catalog,
1005
1044
  )
1006
1045
  table_version_lister = functools.partial(
1007
- metastore.list_table_versions,
1008
- namespace=url.namespace,
1046
+ _list_table_versions,
1009
1047
  catalog=url.catalog,
1010
1048
  )
1011
1049
  stream_lister = functools.partial(
1012
- metastore.list_streams,
1013
- namespace=url.namespace,
1014
- table_name=url.table,
1050
+ _list_streams,
1015
1051
  catalog=url.catalog,
1016
1052
  )
1017
1053
  partition_lister = functools.partial(
@@ -1025,8 +1061,8 @@ class DeltaCatUrlReader:
1025
1061
  return [
1026
1062
  (namespace_lister, None, None),
1027
1063
  (table_lister, "namespace", lambda x: x.namespace),
1028
- (table_version_lister, "table_name", lambda x: x.table_name),
1029
- (stream_lister, "table_version", lambda x: x.table_version),
1064
+ (table_version_lister, "table", lambda x: x),
1065
+ (stream_lister, "table_version", lambda x: x),
1030
1066
  (partition_lister, "stream", lambda x: x),
1031
1067
  (delta_lister, "partition_like", lambda x: x),
1032
1068
  ]
@@ -1094,7 +1130,11 @@ def _stage_and_commit_partition(
1094
1130
  namespace=partition.namespace,
1095
1131
  table_name=partition.table_name,
1096
1132
  table_version=partition.table_version,
1097
- stream_format=StreamFormat(partition.stream_format),
1133
+ stream_format=StreamFormat(
1134
+ partition.stream_format or StreamFormat.DELTACAT.value
1135
+ ),
1136
+ *args,
1137
+ **kwargs,
1098
1138
  )
1099
1139
  partition = metastore.stage_partition(
1100
1140
  stream=stream,
@@ -1119,6 +1159,7 @@ class DeltaCatUrlWriter:
1119
1159
  ):
1120
1160
  self._url = url
1121
1161
  self._metafile = metafile
1162
+
1122
1163
  if url.is_deltacat_catalog_url():
1123
1164
  if url.path_elements:
1124
1165
  url.resolve_catalog()
@@ -1170,7 +1211,7 @@ class DeltaCatUrlWriter:
1170
1211
  # TODO(pdames): Honor deep vs. shallow copies. Deep copies require
1171
1212
  # first ensuring that all files in the source delta manifest are
1172
1213
  # staged to the target catalog before commit. For deltas whose
1173
- # manifests reference local files, shallow delta copies will be
1214
+ # manifests reference local files, shallow delta copies may be
1174
1215
  # invalid in the target catalog, and should be blocked or
1175
1216
  # converted to a deep copy automatically.
1176
1217
  return functools.partial(
@@ -1187,6 +1228,7 @@ class DeltaCatUrlWriter:
1187
1228
  stream_id=None,
1188
1229
  stream_format=url.stream,
1189
1230
  partition_values=json.loads(url.partition),
1231
+ partition_id=None,
1190
1232
  )
1191
1233
  return functools.partial(
1192
1234
  _stage_and_commit_partition,
@@ -1219,13 +1261,12 @@ class DeltaCatUrlWriter:
1219
1261
  namespace=table_version.namespace,
1220
1262
  table_name=table_version.table_name,
1221
1263
  table_version=table_version.table_version,
1264
+ lifecycle_state=table_version.state,
1222
1265
  schema=table_version.schema,
1223
1266
  partition_scheme=table_version.partition_scheme,
1224
1267
  sort_keys=table_version.sort_scheme,
1225
1268
  table_version_description=table_version.description,
1226
1269
  table_version_properties=table_version.properties,
1227
- table_description=table_version.description,
1228
- table_properties=table_version.properties,
1229
1270
  supported_content_types=table_version.content_types,
1230
1271
  catalog=url.catalog,
1231
1272
  )
@@ -1236,11 +1277,11 @@ class DeltaCatUrlWriter:
1236
1277
  table_name=url.table,
1237
1278
  )
1238
1279
  return functools.partial(
1239
- metastore.create_table_version,
1280
+ metastore.create_table,
1240
1281
  namespace=table.namespace,
1241
1282
  table_name=table.table_name,
1242
- table_description=table.description,
1243
- table_properties=table.properties,
1283
+ description=table.description,
1284
+ properties=table.properties,
1244
1285
  catalog=url.catalog,
1245
1286
  )
1246
1287
  if url.unresolved_namespace: