deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,11 @@
1
1
  from __future__ import annotations
2
+
3
+ import base64
2
4
  from enum import Enum
3
5
  from typing import Dict, Any, Optional
6
+ import pyarrow as pa
7
+
8
+ from deltacat.constants import METAFILE_FORMAT, METAFILE_FORMAT_JSON
4
9
 
5
10
 
6
11
  class TransformName(str, Enum):
@@ -36,6 +41,18 @@ class BucketingStrategy(str, Enum):
36
41
  ICEBERG = "iceberg"
37
42
 
38
43
 
44
+ class TruncateStrategy(str, Enum):
45
+ """
46
+ A truncation strategy for the transform
47
+ """
48
+
49
+ # Default DeltaCAT truncate strategy.
50
+ DEFAULT = "default"
51
+
52
+ # Iceberg-compliant truncate strategy.
53
+ ICEBERG = "iceberg"
54
+
55
+
39
56
  class BucketTransformParameters(TransformParameters):
40
57
  """
41
58
  Parameters for the bucket transform.
@@ -44,7 +61,7 @@ class BucketTransformParameters(TransformParameters):
44
61
  @staticmethod
45
62
  def of(
46
63
  num_buckets: int,
47
- bucketing_strategy: BucketingStrategy,
64
+ bucketing_strategy: BucketingStrategy = BucketingStrategy.DEFAULT,
48
65
  ) -> BucketTransformParameters:
49
66
  bucket_transform_parameters = BucketTransformParameters()
50
67
  bucket_transform_parameters["numBuckets"] = num_buckets
@@ -73,9 +90,13 @@ class TruncateTransformParameters(TransformParameters):
73
90
  """
74
91
 
75
92
  @staticmethod
76
- def of(width: int) -> TruncateTransformParameters:
93
+ def of(
94
+ width: int,
95
+ truncate_strategy: TruncateStrategy = TruncateStrategy.DEFAULT,
96
+ ) -> TruncateTransformParameters:
77
97
  truncate_transform_parameters = TruncateTransformParameters()
78
98
  truncate_transform_parameters["width"] = width
99
+ truncate_transform_parameters["truncateStrategy"] = truncate_strategy
79
100
  return truncate_transform_parameters
80
101
 
81
102
  @property
@@ -85,6 +106,13 @@ class TruncateTransformParameters(TransformParameters):
85
106
  """
86
107
  return self["width"]
87
108
 
109
+ @property
110
+ def truncate_strategy(self) -> TruncateStrategy:
111
+ """
112
+ The truncate strategy to use.
113
+ """
114
+ return TruncateStrategy(self["truncateStrategy"])
115
+
88
116
 
89
117
  class Transform(dict):
90
118
  """
@@ -112,10 +140,44 @@ class Transform(dict):
112
140
  ) -> None:
113
141
  NAME_TO_TRANSFORM[self.name].parameters = parameters
114
142
 
143
+ @property
144
+ def return_type(self) -> Optional[pa.DataType]:
145
+ """
146
+ The PyArrow data type that this transform returns.
147
+ A return value of "None" indicates that the return type is the same
148
+ as the source type. Transforms that always return null return pa.null().
149
+ """
150
+ return_type = self.get("return_type")
151
+ if return_type is not None:
152
+ schema_bytes = (
153
+ base64.b64decode(return_type)
154
+ if METAFILE_FORMAT == METAFILE_FORMAT_JSON
155
+ else return_type
156
+ )
157
+ return_type = pa.ipc.read_schema(
158
+ pa.py_buffer(schema_bytes),
159
+ )[0].type
160
+ return return_type
161
+
162
+ @return_type.setter
163
+ def return_type(self, return_type: pa.Schema) -> None:
164
+ """
165
+ Set the PyArrow data type that this transform returns.
166
+ """
167
+ self["return_type"] = return_type.serialize().to_pybytes()
168
+
169
+ @property
170
+ def is_multi_field_transform(self) -> bool:
171
+ """
172
+ Whether this transform is a multi-field transform.
173
+ """
174
+ return False
175
+
115
176
 
116
177
  class BucketTransform(Transform):
117
178
  """
118
179
  A transform that hashes field values into a fixed number of buckets.
180
+ Returns a PyArrow int32() type.
119
181
  """
120
182
 
121
183
  @staticmethod
@@ -123,6 +185,7 @@ class BucketTransform(Transform):
123
185
  transform = BucketTransform()
124
186
  transform.name = TransformName.BUCKET
125
187
  transform.parameters = parameters
188
+ transform.return_type = pa.schema([("return_type", pa.int32())])
126
189
  return transform
127
190
 
128
191
  @property
@@ -139,10 +202,15 @@ class BucketTransform(Transform):
139
202
  ) -> None:
140
203
  self["parameters"] = parameters
141
204
 
205
+ @property
206
+ def is_multi_field_transform(self) -> bool:
207
+ return True
208
+
142
209
 
143
210
  class TruncateTransform(Transform):
144
211
  """
145
212
  A transform that truncates field values to a fixed width.
213
+ Returns the same type as the input field.
146
214
  """
147
215
 
148
216
  @staticmethod
@@ -170,6 +238,7 @@ class TruncateTransform(Transform):
170
238
  class IdentityTransform(Transform):
171
239
  """
172
240
  A no-op transform that returns unmodified field values.
241
+ Returns the same PyArrow type as the input.
173
242
  """
174
243
 
175
244
  @staticmethod
@@ -182,60 +251,70 @@ class IdentityTransform(Transform):
182
251
  class HourTransform(Transform):
183
252
  """
184
253
  A transform that returns the hour of a datetime value.
254
+ Returns a PyArrow int32 type representing the hour (0-23).
185
255
  """
186
256
 
187
257
  @staticmethod
188
258
  def of() -> HourTransform:
189
259
  transform = HourTransform()
190
260
  transform.name = TransformName.HOUR
261
+ transform.return_type = pa.schema([("return_type", pa.int32())])
191
262
  return transform
192
263
 
193
264
 
194
265
  class DayTransform(Transform):
195
266
  """
196
267
  A transform that returns the day of a datetime value.
268
+ Returns a PyArrow int32 type representing the day (1-31).
197
269
  """
198
270
 
199
271
  @staticmethod
200
272
  def of() -> DayTransform:
201
273
  transform = DayTransform()
202
274
  transform.name = TransformName.DAY
275
+ transform.return_type = pa.schema([("return_type", pa.int32())])
203
276
  return transform
204
277
 
205
278
 
206
279
  class MonthTransform(Transform):
207
280
  """
208
281
  A transform that returns the month of a datetime value.
282
+ Returns a PyArrow int32 type representing the month (1-12).
209
283
  """
210
284
 
211
285
  @staticmethod
212
286
  def of() -> MonthTransform:
213
287
  transform = MonthTransform()
214
288
  transform.name = TransformName.MONTH
289
+ transform.return_type = pa.schema([("return_type", pa.int32())])
215
290
  return transform
216
291
 
217
292
 
218
293
  class YearTransform(Transform):
219
294
  """
220
295
  A transform that returns the year of a datetime value.
296
+ Returns a PyArrow int32 type representing the year.
221
297
  """
222
298
 
223
299
  @staticmethod
224
300
  def of() -> YearTransform:
225
301
  transform = YearTransform()
226
302
  transform.name = TransformName.YEAR
303
+ transform.return_type = pa.schema([("return_type", pa.int32())])
227
304
  return transform
228
305
 
229
306
 
230
307
  class VoidTransform(Transform):
231
308
  """
232
309
  A transform that coerces all field values to None.
310
+ Returns a PyArrow null type.
233
311
  """
234
312
 
235
313
  @staticmethod
236
314
  def of() -> VoidTransform:
237
315
  transform = VoidTransform()
238
316
  transform.name = TransformName.VOID
317
+ transform.return_type = pa.schema([("return_type", pa.null())])
239
318
  return transform
240
319
 
241
320
 
@@ -6,13 +6,21 @@ from typing import List, Union
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  import pyarrow as pa
9
+ import polars as pl
9
10
  from ray.data.dataset import Dataset as RayDataset
10
11
  from daft import DataFrame as DaftDataFrame
11
12
 
13
+ from deltacat.constants import (
14
+ RUNNING_TXN_DIR_NAME,
15
+ PAUSED_TXN_DIR_NAME,
16
+ FAILED_TXN_DIR_NAME,
17
+ SUCCESS_TXN_DIR_NAME,
18
+ )
12
19
 
13
20
  LocalTable = Union[
14
21
  pa.Table,
15
22
  pd.DataFrame,
23
+ pl.DataFrame,
16
24
  np.ndarray,
17
25
  pa.parquet.ParquetFile,
18
26
  ]
@@ -36,34 +44,10 @@ class DeltaType(str, Enum):
36
44
  DELETE = "delete"
37
45
 
38
46
 
39
- class TransactionType(str, Enum):
40
- # the transaction reads existing data
41
- # does not conflict with any other transaction types
42
- READ = "read"
43
- # the transaction only appends new data
44
- # conflicts with other transaction types can be auto-resolved
45
- APPEND = "append"
46
- # the transaction alters existing data
47
- # (even if it also appends data)
48
- # conflicts with other alters/overwrites/restates/deletes fail
49
- ALTER = "alter"
50
- # the transaction overwrites existing data
51
- # (even if it also appends or alters data)
52
- # conflicts with other alters/overwrites/restates/deletes fail
53
- OVERWRITE = "overwrite"
54
- # the transaction restates existing data with a new layout
55
- # (even if it appends, alters, or overwrites data to do so)
56
- # conflicts with other alters/overwrites/restates/deletes fail
57
- RESTATE = "restate"
58
- # the transaction deletes existing data
59
- # (even if it also appends, alters, overwrites, or restates data)
60
- # conflicts with other alters/overwrites/restates/deletes fail
61
- DELETE = "delete"
62
-
63
-
64
47
  class TransactionOperationType(str, Enum):
65
48
  CREATE = "create"
66
49
  UPDATE = "update"
50
+ REPLACE = "replace"
67
51
  DELETE = "delete"
68
52
 
69
53
  READ_SIBLINGS = "read_siblings"
@@ -76,6 +60,7 @@ class TransactionOperationType(str, Enum):
76
60
  return {
77
61
  TransactionOperationType.CREATE,
78
62
  TransactionOperationType.UPDATE,
63
+ TransactionOperationType.REPLACE,
79
64
  TransactionOperationType.DELETE,
80
65
  }
81
66
 
@@ -92,7 +77,44 @@ class TransactionOperationType(str, Enum):
92
77
  return self in TransactionOperationType.write_operations()
93
78
 
94
79
  def is_read_operation(self) -> bool:
95
- return self in TransactionOperationType.read_operatins()
80
+ return self in TransactionOperationType.read_operations()
81
+
82
+
83
+ class TransactionStatus(str, Enum):
84
+ """
85
+ Transaction user status types. Every transaction status maps to a distinct
86
+ transaction log directory.
87
+ """
88
+
89
+ SUCCESS = "SUCCESS"
90
+ RUNNING = "RUNNING"
91
+ PAUSED = "PAUSED"
92
+ FAILED = "FAILED"
93
+
94
+ def dir_name(self) -> str:
95
+ if self == TransactionStatus.RUNNING:
96
+ return RUNNING_TXN_DIR_NAME
97
+ elif self == TransactionStatus.PAUSED:
98
+ return PAUSED_TXN_DIR_NAME
99
+ elif self == TransactionStatus.FAILED:
100
+ return FAILED_TXN_DIR_NAME
101
+ elif self == TransactionStatus.SUCCESS:
102
+ return SUCCESS_TXN_DIR_NAME
103
+
104
+
105
+ class TransactionState(str, Enum):
106
+ """
107
+ Transaction system state types. Transaction states do not map to distinct transaction log directories,
108
+ but can be inferred by its presence in one or more directories. These states are used to infer whether
109
+ to run system activities like transaction cleanup jobs.
110
+ """
111
+
112
+ FAILED = "FAILED"
113
+ PURGED = "PURGED"
114
+ TIMEOUT = "TIMEOUT"
115
+ RUNNING = "RUNNING"
116
+ SUCCESS = "SUCCESS"
117
+ PAUSED = "PAUSED"
96
118
 
97
119
 
98
120
  class LifecycleState(str, Enum):
@@ -45,7 +45,11 @@ class TestCloudpickleBugFix(unittest.TestCase):
45
45
  def test_sanity(self):
46
46
  ray.init(local_mode=True, ignore_reinit_error=True)
47
47
 
48
- result = ray.get(calculate_pickled_length.remote(AnyObject()))
49
-
50
- self.assertTrue(result[0] < 1000)
51
- self.assertTrue(result[1] >= 5000000)
48
+ try:
49
+ result = ray.get(calculate_pickled_length.remote(AnyObject()))
50
+
51
+ self.assertTrue(result[0] < 1000)
52
+ self.assertTrue(result[1] >= 5000000)
53
+ finally:
54
+ if ray.is_initialized():
55
+ ray.shutdown()
@@ -2,9 +2,8 @@ import unittest
2
2
 
3
3
  import botocore
4
4
 
5
- from deltacat.aws.constants import RETRYABLE_TRANSIENT_ERRORS
6
- from deltacat.aws.s3u import UuidBlockWritePathProvider, CapturedBlockWritePaths
7
-
5
+ from deltacat.constants import RETRYABLE_TRANSIENT_ERRORS
6
+ from deltacat.types.tables import CapturedBlockWritePaths, UuidBlockWritePathProvider
8
7
 
9
8
  import os
10
9
  from unittest import mock
@@ -99,34 +98,6 @@ class TestDownloadUpload(unittest.TestCase):
99
98
 
100
99
  assert mock_s3.put_object.call_count > 3
101
100
 
102
- @patch("deltacat.aws.s3u.UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 1)
103
- @patch("deltacat.aws.s3u.ManifestEntry")
104
- @patch("deltacat.aws.s3u._get_metadata")
105
- @patch("deltacat.aws.s3u.CapturedBlockWritePaths")
106
- def test_upload_sliced_table_retry(
107
- self,
108
- mock_captured_block_write_paths,
109
- mock_get_metadata,
110
- mock_manifest_entry,
111
- ):
112
- mock_manifest_entry.from_s3_obj_url.side_effect = OSError(
113
- "Please reduce your request rate.."
114
- )
115
- mock_get_metadata.return_value = [mock.MagicMock()]
116
- cbwp = CapturedBlockWritePaths()
117
- cbwp._write_paths = ["s3_write_path"]
118
- cbwp._block_refs = [mock.MagicMock()]
119
- mock_captured_block_write_paths.return_value = cbwp
120
- with pytest.raises(RetryError):
121
- s3u.upload_sliced_table(
122
- mock.MagicMock(),
123
- "s3-prefix",
124
- mock.MagicMock(),
125
- mock.MagicMock(),
126
- mock.MagicMock(),
127
- mock.MagicMock(),
128
- )
129
-
130
101
  @patch("deltacat.aws.s3u.UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 1)
131
102
  @patch("deltacat.aws.s3u.s3_client_cache")
132
103
  def test_upload_transient_error_retry(self, mock_s3_client_cache):
File without changes
File without changes
@@ -0,0 +1,130 @@
1
+ import shutil
2
+
3
+ from deltacat.catalog import get_catalog_properties
4
+ from deltacat.exceptions import NamespaceAlreadyExistsError
5
+ import pytest
6
+ import tempfile
7
+ import deltacat.catalog.main.impl as catalog
8
+
9
+
10
class TestCatalogNamespaceOperations:
    """
    Namespace create/get/exists/drop tests against the main catalog
    implementation.

    All tests share one set of catalog properties rooted at a temporary
    directory that is created in ``setup_class`` and removed in
    ``teardown_class``. Each test uses a namespace name of its own, so the
    tests do not collide inside the shared root.
    """

    # Both are populated in setup_class.
    temp_dir = None
    catalog_properties = None

    @classmethod
    def setup_class(cls):
        """Create the temporary root and the catalog properties pointing at it."""
        cls.temp_dir = tempfile.mkdtemp()
        cls.catalog_properties = get_catalog_properties(root=cls.temp_dir)

    @classmethod
    def teardown_class(cls):
        """Remove the temporary root and everything written beneath it."""
        shutil.rmtree(cls.temp_dir)

    def test_create_namespace(self):
        """Test creating a namespace with properties"""
        namespace_name = "test_create_namespace"
        properties = {"description": "Test Namespace", "owner": "test-user"}

        # Create namespace
        catalog.create_namespace(
            namespace=namespace_name,
            properties=properties,
            inner=self.catalog_properties,
        )

        # Verify namespace exists
        assert catalog.namespace_exists(namespace_name, inner=self.catalog_properties)

        # Get namespace and verify properties
        created = catalog.get_namespace(namespace_name, inner=self.catalog_properties)
        assert created.namespace == "test_create_namespace"
        assert created.properties["description"] == "Test Namespace"

    def test_get_namespace(self):
        """Test getting namespace properties"""
        namespace_name = "test_get_namespace"
        properties = {"description": "foo", "created_by": "bar"}

        # Create namespace
        catalog.create_namespace(
            namespace=namespace_name,
            properties=properties,
            inner=self.catalog_properties,
        )

        # Get namespace properties
        fetched = catalog.get_namespace(namespace_name, inner=self.catalog_properties)

        # Verify properties
        assert fetched.namespace == "test_get_namespace"
        assert fetched.properties["created_by"] == "bar"

    def test_namespace_exists(self):
        """Test checking if a namespace exists"""
        existing_namespace = "test_namespace_exists"
        non_existing_namespace = "non_existing_namespace"

        # Create namespace
        catalog.create_namespace(
            namespace=existing_namespace, properties={}, inner=self.catalog_properties
        )

        # Check existing namespace
        assert catalog.namespace_exists(
            existing_namespace, inner=self.catalog_properties
        )

        # Check non-existing namespace
        assert not catalog.namespace_exists(
            non_existing_namespace, inner=self.catalog_properties
        )

    def test_create_namespace_already_exists(self):
        """Test creating a namespace that already exists should fail"""
        namespace = "test_create_namespace_already_exists"
        properties = {"description": "Test namespace", "owner": "test-user"}

        # Create namespace first time
        catalog.create_namespace(
            namespace=namespace,
            properties=properties,
            inner=self.catalog_properties,
        )

        # Verify namespace exists
        assert catalog.namespace_exists(namespace, inner=self.catalog_properties)

        # Creating the same namespace again should raise
        # NamespaceAlreadyExistsError with the namespace name in its message
        with pytest.raises(NamespaceAlreadyExistsError, match=namespace):
            catalog.create_namespace(
                namespace=namespace,
                properties=properties,
                inner=self.catalog_properties,
            )

    def test_drop_namespace(self):
        """Test dropping a namespace"""
        namespace = "test_drop_namespace"
        properties = {"description": "Test Namespace", "owner": "test-user"}

        # Create namespace
        catalog.create_namespace(
            namespace=namespace,
            properties=properties,
            inner=self.catalog_properties,
        )

        # Verify namespace exists
        assert catalog.namespace_exists(
            namespace,
            inner=self.catalog_properties,
        )

        # Drop namespace
        catalog.drop_namespace(
            namespace,
            inner=self.catalog_properties,
        )

        # Verify namespace does not exist
        assert not catalog.namespace_exists(
            namespace,
            inner=self.catalog_properties,
        )