deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,18 @@
1
1
  from __future__ import annotations
2
- from typing import List
3
2
  from enum import Enum
3
+ from typing import Dict, Any, Optional
4
4
 
5
5
 
6
6
  class TransformName(str, Enum):
7
7
  IDENTITY = "identity"
8
8
  BUCKET = "bucket"
9
+ YEAR = "year"
10
+ MONTH = "month"
11
+ DAY = "day"
12
+ HOUR = "hour"
13
+ TRUNCATE = "truncate"
14
+ VOID = "void"
15
+ UNKNOWN = "unknown"
9
16
 
10
17
 
11
18
  class TransformParameters(dict):
@@ -17,63 +24,30 @@ class TransformParameters(dict):
17
24
  pass
18
25
 
19
26
 
20
- class IdentityTransformParameters(TransformParameters):
21
- """
22
- This class is used to pass parameters to the identity transform
23
- """
24
-
25
- @staticmethod
26
- def of(column_name: str) -> IdentityTransformParameters:
27
- identify_transform_parameters = IdentityTransformParameters()
28
- identify_transform_parameters["columnName"] = column_name
29
- return identify_transform_parameters
30
-
31
- @property
32
- def column_name(self) -> str:
33
- """
34
- The name of the column to use for identity transform
35
- """
36
- return self["columnName"]
37
-
38
- @column_name.setter
39
- def column_name(self, value: str) -> None:
40
- self["columnName"] = value
41
-
42
-
43
27
  class BucketingStrategy(str, Enum):
44
28
  """
45
29
  A bucketing strategy for the transform
46
30
  """
47
31
 
48
- # Uses default deltacat bucketing strategy.
49
- # This strategy supports hashing on composite keys
50
- # and uses SHA1 hashing for determining the bucket.
51
- # If no columns passed, it will use a random UUID
52
- # for determining the bucket.
32
+ # Default DeltaCAT SHA-1 based hash bucketing strategy.
53
33
  DEFAULT = "default"
54
34
 
55
- # Uses iceberg compliant bucketing strategy.
56
- # As indicated in the iceberg spec, it does not support
57
- # composite keys and uses murmur3 hash for determining
58
- # the bucket.
59
- # See https://iceberg.apache.org/spec/#partitioning
35
+ # Iceberg-compliant murmur3 based hash bucketing strategy.
60
36
  ICEBERG = "iceberg"
61
37
 
62
38
 
63
39
  class BucketTransformParameters(TransformParameters):
64
40
  """
65
- Encapsulates parameters for the bucket transform.
41
+ Parameters for the bucket transform.
66
42
  """
67
43
 
44
+ @staticmethod
68
45
  def of(
69
- self,
70
46
  num_buckets: int,
71
- column_names: List[str],
72
47
  bucketing_strategy: BucketingStrategy,
73
48
  ) -> BucketTransformParameters:
74
49
  bucket_transform_parameters = BucketTransformParameters()
75
50
  bucket_transform_parameters["numBuckets"] = num_buckets
76
- bucket_transform_parameters["columnNames"] = column_names
77
51
  bucket_transform_parameters["bucketingStrategy"] = bucketing_strategy
78
52
 
79
53
  return bucket_transform_parameters
@@ -81,47 +55,210 @@ class BucketTransformParameters(TransformParameters):
81
55
  @property
82
56
  def num_buckets(self) -> int:
83
57
  """
84
- The total number of buckets to create for values of the column
58
+ The total number of buckets to create.
85
59
  """
86
60
  return self["numBuckets"]
87
61
 
88
62
  @property
89
- def column_names(self) -> List[str]:
63
+ def bucketing_strategy(self) -> BucketingStrategy:
90
64
  """
91
- An ordered list of unique column names from the table schema
92
- to use for bucketings.
65
+ The bucketing strategy to use.
93
66
  """
94
- return self["columnNames"]
67
+ return BucketingStrategy(self["bucketingStrategy"])
68
+
69
+
70
+ class TruncateTransformParameters(TransformParameters):
71
+ """
72
+ Parameters for the truncate transform.
73
+ """
74
+
75
+ @staticmethod
76
+ def of(width: int) -> TruncateTransformParameters:
77
+ truncate_transform_parameters = TruncateTransformParameters()
78
+ truncate_transform_parameters["width"] = width
79
+ return truncate_transform_parameters
95
80
 
96
81
  @property
97
- def bucketing_strategy(self) -> BucketingStrategy:
82
+ def width(self) -> int:
98
83
  """
99
- The bucketing strategy to used.
84
+ The width to truncate the field to.
100
85
  """
101
- return self["bucketingStrategy"]
86
+ return self["width"]
102
87
 
103
88
 
104
89
  class Transform(dict):
105
90
  """
106
- A transform is represents how a particular column value can be
107
- transformed into a new value. This is mostly used in the context
108
- of partitioning the data files in a table.
91
+ A transform represents how a particular column value can be
92
+ transformed into a new value. For example, transforms may be used
93
+ to determine partition or sort values for table records.
94
+ """
95
+
96
+ @property
97
+ def name(self) -> TransformName:
98
+ return TransformName(self["name"])
99
+
100
+ @name.setter
101
+ def name(self, name: TransformName) -> None:
102
+ self["name"] = name
103
+
104
+ @property
105
+ def parameters(self) -> Optional[TransformParameters]:
106
+ return NAME_TO_TRANSFORM[self.name].parameters
107
+
108
+ @parameters.setter
109
+ def parameters(
110
+ self,
111
+ parameters: Optional[TransformParameters] = None,
112
+ ) -> None:
113
+ NAME_TO_TRANSFORM[self.name].parameters = parameters
114
+
115
+
116
+ class BucketTransform(Transform):
117
+ """
118
+ A transform that hashes field values into a fixed number of buckets.
109
119
  """
110
120
 
111
121
  @staticmethod
112
- def of(
113
- name: TransformName,
114
- parameters: TransformParameters,
115
- ) -> Transform:
116
- partition_transform = Transform()
117
- partition_transform["name"] = name
118
- partition_transform["parameters"] = parameters
119
- return partition_transform
122
+ def of(parameters: BucketTransformParameters) -> BucketTransform:
123
+ transform = BucketTransform()
124
+ transform.name = TransformName.BUCKET
125
+ transform.parameters = parameters
126
+ return transform
120
127
 
121
128
  @property
122
- def name(self) -> TransformName:
123
- return self["name"]
129
+ def parameters(self) -> BucketTransformParameters:
130
+ val: Dict[str, Any] = self.get("parameters")
131
 + if val is not None and not isinstance(val, BucketTransformParameters):
132
+ self["parameters"] = val = BucketTransformParameters(val)
133
+ return val
134
+
135
+ @parameters.setter
136
+ def parameters(
137
+ self,
138
+ parameters: Optional[BucketTransformParameters] = None,
139
+ ) -> None:
140
+ self["parameters"] = parameters
141
+
142
+
143
+ class TruncateTransform(Transform):
144
+ """
145
+ A transform that truncates field values to a fixed width.
146
+ """
147
+
148
+ @staticmethod
149
+ def of(parameters: TruncateTransformParameters) -> TruncateTransform:
150
+ transform = TruncateTransform()
151
+ transform.name = TransformName.TRUNCATE
152
+ transform.parameters = parameters
153
+ return transform
124
154
 
125
155
  @property
126
- def parameters(self) -> TransformParameters:
127
- return self["parameters"]
156
+ def parameters(self) -> TruncateTransformParameters:
157
+ val: Dict[str, Any] = self.get("parameters")
158
+ if val is not None and not isinstance(val, TruncateTransformParameters):
159
+ self["parameters"] = val = TruncateTransformParameters(val)
160
+ return val
161
+
162
+ @parameters.setter
163
+ def parameters(
164
+ self,
165
+ parameters: Optional[TruncateTransformParameters] = None,
166
+ ) -> None:
167
+ self["parameters"] = parameters
168
+
169
+
170
+ class IdentityTransform(Transform):
171
+ """
172
+ A no-op transform that returns unmodified field values.
173
+ """
174
+
175
+ @staticmethod
176
+ def of() -> IdentityTransform:
177
+ transform = IdentityTransform()
178
+ transform.name = TransformName.IDENTITY
179
+ return transform
180
+
181
+
182
+ class HourTransform(Transform):
183
+ """
184
+ A transform that returns the hour of a datetime value.
185
+ """
186
+
187
+ @staticmethod
188
+ def of() -> HourTransform:
189
+ transform = HourTransform()
190
+ transform.name = TransformName.HOUR
191
+ return transform
192
+
193
+
194
+ class DayTransform(Transform):
195
+ """
196
+ A transform that returns the day of a datetime value.
197
+ """
198
+
199
+ @staticmethod
200
+ def of() -> DayTransform:
201
+ transform = DayTransform()
202
+ transform.name = TransformName.DAY
203
+ return transform
204
+
205
+
206
+ class MonthTransform(Transform):
207
+ """
208
+ A transform that returns the month of a datetime value.
209
+ """
210
+
211
+ @staticmethod
212
+ def of() -> MonthTransform:
213
+ transform = MonthTransform()
214
+ transform.name = TransformName.MONTH
215
+ return transform
216
+
217
+
218
+ class YearTransform(Transform):
219
+ """
220
+ A transform that returns the year of a datetime value.
221
+ """
222
+
223
+ @staticmethod
224
+ def of() -> YearTransform:
225
+ transform = YearTransform()
226
+ transform.name = TransformName.YEAR
227
+ return transform
228
+
229
+
230
+ class VoidTransform(Transform):
231
+ """
232
+ A transform that coerces all field values to None.
233
+ """
234
+
235
+ @staticmethod
236
+ def of() -> VoidTransform:
237
+ transform = VoidTransform()
238
+ transform.name = TransformName.VOID
239
+ return transform
240
+
241
+
242
+ class UnknownTransform(Transform):
243
+ """
244
+ An unknown or invalid transform.
245
+ """
246
+
247
+ @staticmethod
248
+ def of() -> UnknownTransform:
249
+ transform = UnknownTransform()
250
+ transform.name = TransformName.UNKNOWN
251
+ return transform
252
+
253
+
254
+ NAME_TO_TRANSFORM: Dict[TransformName, Transform] = {
255
+ TransformName.IDENTITY: IdentityTransform,
256
+ TransformName.BUCKET: BucketTransform,
257
+ TransformName.YEAR: YearTransform,
258
+ TransformName.MONTH: MonthTransform,
259
+ TransformName.DAY: DayTransform,
260
+ TransformName.HOUR: HourTransform,
261
+ TransformName.TRUNCATE: TruncateTransform,
262
+ TransformName.VOID: VoidTransform,
263
+ TransformName.UNKNOWN: UnknownTransform,
264
+ }
@@ -1,25 +1,100 @@
1
+ from __future__ import annotations
2
+
1
3
  from enum import Enum
2
4
  from typing import List, Union
3
5
 
4
- from pyarrow.parquet import ParquetFile
5
6
  import numpy as np
6
7
  import pandas as pd
7
8
  import pyarrow as pa
8
9
  from ray.data.dataset import Dataset
9
10
  from daft import DataFrame as DaftDataFrame
10
11
 
11
- LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
12
+
13
+ LocalTable = Union[
14
+ pa.Table,
15
+ pd.DataFrame,
16
+ np.ndarray,
17
+ pa.parquet.ParquetFile,
18
+ ]
12
19
  LocalDataset = List[LocalTable]
13
20
  DistributedDataset = Union[Dataset, DaftDataFrame]
14
21
 
15
22
 
23
+ class StreamFormat(str, Enum):
24
+ DELTACAT = "deltacat"
25
+ ICEBERG = "iceberg"
26
+ HUDI = "hudi"
27
+ DELTA_LAKE = "delta_lake"
28
+ SQLITE3 = "SQLITE3" # used by tests
29
+
30
+
16
31
  class DeltaType(str, Enum):
17
32
  APPEND = "append"
18
33
  UPSERT = "upsert"
19
34
  DELETE = "delete"
20
35
 
21
36
 
37
+ class TransactionType(str, Enum):
38
+ # the transaction reads existing data
39
+ # does not conflict with any other transaction types
40
+ READ = "read"
41
+ # the transaction only appends new data
42
+ # conflicts with other transaction types can be auto-resolved
43
+ APPEND = "append"
44
+ # the transaction alters existing data
45
+ # (even if it also appends data)
46
+ # conflicts with other alters/overwrites/restates/deletes fail
47
+ ALTER = "alter"
48
+ # the transaction overwrites existing data
49
+ # (even if it also appends or alters data)
50
+ # conflicts with other alters/overwrites/restates/deletes fail
51
+ OVERWRITE = "overwrite"
52
+ # the transaction restates existing data with a new layout
53
+ # (even if it appends, alters, or overwrites data to do so)
54
+ # conflicts with other alters/overwrites/restates/deletes fail
55
+ RESTATE = "restate"
56
+ # the transaction deletes existing data
57
+ # (even if it also appends, alters, overwrites, or restates data)
58
+ # conflicts with other alters/overwrites/restates/deletes fail
59
+ DELETE = "delete"
60
+
61
+
62
+ class TransactionOperationType(str, Enum):
63
+ CREATE = "create"
64
+ UPDATE = "update"
65
+ DELETE = "delete"
66
+
67
+ READ_SIBLINGS = "read_siblings"
68
+ READ_CHILDREN = "read_children"
69
+ READ_LATEST = "read_latest"
70
+ READ_EXISTS = "read_exists"
71
+
72
+ @staticmethod
73
+ def write_operations():
74
+ return {
75
+ TransactionOperationType.CREATE,
76
+ TransactionOperationType.UPDATE,
77
+ TransactionOperationType.DELETE,
78
+ }
79
+
80
+ @staticmethod
81
+ def read_operations():
82
+ return {
83
+ TransactionOperationType.READ_SIBLINGS,
84
+ TransactionOperationType.READ_CHILDREN,
85
+ TransactionOperationType.READ_LATEST,
86
+ TransactionOperationType.READ_EXISTS,
87
+ }
88
+
89
+ def is_write_operation(self) -> bool:
90
+ return self in TransactionOperationType.write_operations()
91
+
92
+ def is_read_operation(self) -> bool:
93
 + return self in TransactionOperationType.read_operations()
94
+
95
+
22
96
  class LifecycleState(str, Enum):
97
+ CREATED = "created"
23
98
  UNRELEASED = "unreleased"
24
99
  ACTIVE = "active"
25
100
  DEPRECATED = "deprecated"
@@ -35,22 +110,45 @@ class CommitState(str, Enum):
35
110
 
36
111
  class SchemaConsistencyType(str, Enum):
37
112
  """
38
- Schemas are optional for DeltaCAT tables and can be used to inform the data
39
- consistency checks run for each field. If a schema is present, it can be
40
- used to enforce the following column-level data consistency policies at
41
- table load time:
113
+ DeltaCAT table schemas can be used to inform the data consistency checks
114
+ run for each field. When present, the schema can be used to enforce the
115
+ following field-level data consistency policies at table load time:
42
116
 
43
- NONE: No consistency checks are run. May be mixed with the below two
44
- policies by specifying column names to pass through together with
45
- column names to coerce/validate.
117
+ NONE: No consistency checks are run.
46
118
 
47
- COERCE: Coerce fields to fit the schema whenever possible. An explicit
48
- subset of column names to coerce may optionally be specified.
119
+ COERCE: Coerce fields to fit the schema whenever possible.
49
120
 
50
- VALIDATE: Raise an error for any fields that don't fit the schema. An
51
- explicit subset of column names to validate may optionally be specified.
121
+ VALIDATE: Raise an error for any fields that don't fit the schema.
52
122
  """
53
123
 
54
124
  NONE = "none"
55
125
  COERCE = "coerce"
56
126
  VALIDATE = "validate"
127
+
128
+
129
+ class SortOrder(str, Enum):
130
+ ASCENDING = "ascending"
131
+ DESCENDING = "descending"
132
+
133
+ @classmethod
134
+ def _missing_(cls, value: str):
135
+ # pyiceberg.table.sorting.SortDirection mappings
136
+ if value.lower() == "asc":
137
+ return SortOrder.ASCENDING
138
+ elif value.lower() == "desc":
139
+ return SortOrder.DESCENDING
140
+ return None
141
+
142
+
143
+ class NullOrder(str, Enum):
144
+ AT_START = "at_start"
145
+ AT_END = "at_end"
146
+
147
+ @classmethod
148
+ def _missing_(cls, value: str):
149
+ # pyiceberg.table.sorting.NullOrder mappings
150
+ if value.lower() == "nulls-first":
151
+ return NullOrder.AT_START
152
+ elif value.lower() == "nulls-last":
153
+ return NullOrder.AT_END
154
+ return None
@@ -0,0 +1,11 @@
1
+ from .schema.schema import Schema
2
+ from .schema.schema import Field
3
+ from .dataset import Dataset
4
+ from .schema.schema import Datatype
5
+
6
+ __all__ = [
7
+ "Schema",
8
+ "Field",
9
+ "Dataset",
10
+ "Datatype",
11
+ ]
File without changes
@@ -0,0 +1,75 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Iterator, List, Any
3
+ import pyarrow as pa
4
+
5
+ from deltacat.storage.rivulet.metastore.sst import SSTableRow
6
+ from deltacat.storage.rivulet import Schema
7
+ from deltacat.storage.rivulet.serializer import DataSerializer, MEMTABLE_DATA
8
+ from deltacat.storage.rivulet.fs.file_provider import FileProvider
9
+
10
+
11
+ class ArrowSerializer(DataSerializer, ABC):
12
+ """
13
+ Utility class which can serialize data by first converting to arrow as intermediate format
14
+ and then using the provided serialization function
15
+ """
16
+
17
+ def __init__(self, file_provider: FileProvider, schema: Schema):
18
+ self.schema = schema
19
+ self.file_provider = file_provider
20
+ self.arrow_schema = self.schema.to_pyarrow()
21
+
22
+ @abstractmethod
23
+ def serialize(self, table: pa.Table) -> List[SSTableRow]:
24
+ """
25
+ Write an Arrow table to the specified output file
26
+
27
+ :param table: PyArrow table to write
28
+ :return: Number of row groups in the written file
29
+ """
30
+ pass
31
+
32
+ def _to_arrow_table(self, sorted_records: MEMTABLE_DATA) -> pa.Table:
33
+ """
34
+ Convert input records to an Arrow table
35
+ """
36
+ if isinstance(sorted_records, pa.Table):
37
+ return sorted_records
38
+ elif isinstance(sorted_records, (Iterator, List)):
39
+ return pa.Table.from_pylist(sorted_records, schema=self.arrow_schema)
40
+ else:
41
+ raise ValueError(f"Unsupported record type: {type(sorted_records)}")
42
+
43
+ def _get_min_max_key(self, table: pa.Table) -> (Any, Any):
44
+ """
45
+ Get min and max values for the merge key from the table
46
+ """
47
+ key_col = table[self.schema.get_merge_key()]
48
+ return key_col[0].as_py(), key_col[len(key_col) - 1].as_py()
49
+
50
+ def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
51
+ """
52
+ Write records to new parquet file as row group
53
+ For now, we will depend on pyarrow to write to parquet
54
+
55
+ :param sorted_records: record batch in SORTED ORDER
56
+ :return: metadata for constructing SSTable
57
+ """
58
+ if not sorted_records:
59
+ return []
60
+
61
+ table = self._to_arrow_table(sorted_records)
62
+ return self.serialize(table)
63
+
64
+ def write(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
65
+ """
66
+ Write records using the provided serialization function
67
+
68
+ :param sorted_records: record batch in SORTED ORDER
69
+ :return: metadata for constructing SSTable
70
+ """
71
+ if not sorted_records:
72
+ return []
73
+
74
+ table = self._to_arrow_table(sorted_records)
75
+ return self.serialize(table)