deltacat-1.1.36-py3-none-any.whl → deltacat-2.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, exactly as they appear in their public registries. It is provided for informational purposes only.
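To reproduce or drill into this comparison locally, the published wheels can be downloaded and unpacked with standard tooling. The sketch below is illustrative only; the working directory layout and the use of GNU diff are assumptions, not part of any registry tooling, and it requires network access.

import pathlib
import subprocess
import zipfile

work = pathlib.Path("wheel_diff")
work.mkdir(exist_ok=True)
for version in ("1.1.36", "2.0"):
    # Fetch only the deltacat wheel itself, without dependencies.
    subprocess.run(
        ["pip", "download", f"deltacat=={version}", "--no-deps", "-d", str(work)],
        check=True,
    )
    wheel = next(work.glob(f"deltacat-{version}-*.whl"))
    zipfile.ZipFile(wheel).extractall(work / version)

# Compare the unpacked trees (requires GNU diffutils on PATH).
subprocess.run(["diff", "-ruN", str(work / "1.1.36"), str(work / "2.0")])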
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/catalog/default_catalog_impl/__init__.py DELETED
@@ -1,369 +0,0 @@
- from typing import Any, Dict, List, Optional, Set, Union, Tuple
- import pyarrow as pa
- import logging
- from deltacat.catalog.model.table_definition import TableDefinition
- from deltacat.storage.model.sort_key import SortKey
- from deltacat.storage.model.list_result import ListResult
- from deltacat.storage.model.namespace import Namespace
- from deltacat.storage.model.types import (
-     DistributedDataset,
-     LifecycleState,
-     LocalDataset,
-     LocalTable,
-     SchemaConsistencyType,
- )
- from deltacat.storage.model.partition import PartitionLocator, Partition
- from deltacat.storage.model.table_version import TableVersion
- from deltacat.compute.merge_on_read.model.merge_on_read_params import MergeOnReadParams
- from deltacat.storage.model.delta import DeltaType
- import deltacat.storage.interface as deltacat_storage
- from deltacat.types.media import ContentType, TableType, DistributedDatasetType
- from deltacat.types.tables import TableWriteMode
- from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
- from deltacat import logs
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
- STORAGE = None
-
-
- # table functions
- def write_to_table(
-     data: Union[LocalTable, LocalDataset, DistributedDataset],  # type: ignore
-     table: str,
-     namespace: Optional[str] = None,
-     mode: TableWriteMode = TableWriteMode.AUTO,
-     content_type: ContentType = ContentType.PARQUET,
-     *args,
-     **kwargs,
- ) -> None:
-     """Write local or distributed data to a table. Raises an error if the
-     table does not exist and the table write mode is not CREATE or AUTO.
-
-     When creating a table, all `create_table` parameters may be optionally
-     specified as additional keyword arguments. When appending to, or replacing,
-     an existing table, all `alter_table` parameters may be optionally specified
-     as additional keyword arguments."""
-     raise NotImplementedError("write_to_table not implemented")
-
-
- def read_table(
-     table: str,
-     namespace: Optional[str] = None,
-     table_version: Optional[str] = None,
-     table_type: Optional[TableType] = TableType.PYARROW,
-     distributed_dataset_type: Optional[
-         DistributedDatasetType
-     ] = DistributedDatasetType.RAY_DATASET,
-     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
-     stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
-     merge_on_read: Optional[bool] = False,
-     reader_kwargs: Optional[Dict[Any, Any]] = None,
-     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
-     *args,
-     **kwargs,
- ) -> DistributedDataset:  # type: ignore
-     """Read a table into a distributed dataset."""
-
-     if reader_kwargs is None:
-         reader_kwargs = {}
-
-     if deltacat_storage_kwargs is None:
-         deltacat_storage_kwargs = {}
-
-     _validate_read_table_args(
-         namespace=namespace,
-         table_type=table_type,
-         distributed_dataset_type=distributed_dataset_type,
-         merge_on_read=merge_on_read,
-     )
-
-     table_version_obj = _get_latest_or_given_table_version(
-         namespace=namespace,
-         table_name=table,
-         table_version=table_version,
-         **deltacat_storage_kwargs,
-     )
-     table_version = table_version_obj.table_version
-
-     if (
-         table_version_obj.content_types is None
-         or len(table_version_obj.content_types) != 1
-     ):
-         raise ValueError(
-             "Expected exactly one content type but "
-             f"found {table_version_obj.content_types}."
-         )
-
-     logger.info(
-         f"Reading metadata for table={namespace}/{table}/{table_version} "
-         f"with partition_filters={partition_filter} and stream position"
-         f" range={stream_position_range_inclusive}"
-     )
-
-     if partition_filter is None:
-         logger.info(
-             f"Reading all partitions metadata in the table={table} "
-             "as partition_filter was None."
-         )
-         partition_filter = STORAGE.list_partitions(
-             table_name=table,
-             namespace=namespace,
-             table_version=table_version,
-             **deltacat_storage_kwargs,
-         ).all_items()
-
-     qualified_deltas = _get_deltas_from_partition_filter(
-         stream_position_range_inclusive=stream_position_range_inclusive,
-         partition_filter=partition_filter,
-         **deltacat_storage_kwargs,
-     )
-
-     logger.info(
-         f"Total qualified deltas={len(qualified_deltas)} "
-         f"from {len(partition_filter)} partitions."
-     )
-
-     merge_on_read_params = MergeOnReadParams.of(
-         {
-             "deltas": qualified_deltas,
-             "deltacat_storage": STORAGE,
-             "deltacat_storage_kwargs": deltacat_storage_kwargs,
-             "reader_kwargs": reader_kwargs,
-         }
-     )
-
-     return MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE[distributed_dataset_type.value](
-         params=merge_on_read_params, **kwargs
-     )
-
-
- def alter_table(
-     table: str,
-     namespace: Optional[str] = None,
-     lifecycle_state: Optional[LifecycleState] = None,
-     schema_updates: Optional[Dict[str, Any]] = None,
-     partition_updates: Optional[Dict[str, Any]] = None,
-     primary_keys: Optional[Set[str]] = None,
-     sort_keys: Optional[List[SortKey]] = None,
-     description: Optional[str] = None,
-     properties: Optional[Dict[str, str]] = None,
-     *args,
-     **kwargs,
- ) -> None:
-     """Alter table definition."""
-     raise NotImplementedError("alter_table not implemented")
-
-
- def create_table(
-     table: str,
-     namespace: Optional[str] = None,
-     lifecycle_state: Optional[LifecycleState] = None,
-     schema: Optional[Union[pa.Schema, str, bytes]] = None,
-     schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
-     partition_keys: Optional[List[Dict[str, Any]]] = None,
-     primary_keys: Optional[Set[str]] = None,
-     sort_keys: Optional[List[SortKey]] = None,
-     description: Optional[str] = None,
-     properties: Optional[Dict[str, str]] = None,
-     permissions: Optional[Dict[str, Any]] = None,
-     content_types: Optional[List[ContentType]] = None,
-     replace_existing_table: bool = False,
-     *args,
-     **kwargs,
- ) -> TableDefinition:
-     """Create an empty table. Raises an error if the table already exists and
-     `replace_existing_table` is False."""
-     raise NotImplementedError("create_table not implemented")
-
-
- def drop_table(
-     table: str, namespace: Optional[str] = None, purge: bool = False, *args, **kwargs
- ) -> None:
-     """Drop a table from the catalog and optionally purge it. Raises an error
-     if the table does not exist."""
-     raise NotImplementedError("drop_table not implemented")
-
-
- def refresh_table(table: str, namespace: Optional[str] = None, *args, **kwargs) -> None:
-     """Refresh metadata cached on the Ray cluster for the given table."""
-     raise NotImplementedError("refresh_table not implemented")
-
-
- def list_tables(
-     namespace: Optional[str] = None, *args, **kwargs
- ) -> ListResult[TableDefinition]:
-     """List a page of table definitions. Raises an error if the given namespace
-     does not exist."""
-     raise NotImplementedError("list_tables not implemented")
-
-
- def get_table(
-     table: str, namespace: Optional[str] = None, *args, **kwargs
- ) -> Optional[TableDefinition]:
-     """Get table definition metadata. Returns None if the given table does not
-     exist."""
-     raise NotImplementedError("get_table not implemented")
-
-
- def truncate_table(
-     table: str, namespace: Optional[str] = None, *args, **kwargs
- ) -> None:
-     """Truncate table data. Raises an error if the table does not exist."""
-     raise NotImplementedError("truncate_table not implemented")
-
-
- def rename_table(
-     table: str, new_name: str, namespace: Optional[str] = None, *args, **kwargs
- ) -> None:
-     """Rename a table."""
-     raise NotImplementedError("rename_table not implemented")
-
-
- def table_exists(table: str, namespace: Optional[str] = None, *args, **kwargs) -> bool:
-     """Returns True if the given table exists, False if not."""
-     raise NotImplementedError("table_exists not implemented")
-
-
- # namespace functions
- def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
-     """List a page of table namespaces."""
-     raise NotImplementedError("list_namespaces not implemented")
-
-
- def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
-     """Gets table namespace metadata for the specified table namespace. Returns
-     None if the given namespace does not exist."""
-     raise NotImplementedError("get_namespace not implemented")
-
-
- def namespace_exists(namespace: str, *args, **kwargs) -> bool:
-     """Returns True if the given table namespace exists, False if not."""
-     raise NotImplementedError("namespace_exists not implemented")
-
-
- def create_namespace(
-     namespace: str, permissions: Dict[str, Any], *args, **kwargs
- ) -> Namespace:
-     """Creates a table namespace with the given name and permissions. Returns
-     the created namespace. Raises an error if the namespace already exists."""
-     raise NotImplementedError("create_namespace not implemented")
-
-
- def alter_namespace(
-     namespace: str,
-     permissions: Optional[Dict[str, Any]] = None,
-     new_namespace: Optional[str] = None,
-     *args,
-     **kwargs,
- ) -> None:
-     """Alter table namespace definition."""
-     raise NotImplementedError("alter_namespace not implemented")
-
-
- def drop_namespace(namespace: str, purge: bool = False, *args, **kwargs) -> None:
-     """Drop the given namespace and all of its tables from the catalog,
-     optionally purging them."""
-     raise NotImplementedError("drop_namespace not implemented")
-
-
- def default_namespace() -> str:
-     """Returns the default namespace for the catalog."""
-     raise NotImplementedError("default_namespace not implemented")
-
-
- # catalog functions
- def initialize(ds: deltacat_storage, *args, **kwargs) -> None:
-     """Initializes the data catalog with the given arguments."""
-     global STORAGE
-     STORAGE = ds
-
-
- def _validate_read_table_args(
-     namespace: Optional[str] = None,
-     table_type: Optional[TableType] = None,
-     distributed_dataset_type: Optional[DistributedDatasetType] = None,
-     merge_on_read: Optional[bool] = None,
- ):
-     if STORAGE is None:
-         raise ValueError(
-             "Catalog not initialized. Did you miss calling "
-             "initialize(ds=<deltacat_storage>)?"
-         )
-
-     if merge_on_read:
-         raise ValueError("Merge on read not supported currently.")
-
-     if table_type is not TableType.PYARROW:
-         raise ValueError("Only PYARROW table type is supported as of now")
-
-     if distributed_dataset_type is not DistributedDatasetType.DAFT:
-         raise ValueError("Only DAFT dataset type is supported as of now")
-
-     if namespace is None:
-         raise ValueError(
-             "namespace must be passed to uniquely identify a table in the catalog."
-         )
-
-
- def _get_latest_or_given_table_version(
-     namespace: str,
-     table_name: str,
-     table_version: Optional[str] = None,
-     *args,
-     **kwargs,
- ) -> TableVersion:
-     table_version_obj = None
-     if table_version is None:
-         table_version_obj = STORAGE.get_latest_table_version(
-             namespace=namespace, table_name=table_name, *args, **kwargs
-         )
-         table_version = table_version_obj.table_version
-     else:
-         table_version_obj = STORAGE.get_table_version(
-             namespace=namespace,
-             table_name=table_name,
-             table_version=table_version,
-             *args,
-             **kwargs,
-         )
-
-     return table_version_obj
-
-
- def _get_deltas_from_partition_filter(
-     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
-     stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
-     *args,
-     **kwargs,
- ):
-
-     result_deltas = []
-     start_stream_position, end_stream_position = stream_position_range_inclusive or (
-         None,
-         None,
-     )
-     for partition_like in partition_filter:
-         deltas = STORAGE.list_partition_deltas(
-             partition_like=partition_like,
-             ascending_order=True,
-             include_manifest=True,
-             start_stream_position=start_stream_position,
-             last_stream_position=end_stream_position,
-             *args,
-             **kwargs,
-         ).all_items()
-
-         for delta in deltas:
-             if (
-                 start_stream_position is None
-                 or delta.stream_position >= start_stream_position
-             ) and (
-                 end_stream_position is None
-                 or delta.stream_position <= end_stream_position
-             ):
-                 if delta.type == DeltaType.DELETE:
-                     raise ValueError("DELETE type deltas are not supported")
-                 result_deltas.append(delta)
-
-     return result_deltas
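The block above removes the 1.x module-level default catalog, whose role is taken over in 2.0 by the new deltacat/catalog/main/impl.py and the reworked catalog interface listed in the file table. For context, a minimal usage sketch of the removed module follows, derived only from the deleted code itself; the storage backend module, table name, namespace, and empty storage kwargs are illustrative assumptions.

import deltacat.tests.local_deltacat_storage as ds  # any module implementing deltacat.storage.interface
from deltacat.catalog import default_catalog_impl as catalog  # removed in 2.0
from deltacat.types.media import TableType, DistributedDatasetType

# Bind the module-level STORAGE global before reading.
catalog.initialize(ds=ds)

# read_table() accepted only PYARROW table types and DAFT distributed datasets,
# and required an explicit namespace (see _validate_read_table_args above).
daft_df = catalog.read_table(
    table="my_table",          # illustrative table name
    namespace="my_namespace",  # illustrative namespace
    table_type=TableType.PYARROW,
    distributed_dataset_type=DistributedDatasetType.DAFT,
    deltacat_storage_kwargs={},  # backend-specific options, if any
)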
deltacat/io/dataset.py DELETED
@@ -1,73 +0,0 @@
- # Allow classes to use self-referencing Type hints in Python 3.7.
- from __future__ import annotations
-
- from typing import Any, Callable, Dict, Optional, TypeVar, Union, cast
-
- import pyarrow as pa
- import s3fs
- from ray.data import Dataset
-
- T = TypeVar("T")
-
-
- class DeltacatDataset(Dataset[T]):
-     @staticmethod
-     def from_dataset(dataset: Dataset[T]) -> DeltacatDataset[T]:
-         # cast to DeltacatDataset in-place since it only adds new methods
-         dataset.__class__ = DeltacatDataset
-         return cast(DeltacatDataset[T], dataset)
-
-     def write_redshift(
-         self,
-         path: str,
-         *,
-         filesystem: Optional[Union[pa.fs.FileSystem, s3fs.S3FileSystem]] = None,
-         try_create_dir: bool = True,
-         arrow_open_stream_args: Optional[Dict[str, Any]] = None,
-         arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
-         **arrow_parquet_args,
-     ) -> None:
-         """Writes the dataset to Parquet files and commits a Redshift manifest
-         back to S3 indexing the files written. The output can be loaded into
-         Redshift by providing it to the Redshift COPY command, or via AWS Data
-         Wrangler's `wr.redshift.copy_from_files()` API.
-
-         This is only supported for datasets convertible to Arrow records.
-         To control the number of files, use ``.repartition()``.
-
-         Unless a custom block path provider is given, the format of the output
-         files will be {uuid}_{block_idx}.parquet, where ``uuid`` is an unique
-         id for the dataset.
-
-         The Redshift manifest will be written to ``f"{path}/manifest``
-
-         Examples:
-             >>> ds.write_redshift("s3://bucket/path")
-
-         Time complexity: O(dataset size / parallelism)
-
-         Args:
-             path: The path to the destination root directory where Parquet
-                 files and the Redshift manifest will be written to.
-             filesystem: The filesystem implementation to write to. This should
-                 be either PyArrow's S3FileSystem or s3fs.
-             try_create_dir: Try to create all directories in destination path
-                 if True. Does nothing if all directories already exist.
-             arrow_open_stream_args: kwargs passed to
-                 pyarrow.fs.FileSystem.open_output_stream
-             filename_provider: FilenameProvider implementation
-                 to write each dataset block to a custom output path.
-             arrow_parquet_args_fn: Callable that returns a dictionary of write
-                 arguments to use when writing each block to a file. Overrides
-                 any duplicate keys from arrow_parquet_args. This should be used
-                 instead of arrow_parquet_args if any of your write arguments
-                 cannot be pickled, or if you'd like to lazily resolve the write
-                 arguments for each dataset block.
-             arrow_parquet_args: Options to pass to
-                 pyarrow.parquet.write_table(), which is used to write out each
-                 block to a file.
-         """
-         raise NotImplementedError(
-             "Writing to Redshift is not yet supported. "
-             "Please use DeltacatDataset.write_parquet() instead."
-         )
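DeltacatDataset above was a thin in-place cast over a Ray Dataset, and its only addition, write_redshift(), always raised NotImplementedError. A short sketch of how the wrapper was applied; ray.data.range() and the output path are illustrative assumptions.

import ray
from deltacat.io.dataset import DeltacatDataset  # removed in 2.0

ds = ray.data.range(100)                  # any Ray Dataset convertible to Arrow records
dc_ds = DeltacatDataset.from_dataset(ds)  # same object; __class__ is swapped in place
dc_ds.write_parquet("/tmp/deltacat_out")  # the documented fallback for write_redshift()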
deltacat/io/read_api.py DELETED
@@ -1,143 +0,0 @@
- from typing import Any, Callable, Dict, List, Optional, Union
-
- import pyarrow as pa
- import s3fs
- from ray.data import read_datasource
- from ray.data._internal.arrow_block import ArrowRow
-
- from deltacat import ContentType
- from deltacat.io.aws.redshift.redshift_datasource import (
-     HivePartitionParser,
-     RedshiftDatasource,
-     RedshiftUnloadTextArgs,
-     S3PathType,
- )
- from deltacat.io.dataset import DeltacatDataset
- from deltacat.utils.common import ReadKwargsProvider
-
-
- def read_redshift(
-     paths: Union[str, List[str]],
-     *,
-     path_type: S3PathType = S3PathType.MANIFEST,
-     filesystem: Optional[Union[pa.fs.S3FileSystem, s3fs.S3FileSystem]] = None,
-     columns: Optional[List[str]] = None,
-     schema: Optional[pa.Schema] = None,
-     unload_text_args: RedshiftUnloadTextArgs = RedshiftUnloadTextArgs(),
-     partitioning: HivePartitionParser = None,
-     content_type_provider: Callable[[str], ContentType] = lambda p: ContentType.PARQUET
-     if p.endswith(".parquet")
-     else ContentType.CSV,
-     parallelism: int = 200,
-     ray_remote_args: Dict[str, Any] = None,
-     arrow_open_stream_args: Optional[Dict[str, Any]] = None,
-     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     **kwargs,
- ) -> DeltacatDataset[ArrowRow]:
-     """Reads Redshift UNLOAD results from either S3 Parquet or delimited text
-     files into a Ray Dataset.
-
-     Examples:
-         >>> # Read all files contained in a Redshift Manifest:
-         >>> import deltacat as dc
-         >>> dc.io.read_redshift("/bucket/dir/manifest")
-
-         >>> # Read all files matching the given key prefix. If this prefix
-         >>> # refers to multiple files, like s3://bucket/data.parquet,
-         >>> # s3://bucket/data.1.csv, etc. then all will be read. The dataset
-         >>> # schema will be inferred from the first parquet file and used for
-         >>> # explicit type conversion of all CSV files:
-         >>> dc.io.read_redshift(
-         >>>     "s3://bucket/data.txt",
-         >>>     path_type=S3PathType.PREFIX)
-
-         >>> # Read all files matching the given key prefix. If this prefix
-         >>> # refers to multiple files or folders, like s3://bucket/dir/,
-         >>> # s3://bucket/dir1/, s3://bucket/dir.txt, s3://bucket/dir.txt.1,
-         >>> # then all files and subfolder contents will be read.
-         >>> dc.io.read_redshift(
-         >>>     "/bucket/dir",
-         >>>     path_type=S3PathType.PREFIX)
-
-         >>> # Read multiple files and folders:
-         >>> dc.io.read_redshift(
-         >>>     ["/bucket/file1", "/bucket/folder1/"],
-         >>>     path_type=S3PathType.FILES_AND_FOLDERS)
-
-         >>> # Read multiple Parquet and CSV files. The dataset schema will be
-         >>> # inferred from the first parquet file and used for explicit type
-         >>> # conversion of all CSV files:
-         >>> dc.io.read_redshift(
-         >>>     ["/bucket/file.parquet", "/bucket/file.csv"],
-         >>>     path_type=S3PathType.FILES_AND_FOLDERS)
-
-     Args:
-         paths: Paths to S3 files and folders to read. If `path_type` is
-             `MANIFEST` then this must be an S3 Redshift Manifest JSON file. If
-             `path_type` is `PREFIX` then this must be a valid S3 key prefix.
-             All files matching the key prefix, including files in matching
-             subdirectories, will be read. Unless custom
-             `content_type_extensions` are specified, file content types will be
-             inferred by file extension with ".parquet" used for Parquet files,
-             and all others assumed to be delimited text (e.g. CSV). It's
-             recommended to specify the path to a manifest unloaded with the
-             VERBOSE option whenever possible to improve the correctness and
-             performance of Dataset reads, compute operations, and writes.
-             `FILES_AND_FOLDERS` is not recommended when reading thousands of
-             files due to its relatively high-latency.
-         path_type: Determines how the `paths` parameter is interpreted.
-         filesystem: The filesystem implementation to read from. This should be
-             either PyArrow's S3FileSystem or s3fs.
-         columns: A list of column names to read. Reads all columns if None or
-             empty.
-         schema: PyArrow schema used to determine delimited text column
-             names and types. If not specified and both Parquet and delimited
-             text files are read as input, then the first Parquet file schema
-             discovered is used instead.
-         unload_text_args: Arguments used when running Redshift `UNLOAD` to
-             text file formats (e.g. CSV). These arguments ensure that all input
-             text files will be correctly parsed. If not specified, then all
-             text files read are assumed to use Redshift UNLOAD's default
-             pipe-delimited text format.
-         partition_base_dir: Base directory to start searching for partitions
-             (exclusive). File paths outside of this directory will not be parsed
-             for partitions and automatically added to the dataset without passing
-             through any partition filter. Specify `None` or an empty string to
-             search for partitions in all file path directories.
-         partition_filter_fn: Callback used to filter `PARTITION` columns. Receives a
-             dictionary mapping partition keys to values as input, returns `True` to
-             read a partition, and `False` to skip it. Each partition key and value
-             is a string parsed directly from an S3 key using hive-style
-             partition directory names of the form "{key}={value}". For example:
-             ``lambda x:
-             True if x["month"] == "January" and x["year"] == "2022" else False``
-         content_type_provider: Takes a file path as input and returns the file
-             content type as output.
-         parallelism: The requested parallelism of the read. Parallelism may be
-             limited by the number of files of the dataset.
-         ray_remote_args: kwargs passed to `ray.remote` in the read tasks.
-         arrow_open_stream_args: kwargs passed to to
-             `pa.fs.open_input_stream()`.
-         pa_read_func_kwargs_provider: Callback that takes a `ContentType` value
-             string as input, and provides read options to pass to either
-             `pa.csv.open_csv()` or `pa.parquet.read_table()` as output.
-     Returns:
-         Dataset holding Arrow records read from the specified paths.
-     """
-     dataset = read_datasource(
-         RedshiftDatasource(),
-         parallelism=parallelism,
-         paths=paths,
-         content_type_provider=content_type_provider,
-         path_type=path_type,
-         filesystem=filesystem,
-         columns=columns,
-         schema=schema,
-         unload_args=unload_text_args,
-         partitioning=partitioning,
-         ray_remote_args=ray_remote_args,
-         open_stream_args=arrow_open_stream_args,
-         read_kwargs_provider=pa_read_func_kwargs_provider,
-         **kwargs,
-     )
-     return DeltacatDataset.from_dataset(dataset)
deltacat/storage/model/delete_parameters.py DELETED
@@ -1,40 +0,0 @@
- # Allow classes to use self-referencing Type hints in Python 3.7.
- from __future__ import annotations
-
- from typing import List, Optional
-
-
- class DeleteParameters(dict):
-     """
-     Contains all parameters required to support DELETEs
-     equality_column_names: List of column names that would be used to determine row equality for equality deletes. Relevant only to equality deletes
-     """
-
-     @staticmethod
-     def of(
-         equality_column_names: Optional[List[str]] = None,
-     ) -> DeleteParameters:
-         delete_parameters = DeleteParameters()
-         if equality_column_names is not None:
-             delete_parameters["equality_column_names"] = equality_column_names
-         return delete_parameters
-
-     @property
-     def equality_column_names(self) -> Optional[List[str]]:
-         return self.get("equality_column_names")
-
-     @staticmethod
-     def merge_delete_parameters(
-         delete_parameters: List[DeleteParameters],
-     ) -> Optional[DeleteParameters]:
-         if len(delete_parameters) < 2:
-             return delete_parameters
-         equality_column_names = delete_parameters[0].equality_column_names
-         assert all(
-             delete_prev.equality_column_names == delete_curr.equality_column_names
-             for delete_prev, delete_curr in zip(
-                 delete_parameters, delete_parameters[1:]
-             )
-         ), "We cannot merge two delete parameters if their equality column names are different."
-         merge_delete_parameters = DeleteParameters.of(equality_column_names)
-         return merge_delete_parameters
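DeleteParameters above was a plain dict subclass keyed on "equality_column_names"; note that merge_delete_parameters() returned its input list unchanged when given fewer than two parameter sets. A brief sketch of the removed API, with the column name as an illustrative assumption.

from deltacat.storage.model.delete_parameters import DeleteParameters  # removed in 2.0

params = DeleteParameters.of(equality_column_names=["order_id"])  # illustrative column name
assert params.equality_column_names == ["order_id"]

# Merging asserts that every input shares the same equality column names.
merged = DeleteParameters.merge_delete_parameters(
    [params, DeleteParameters.of(["order_id"])]
)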
deltacat/storage/model/partition_spec.py DELETED
@@ -1,71 +0,0 @@
- from __future__ import annotations
- from typing import List, Optional, Any
- from deltacat.storage.model.transform import Transform
-
- """
- An ordered list of partition values determining the values of
- ordered transforms specified in the partition spec.
- """
- PartitionValues = List[Any]
-
-
- class PartitionFilter(dict):
-     """
-     This class represents a filter for partitions.
-     It is used to filter partitions based on certain criteria.
-     """
-
-     @staticmethod
-     def of(
-         partition_values: Optional[PartitionValues] = None,
-     ) -> PartitionFilter:
-         """
-         Creates a new PartitionFilter instance with the specified partition key and value.
-         """
-         partition_filter = PartitionFilter()
-         partition_filter["partitionValues"] = partition_values
-         return partition_filter
-
-     @property
-     def partition_values(self) -> Optional[PartitionValues]:
-         return self.get("partitionValues")
-
-
- class PartitionSpec(dict):
-     """
-     This class determines how the underlying entities in the
-     hierarchy are partitioned. Stream partitions deltas and
-     delta partitions files.
-     """
-
-     @staticmethod
-     def of(ordered_transforms: List[Transform] = None) -> PartitionSpec:
-         partition_spec = PartitionSpec()
-         partition_spec.ordered_transforms = ordered_transforms
-         return partition_spec
-
-     @property
-     def ordered_transforms(self) -> List[Transform]:
-         return self.get("orderedTransforms")
-
-     @ordered_transforms.setter
-     def ordered_transforms(self, value: List[Transform]) -> None:
-         self["orderedTransforms"] = value
-
-
- class StreamPartitionSpec(PartitionSpec):
-     """
-     A class representing a stream partition specification.
-     A stream partitions deltas into multiple different Partition
-     """
-
-     pass
-
-
- class DeltaPartitionSpec(PartitionSpec):
-     """
-     A class representing delta partition specification.
-     The manifest entries in delta are partitioned based on this spec.
-     """
-
-     pass
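PartitionFilter, PartitionSpec, and their Stream/Delta subclasses were likewise thin dict wrappers, superseded in 2.0 by the expanded deltacat/storage/model/partition.py and transform.py listed in the file table. A minimal sketch of the removed wrappers; the partition values and the empty transform list are illustrative assumptions.

from deltacat.storage.model.partition_spec import (  # removed in 2.0
    PartitionFilter,
    StreamPartitionSpec,
)

spec = StreamPartitionSpec.of(ordered_transforms=[])  # a real spec would list Transform instances
assert spec.ordered_transforms == []

filt = PartitionFilter.of(partition_values=["2022", "01"])  # illustrative values
assert filt.partition_values == ["2022", "01"]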