deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,368 @@
1
+ import logging
2
+
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ from daft import DataFrame
6
+
7
+ from deltacat import logs
8
+ from deltacat.catalog.model.table_definition import TableDefinition
9
+ from deltacat.exceptions import TableAlreadyExistsError
10
+ from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
11
+ from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
12
+ from deltacat.storage.model.partition import PartitionScheme
13
+ from deltacat.storage.iceberg.impl import _get_native_catalog
14
+ from deltacat.storage.model.sort_key import SortScheme
15
+ from deltacat.storage.model.list_result import ListResult
16
+ from deltacat.storage.model.namespace import Namespace, NamespaceProperties
17
+ from deltacat.storage.model.schema import Schema
18
+ from deltacat.storage.model.table import TableProperties
19
+ from deltacat.storage.model.types import (
20
+ DistributedDataset,
21
+ LifecycleState,
22
+ LocalDataset,
23
+ LocalTable,
24
+ StreamFormat,
25
+ )
26
+ from deltacat.storage.iceberg import impl as IcebergStorage
27
+ from deltacat.types.media import ContentType
28
+ from deltacat.types.tables import TableWriteMode
29
+ from deltacat.constants import DEFAULT_NAMESPACE
30
+ from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
31
+
32
+ from pyiceberg.catalog import Catalog, load_catalog
33
+ from pyiceberg.transforms import BucketTransform
34
+
35
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
36
+
37
+
38
+ # catalog functions
39
+ def initialize(*args, config: IcebergCatalogConfig, **kwargs) -> Catalog:
40
+ """
41
+ Initializes an Iceberg catalog with the given config.
42
+
43
+ NOTE: because PyIceberg catalogs are not pickle-able, we cannot accept them as catalog initialization parameters,
44
+ since catalog initialization parameters are passed to Ray actors (see: :class:`deltacat.catalog.Catalogs`)
45
+
46
+ Args:
47
+ **kwargs: Arguments to be passed to PyIceberg Catalog.
48
+ If 'catalog' is provided as a PyIceberg Catalog instance, it will be used directly.
49
+ Otherwise, the arguments will be used to load a catalog via pyiceberg.catalog.load_catalog.
50
+
51
+ Returns:
52
+ IcebergCatalogConfig: Configuration wrapper containing the PyIceberg Catalog.
53
+ """
54
+
55
+ # If no catalog is provided, try to load one with PyIceberg
56
+
57
+ load_catalog_args = {"type": config.type.value, **config.properties, **kwargs}
58
+ catalog = load_catalog(**load_catalog_args)
59
+ return catalog
60
+
61
+
62
+ # table functions
63
def write_to_table(
    data: Union[LocalTable, LocalDataset, DistributedDataset],
    table: str,
    *args,
    namespace: Optional[str] = None,
    mode: TableWriteMode = TableWriteMode.AUTO,
    content_type: ContentType = ContentType.PARQUET,
    **kwargs,
) -> None:
    """Write local or distributed data to a table. Raises an error if the
    table does not exist and the table write mode is not CREATE or AUTO.

    When creating a table, all `create_table` parameters may be optionally
    specified as additional keyword arguments. When appending to, or replacing,
    an existing table, all `alter_table` parameters may be optionally specified
    as additional keyword arguments.

    Currently only Daft DataFrames written as Parquet to Iceberg tables with a
    single bucket-transform partition field are supported; all other inputs
    raise NotImplementedError.
    """

    # TODO (pdames): derive schema automatically from data if not
    # explicitly specified in kwargs, and table needs to be created
    # kwargs["schema"] = kwargs["schema"] or derived_schema
    # Only CREATE mode should fail on an existing table; AUTO reuses it.
    kwargs["fail_if_exists"] = mode == TableWriteMode.CREATE
    table_definition = (
        create_table(
            table,
            namespace=namespace,
            *args,
            **kwargs,
        )
        if (mode == TableWriteMode.AUTO or mode == TableWriteMode.CREATE)
        else get_table(table, namespace=namespace, *args, **kwargs)
    )

    # TODO(pdames): Use native DeltaCAT models to map from Iceberg partitioning to Daft partitioning...
    # this lets us re-use a single model-mapper instead of different per-catalog model mappers
    # Map the DeltaCAT schema/partition scheme back to their Iceberg forms.
    schema = SchemaMapper.unmap(table_definition.table_version.schema)
    partition_spec = PartitionSchemeMapper.unmap(
        table_definition.table_version.partition_scheme,
        schema,
    )
    if isinstance(data, DataFrame):
        # NOTE(review): an unpartitioned table (no partition fields) falls
        # through this loop and silently writes nothing — confirm intended.
        for partition_field in partition_spec.fields:
            if isinstance(partition_field.transform, BucketTransform):
                ice_bucket_transform: BucketTransform = partition_field.transform
                # TODO(pdames): Get a type-checked Iceberg Table automatically via unmap()
                table_location = table_definition.table.native_object.location()
                path = kwargs.get("path") or f"{table_location}/data"
                if content_type == ContentType.PARQUET:
                    # Resolve the source column the bucket transform hashes on.
                    source_field = schema.find_field(
                        name_or_id=partition_field.source_id
                    )
                    # Partition the Daft write by Iceberg's bucket function so
                    # file layout matches the table's partition spec.
                    out_df = data.write_parquet(
                        path,
                        partition_cols=[
                            data[source_field.name].partitioning.iceberg_bucket(
                                ice_bucket_transform.num_buckets
                            ),
                        ],
                    )
                    # TODO(pdames): only append s3:// to output file paths when writing to S3!
                    out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
                    from deltacat.catalog.iceberg import overrides

                    # Commit the written files to the Iceberg table as an
                    # APPEND snapshot.
                    overrides.append(
                        table_definition.table.native_object,
                        out_file_paths,
                    )
                else:
                    raise NotImplementedError(
                        f"iceberg writes not implemented for content type: {content_type}"
                    )
            else:
                raise NotImplementedError(
                    f"daft partitioning not implemented for iceberg transform: {partition_field.transform}"
                )
    else:
        raise NotImplementedError(
            f"iceberg write-back not implemented for data type: {type(data)}"
        )
141
+
142
+
143
def read_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> DistributedDataset:
    """Load the given table as a distributed dataset.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("read_table not implemented")
148
+
149
+
150
def alter_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema_updates: Optional[Dict[str, Any]] = None,
    partition_updates: Optional[Dict[str, Any]] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    **kwargs,
) -> None:
    """Apply schema, partition, sort-key, or property updates to a table.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("alter_table not implemented")
164
+
165
+
166
def create_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    version: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Schema] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    namespace_properties: Optional[NamespaceProperties] = None,
    content_types: Optional[List[ContentType]] = None,
    fail_if_exists: bool = True,
    **kwargs,
) -> TableDefinition:
    """Create an empty table in the catalog.

    If the table already exists, either raises TableAlreadyExistsError
    (when `fail_if_exists` is True) or returns the existing table.
    The target namespace is created on demand when missing.
    """
    namespace = namespace or default_namespace()

    # Reuse or reject an existing table before touching storage.
    existing_table = get_table(
        name,
        *args,
        namespace=namespace,
        **kwargs,
    )
    if existing_table:
        if not fail_if_exists:
            logger.debug(f"Returning existing table: `{namespace}.{name}`")
            return existing_table
        raise TableAlreadyExistsError(
            f"Table `{namespace}.{name}` already exists. "
            f"To suppress this error, rerun `create_table()` with "
            f"`fail_if_exists=False`."
        )

    # Lazily create the namespace so callers need not pre-create it.
    if not IcebergStorage.namespace_exists(namespace, **kwargs):
        logger.debug(f"Namespace {namespace} doesn't exist. Creating it...")
        IcebergStorage.create_namespace(
            namespace,
            properties=namespace_properties or {},
            **kwargs,
        )

    IcebergStorage.create_table_version(
        namespace=namespace,
        table_name=name,
        table_version=version,
        schema=schema,
        partition_scheme=partition_scheme,
        sort_keys=sort_keys,
        table_properties=table_properties,
        **kwargs,
    )

    # Fetch the freshly created table so the caller gets a full definition.
    return get_table(
        name,
        *args,
        namespace=namespace,
        **kwargs,
    )
228
+
229
+
230
def drop_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    purge: bool = False,
    **kwargs,
) -> None:
    """Remove a table from the catalog, optionally purging its data.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("drop_table not implemented")
241
+
242
+
243
def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs) -> None:
    """Invalidate and reload table metadata cached on the Ray cluster.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("refresh_table not implemented")
246
+
247
+
248
def list_tables(
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> ListResult[TableDefinition]:
    """Return one page of table definitions within a namespace.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("list_tables not implemented")
254
+
255
+
256
def get_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    **kwargs,
) -> Optional[TableDefinition]:
    """Get table definition metadata.

    Args:
        name: Name of the table to retrieve
        namespace: Optional namespace of the table. Uses default namespace if not specified.
        table_version: Optional specific version of the table to retrieve.
            If not specified, the latest version is used.
        stream_format: Optional stream format to retrieve

    Returns:
        Deltacat TableDefinition if the table exists, None otherwise.
    """
    namespace = namespace or default_namespace()

    # Every lookup below short-circuits to None when the table is absent.
    stream = IcebergStorage.get_stream(namespace=namespace, table_name=name, **kwargs)
    if not stream:
        return None
    table_obj = IcebergStorage.get_table(namespace=namespace, table_name=name, **kwargs)
    if not table_obj:
        return None
    # Resolve either the explicitly requested version or the latest one.
    table_version_obj = (
        IcebergStorage.get_table_version(
            namespace=namespace, table_name=name, table_version=table_version, **kwargs
        )
        if table_version
        else IcebergStorage.get_latest_table_version(
            namespace=namespace, table_name=name, **kwargs
        )
    )
    if not table_version_obj:
        return None

    scan_planner = IcebergScanPlanner(_get_native_catalog(**kwargs))
    return TableDefinition.of(
        table=table_obj,
        table_version=table_version_obj,
        stream=stream,
        native_object=table_obj.native_object,
        scan_planner=scan_planner,
    )
302
+
303
+
304
def truncate_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Delete all data from a table while keeping its definition.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("truncate_table not implemented")
309
+
310
+
311
def rename_table(
    table: str,
    new_name: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Give an existing table a new name within its namespace.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("rename_table not implemented")
316
+
317
+
318
def table_exists(table: str, *args, namespace: Optional[str] = None, **kwargs) -> bool:
    """Report whether the given table exists in the given (or default) namespace."""
    resolved_namespace = namespace or default_namespace()
    return IcebergStorage.table_exists(
        namespace=resolved_namespace, table_name=table, **kwargs
    )
322
+
323
+
324
+ # namespace functions
325
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """Return one page of namespaces known to the underlying Iceberg catalog."""
    # Positional args are accepted for interface parity but unused here.
    return IcebergStorage.list_namespaces(**kwargs)
328
+
329
+
330
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Fetch metadata for the named namespace, or None when it does not exist."""
    # Delegate directly to the Iceberg storage layer.
    return IcebergStorage.get_namespace(namespace, **kwargs)
334
+
335
+
336
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Report whether the named table namespace exists in the catalog."""
    # Delegate directly to the Iceberg storage layer.
    return IcebergStorage.namespace_exists(namespace, **kwargs)
339
+
340
+
341
def create_namespace(
    namespace: str,
    *args,
    properties: Optional[NamespaceProperties] = None,
    **kwargs,
) -> Namespace:
    """Create a namespace with the given name and properties and return it.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("create_namespace not implemented")
347
+
348
+
349
def alter_namespace(
    namespace: str,
    *args,
    properties: Optional[NamespaceProperties] = None,
    new_namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Update a namespace's properties and/or rename it.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("alter_namespace not implemented")
358
+
359
+
360
def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None:
    """Remove a namespace and every table in it, optionally purging data.

    Not yet supported by the Iceberg catalog implementation.
    """
    raise NotImplementedError("drop_namespace not implemented")
364
+
365
+
366
def default_namespace(*args, **kwargs) -> str:
    """Return the namespace used when a caller does not supply one."""
    # All catalog functions fall back to this shared DeltaCAT constant.
    return DEFAULT_NAMESPACE
@@ -0,0 +1,74 @@
1
+ import pyarrow.parquet as pq
2
+
3
+ from typing import Iterator, List
4
+
5
+ from pyarrow.fs import FileSystem
6
+
7
+ from pyiceberg.io.pyarrow import (
8
+ fill_parquet_file_metadata,
9
+ compute_statistics_plan,
10
+ parquet_path_to_id_mapping,
11
+ )
12
+ from pyiceberg.table import Table, _MergingSnapshotProducer
13
+ from pyiceberg.table.snapshots import Operation
14
+ from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
15
+ from pyiceberg.types import StructType, NestedField, IntegerType
16
+ from pyiceberg.typedef import Record
17
+
18
+
19
def append(table: Table, paths: List[str]) -> None:
    """Commit an APPEND snapshot adding the files at *paths* to *table*.

    Builds DataFile entries for each path via write_file(), then merges
    them into the table through a single snapshot commit.
    """
    # if len(table.sort_order().fields) > 0:
    #     raise ValueError("Cannot write to tables with a sort-order")

    new_data_files = write_file(table, paths)
    snapshot_producer = _MergingSnapshotProducer(operation=Operation.APPEND, table=table)
    for new_file in new_data_files:
        snapshot_producer.append_data_file(new_file)
    snapshot_producer.commit()
32
+
33
+
34
def write_file(table: Table, paths: Iterator[str]) -> List[DataFile]:
    """Build Iceberg DataFile entries (with column statistics) for the
    parquet files at the given paths.

    Each path is expected to end in a Hive-style partition directory, e.g.
    `.../<field>=<value>/<file>.parquet`, whose value parses as an int —
    TODO confirm non-integer partition values are never produced upstream.

    Args:
        table: PyIceberg table the files belong to; its first partition
            spec field is used as the partition column.
        paths: File paths/URIs of already-written parquet files.

    Returns:
        A list of DataFile records ready to append to a snapshot.
    """
    data_files = []
    for file_path in paths:
        # Parse the partition value out of the Hive-style directory name.
        partition_dir = file_path.split("/")[-2]
        partition_value = int(partition_dir.split("=")[-1])
        # Resolve a pyarrow filesystem + relative path from the URI.
        fs_tuple = FileSystem.from_uri(file_path)
        fs = fs_tuple[0]
        fs_path = fs_tuple[1]
        with fs.open_input_file(fs_path) as native_file:
            parquet_metadata = pq.read_metadata(native_file)
            # NOTE(review): only a single partition field (spec().fields[0])
            # is supported here — confirm multi-field specs are rejected
            # upstream before this is called.
            data_file = DataFile(
                content=DataFileContent.DATA,
                file_path=file_path,
                file_format=FileFormat.PARQUET,
                partition=Record(
                    **{
                        "struct": StructType(
                            NestedField(
                                0,
                                table.spec().fields[0].name,
                                IntegerType(),
                                required=False,
                            )
                        ),
                        **{table.spec().fields[0].name: partition_value},
                    }
                ),
                file_size_in_bytes=native_file.size(),
                sort_order_id=None,
                spec_id=table.spec().spec_id,
                equality_ids=None,
                key_metadata=None,
            )
            # Populate row counts and per-column stats from parquet metadata.
            fill_parquet_file_metadata(
                data_file=data_file,
                parquet_metadata=parquet_metadata,
                stats_columns=compute_statistics_plan(table.schema(), table.properties),
                parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
            )
            data_files.append(data_file)
    return data_files