deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,106 @@
1
+ import math
2
+ from random import shuffle
3
+ import pytest
4
+ from deltacat.storage.rivulet.dataset import Dataset
5
+ from deltacat.storage.rivulet.schema.datatype import Datatype
6
+ from deltacat.storage.rivulet.reader.query_expression import QueryExpression
7
+ from deltacat.storage.rivulet.schema.schema import Schema
8
+ from deltacat.benchmarking.benchmark_engine import BenchmarkEngine
9
+ from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkReport
10
+ from deltacat.benchmarking.benchmark_suite import BenchmarkSuite
11
+ from deltacat.benchmarking.data.random_row_generator import RandomRowGenerator
12
+ from deltacat.benchmarking.data.row_generator import RowGenerator
13
+ from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
14
+
15
# Module-level pytest marker: applies the `benchmark` mark to every test
# collected from this module.
pytestmark = pytest.mark.benchmark
16
+
17
+
18
@pytest.fixture
def schema():
    """Build the three-field schema shared by the benchmark suites.

    Fields are an int32 ``id``, a string ``source`` and a PNG image ``media``.
    ``"id"`` is passed as the second ``Schema`` argument (presumably the
    key/merge field -- confirm against ``Schema``).
    """
    fields = [
        ("id", Datatype.int32()),
        ("source", Datatype.string()),
        ("media", Datatype.image("png")),
    ]
    return Schema(fields, "id")
28
+
29
+
30
class LoadAndScanSuite(BenchmarkSuite):
    """Load some number of rows and scan.

    The suite loads generated rows into a dataset, scans all of them back,
    fetches every key individually in random order, and finally reads the
    keys back as up to four contiguous key ranges.
    """

    schema_name = "LoadAndScanSuite"

    def __init__(self, dataset: Dataset, schema: Schema, generator, description=None):
        """Register ``schema`` on ``dataset`` and store the row generator.

        Args:
            dataset: Dataset that rows are loaded into and queried from.
            schema: Schema added to ``dataset`` under ``schema_name``.
            generator: Row generator handed to the benchmark engine.
            description: Label for the benchmark run; defaults to
                ``"{dataset} x {generator}"`` when falsy.
        """
        self.suite = "ReadSuite"
        self.dataset: Dataset = dataset
        self.schema = schema
        self.dataset.add_schema(schema, LoadAndScanSuite.schema_name)
        self.generator: RowGenerator = generator
        self.description: str = description or f"{self.dataset} x {self.generator}"

    def run(self) -> BenchmarkRun:
        """Execute all benchmark steps and return the collected run.

        Returns:
            A ``BenchmarkRun`` holding one step per phase: load, full scan,
            per-key lookups and per-range lookups.
        """
        container = BenchmarkEngine(self.dataset)
        run = BenchmarkRun(self.suite, self.description)
        # load a large number of rows
        manifest_uri, step = container.load_and_commit(
            LoadAndScanSuite.schema_name, self.generator, 1000
        )
        run.add(step)
        # do a full scan of all rows (and eagerly load them)
        keys, step = container.scan()
        run.add(step)
        # randomly retrieve all keys one-by-one from the dataset
        random_keys = list(keys)
        shuffle(random_keys)
        step = container.run_queries(
            "load all keys individually",
            manifest_uri,
            [QueryExpression().with_key(k) for k in random_keys],
        )
        run.add(step)
        # split into 4 key ranges and get them individually
        quartiles = self._generate_quartiles(keys)
        expressions = [
            QueryExpression().with_range(start, end) for (start, end) in quartiles
        ]
        step = container.run_queries(
            "load key ranges by quartile", manifest_uri, expressions
        )
        run.add(step)
        return run

    @staticmethod
    def _generate_quartiles(keys):
        """Split ``keys`` into up to four contiguous, inclusive key ranges.

        Args:
            keys: Iterable of mutually comparable keys.

        Returns:
            A list of ``(first_key, last_key)`` tuples covering the sorted
            keys in order. Fewer than four ranges are returned when the keys
            do not divide into four chunks, and an empty list when there are
            no keys.
        """
        sorted_keys = sorted(keys)
        # Measure the sorted list rather than `keys`, which may not be sized.
        size = len(sorted_keys)
        if size == 0:
            # A zero chunk size would make range() raise ValueError.
            return []
        chunk = math.ceil(size / 4)
        # Each range ends one before the next start; the last is clamped.
        return [
            (sorted_keys[start], sorted_keys[min(start + chunk, size) - 1])
            for start in range(0, size, chunk)
        ]
83
+
84
+
85
def test_suite1(schema: Schema, report: BenchmarkReport):
    """Run the load-and-scan suite against two fresh datasets.

    Each run gets its own auto-cleaned temporary directory, a
    ``RandomRowGenerator`` seeded with 123, and a new ``Dataset`` rooted in
    that directory. The resulting run is added to ``report``.
    """
    # (dataset name, run description) for each benchmark run, in order.
    configurations = (
        ("test_suite1_ds1", "SST (rand)"),
        ("test_suite1_ds2", "dupe"),
    )
    for dataset_name, description in configurations:
        with temp_dir_autocleanup() as temp_dir:
            generator = RandomRowGenerator(123, temp_dir)
            dataset = Dataset(dataset_name=dataset_name, metadata_uri=temp_dir)
            suite = LoadAndScanSuite(dataset, schema, generator, description)
            report.add(suite.run())
@@ -0,0 +1,14 @@
1
+ from deltacat.catalog.model.properties import ( # noqa: F401
2
+ CatalogProperties,
3
+ get_catalog_properties,
4
+ )
5
+ from deltacat.catalog.model.catalog import Catalog, Catalogs # noqa: F401
6
+ from deltacat.catalog.main import impl as DeltacatCatalog
7
+
8
# Public API of this package: every name listed here is imported above.
__all__ = [
    "CatalogProperties",
    "get_catalog_properties",
    "Catalog",
    "Catalogs",
    "DeltacatCatalog",
]
@@ -1,53 +1,33 @@
1
- from typing import Any, Dict, List, Optional, Set, Union
1
+ from typing import Any, Dict, List, Optional, Union
2
2
 
3
- import pyarrow as pa
4
- import ray
5
-
6
- from deltacat.catalog.model.catalog import Catalog, all_catalogs
3
+ from deltacat.catalog.model.catalog import get_catalog
7
4
  from deltacat.catalog.model.table_definition import TableDefinition
8
- from deltacat.storage.model.sort_key import SortKey
5
+ from deltacat.storage.model.partition import PartitionScheme
6
+ from deltacat.storage.model.sort_key import SortScheme
9
7
  from deltacat.storage.model.list_result import ListResult
10
- from deltacat.storage.model.namespace import Namespace
8
+ from deltacat.storage.model.namespace import Namespace, NamespaceProperties
9
+ from deltacat.storage.model.schema import Schema
10
+ from deltacat.storage.model.table import TableProperties
11
11
  from deltacat.storage.model.types import (
12
12
  DistributedDataset,
13
13
  LifecycleState,
14
14
  LocalDataset,
15
15
  LocalTable,
16
- SchemaConsistencyType,
16
+ StreamFormat,
17
17
  )
18
18
  from deltacat.types.media import ContentType
19
19
  from deltacat.types.tables import TableWriteMode
20
20
 
21
21
 
22
- def _get_catalog(name: Optional[str] = None) -> Catalog:
23
- if not all_catalogs:
24
- raise ValueError(
25
- "No catalogs available! Call "
26
- "`deltacat.init(catalogs={...})` to register one or more "
27
- "catalogs then retry."
28
- )
29
- catalog = (
30
- ray.get(all_catalogs.get.remote(name))
31
- if name
32
- else ray.get(all_catalogs.default.remote())
33
- )
34
- if not catalog:
35
- available_catalogs = ray.get(all_catalogs.all.remote()).values()
36
- raise ValueError(
37
- f"Catalog '{name}' not found. Available catalogs: " f"{available_catalogs}."
38
- )
39
- return catalog
40
-
41
-
42
22
  # table functions
43
23
  def write_to_table(
44
24
  data: Union[LocalTable, LocalDataset, DistributedDataset],
45
25
  table: str,
26
+ *args,
46
27
  namespace: Optional[str] = None,
47
28
  catalog: Optional[str] = None,
48
29
  mode: TableWriteMode = TableWriteMode.AUTO,
49
30
  content_type: ContentType = ContentType.PARQUET,
50
- *args,
51
31
  **kwargs,
52
32
  ) -> None:
53
33
  """Write local or distributed data to a table. Raises an error if the
@@ -57,228 +37,341 @@ def write_to_table(
57
37
  specified as additional keyword arguments. When appending to, or replacing,
58
38
  an existing table, all `alter_table` parameters may be optionally specified
59
39
  as additional keyword arguments."""
60
- _get_catalog(catalog).impl.write_to_table(
61
- data, table, namespace, mode, content_type, *args, **kwargs
40
+ catalog_obj = get_catalog(catalog)
41
+ catalog_obj.impl.write_to_table(
42
+ data,
43
+ table,
44
+ *args,
45
+ namespace=namespace,
46
+ mode=mode,
47
+ content_type=content_type,
48
+ inner=catalog_obj.inner,
49
+ **kwargs,
62
50
  )
63
51
 
64
52
 
65
53
  def read_table(
66
54
  table: str,
55
+ *args,
67
56
  namespace: Optional[str] = None,
68
57
  catalog: Optional[str] = None,
69
- *args,
70
58
  **kwargs,
71
59
  ) -> DistributedDataset:
72
60
  """Read a table into a distributed dataset."""
73
- return _get_catalog(catalog).impl.read_table(table, namespace, *args, **kwargs)
61
+ catalog_obj = get_catalog(catalog)
62
+ return catalog_obj.impl.read_table(
63
+ table,
64
+ *args,
65
+ namespace=namespace,
66
+ inner=catalog_obj.inner,
67
+ **kwargs,
68
+ )
74
69
 
75
70
 
76
71
  def alter_table(
77
72
  table: str,
73
+ *args,
78
74
  namespace: Optional[str] = None,
79
75
  catalog: Optional[str] = None,
80
76
  lifecycle_state: Optional[LifecycleState] = None,
81
77
  schema_updates: Optional[Dict[str, Any]] = None,
82
78
  partition_updates: Optional[Dict[str, Any]] = None,
83
- primary_keys: Optional[Set[str]] = None,
84
- sort_keys: Optional[List[SortKey]] = None,
79
+ sort_keys: Optional[SortScheme] = None,
85
80
  description: Optional[str] = None,
86
- properties: Optional[Dict[str, str]] = None,
87
- *args,
81
+ properties: Optional[TableProperties] = None,
88
82
  **kwargs,
89
83
  ) -> None:
90
84
  """Alter table definition."""
91
- _get_catalog(catalog).impl.alter_table(
85
+ catalog_obj = get_catalog(catalog)
86
+ catalog_obj.impl.alter_table(
92
87
  table,
93
- namespace,
94
- lifecycle_state,
95
- schema_updates,
96
- partition_updates,
97
- primary_keys,
98
- sort_keys,
99
- description,
100
- properties,
101
88
  *args,
89
+ namespace=namespace,
90
+ lifecycle_state=lifecycle_state,
91
+ schema_updates=schema_updates,
92
+ partition_updates=partition_updates,
93
+ sort_keys=sort_keys,
94
+ description=description,
95
+ properties=properties,
96
+ inner=catalog_obj.inner,
102
97
  **kwargs,
103
98
  )
104
99
 
105
100
 
106
101
  def create_table(
107
- table: str,
102
+ name: str,
103
+ *args,
108
104
  namespace: Optional[str] = None,
109
105
  catalog: Optional[str] = None,
110
- lifecycle_state: Optional[LifecycleState] = None,
111
- schema: Optional[Union[pa.Schema, str, bytes]] = None,
112
- schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
113
- partition_keys: Optional[List[Dict[str, Any]]] = None,
114
- primary_keys: Optional[Set[str]] = None,
115
- sort_keys: Optional[List[SortKey]] = None,
106
+ version: Optional[str] = None,
107
+ lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
108
+ schema: Optional[Schema] = None,
109
+ partition_scheme: Optional[PartitionScheme] = None,
110
+ sort_keys: Optional[SortScheme] = None,
116
111
  description: Optional[str] = None,
117
- properties: Optional[Dict[str, str]] = None,
118
- permissions: Optional[Dict[str, Any]] = None,
112
+ table_properties: Optional[TableProperties] = None,
113
+ namespace_properties: Optional[NamespaceProperties] = None,
119
114
  content_types: Optional[List[ContentType]] = None,
120
- replace_existing_table: bool = False,
121
- *args,
115
+ fail_if_exists: bool = True,
122
116
  **kwargs,
123
117
  ) -> TableDefinition:
124
118
  """Create an empty table. Raises an error if the table already exists and
125
- `replace_existing_table` is False."""
126
- return _get_catalog(catalog).impl.create_table(
127
- table,
128
- namespace,
129
- lifecycle_state,
130
- schema,
131
- schema_consistency,
132
- partition_keys,
133
- primary_keys,
134
- sort_keys,
135
- description,
136
- properties,
137
- permissions,
138
- content_types,
139
- replace_existing_table,
119
+ `fail_if_exists` is True (default behavior)."""
120
+ catalog_obj = get_catalog(catalog)
121
+ return catalog_obj.impl.create_table(
122
+ name,
140
123
  *args,
124
+ namespace=namespace,
125
+ version=version,
126
+ lifecycle_state=lifecycle_state,
127
+ schema=schema,
128
+ partition_scheme=partition_scheme,
129
+ sort_keys=sort_keys,
130
+ description=description,
131
+ table_properties=table_properties,
132
+ namespace_properties=namespace_properties,
133
+ content_types=content_types,
134
+ fail_if_exists=fail_if_exists,
135
+ inner=catalog_obj.inner,
141
136
  **kwargs,
142
137
  )
143
138
 
144
139
 
145
140
  def drop_table(
146
- table: str,
141
+ name: str,
142
+ *args,
147
143
  namespace: Optional[str] = None,
148
144
  catalog: Optional[str] = None,
145
+ table_version: Optional[str] = None,
149
146
  purge: bool = False,
150
- *args,
151
147
  **kwargs,
152
148
  ) -> None:
153
149
  """Drop a table from the catalog and optionally purge it. Raises an error
154
150
  if the table does not exist."""
155
- _get_catalog(catalog).impl.drop_table(table, namespace, purge, *args, **kwargs)
151
+ catalog_obj = get_catalog(catalog)
152
+ catalog_obj.impl.drop_table(
153
+ name,
154
+ *args,
155
+ namespace=namespace,
156
+ table_version=table_version,
157
+ purge=purge,
158
+ inner=catalog_obj.inner,
159
+ **kwargs,
160
+ )
156
161
 
157
162
 
158
163
  def refresh_table(
159
164
  table: str,
165
+ *args,
160
166
  namespace: Optional[str] = None,
161
167
  catalog: Optional[str] = None,
162
- *args,
163
168
  **kwargs,
164
169
  ) -> None:
165
170
  """Refresh metadata cached on the Ray cluster for the given table."""
166
- _get_catalog(catalog).impl.refresh_table(table, namespace, *args, **kwargs)
171
+ catalog_obj = get_catalog(catalog)
172
+ catalog_obj.impl.refresh_table(
173
+ table,
174
+ *args,
175
+ namespace=namespace,
176
+ inner=catalog_obj.inner,
177
+ **kwargs,
178
+ )
167
179
 
168
180
 
169
181
  def list_tables(
170
- namespace: Optional[str] = None, catalog: Optional[str] = None, *args, **kwargs
182
+ *args, namespace: Optional[str] = None, catalog: Optional[str] = None, **kwargs
171
183
  ) -> ListResult[TableDefinition]:
172
184
  """List a page of table definitions. Raises an error if the given namespace
173
185
  does not exist."""
174
- return _get_catalog(catalog).impl.list_tables(namespace, *args, **kwargs)
186
+ catalog_obj = get_catalog(catalog)
187
+ return catalog_obj.impl.list_tables(
188
+ *args,
189
+ namespace=namespace,
190
+ inner=catalog_obj.inner,
191
+ **kwargs,
192
+ )
175
193
 
176
194
 
177
195
  def get_table(
178
- table: str,
196
+ name: str,
197
+ *args,
179
198
  namespace: Optional[str] = None,
180
199
  catalog: Optional[str] = None,
181
- *args,
200
+ table_version: Optional[str] = None,
201
+ stream_format: StreamFormat = StreamFormat.DELTACAT,
182
202
  **kwargs,
183
203
  ) -> Optional[TableDefinition]:
184
204
  """Get table definition metadata. Returns None if the given table does not
185
205
  exist."""
186
- return _get_catalog(catalog).impl.get_table(table, namespace, *args, **kwargs)
206
+ catalog_obj = get_catalog(catalog)
207
+ return catalog_obj.impl.get_table(
208
+ name,
209
+ *args,
210
+ namespace=namespace,
211
+ table_version=table_version,
212
+ stream_format=stream_format,
213
+ inner=catalog_obj.inner,
214
+ **kwargs,
215
+ )
187
216
 
188
217
 
189
218
  def truncate_table(
190
219
  table: str,
220
+ *args,
191
221
  namespace: Optional[str] = None,
192
222
  catalog: Optional[str] = None,
193
- *args,
194
223
  **kwargs,
195
224
  ) -> None:
196
225
  """Truncate table data. Raises an error if the table does not exist."""
197
- _get_catalog(catalog).impl.truncate_table(table, namespace, *args, **kwargs)
226
+ catalog_obj = get_catalog(catalog)
227
+ catalog_obj.impl.truncate_table(
228
+ table,
229
+ *args,
230
+ namespace=namespace,
231
+ inner=catalog_obj.inner,
232
+ **kwargs,
233
+ )
198
234
 
199
235
 
200
236
  def rename_table(
201
237
  table: str,
202
238
  new_name: str,
239
+ *args,
203
240
  namespace: Optional[str] = None,
204
241
  catalog: Optional[str] = None,
205
- *args,
206
242
  **kwargs,
207
243
  ) -> None:
208
244
  """Rename a table."""
209
- _get_catalog(catalog).impl.rename_table(table, new_name, namespace, *args, **kwargs)
245
+ catalog_obj = get_catalog(catalog)
246
+ catalog_obj.impl.rename_table(
247
+ table,
248
+ new_name,
249
+ *args,
250
+ namespace=namespace,
251
+ inner=catalog_obj.inner,
252
+ **kwargs,
253
+ )
210
254
 
211
255
 
212
256
  def table_exists(
213
257
  table: str,
258
+ *args,
214
259
  namespace: Optional[str] = None,
215
260
  catalog: Optional[str] = None,
216
- *args,
217
261
  **kwargs,
218
262
  ) -> bool:
219
263
  """Returns True if the given table exists, False if not."""
220
- return _get_catalog(catalog).impl.table_exists(table, namespace, *args, **kwargs)
264
+ catalog_obj = get_catalog(catalog)
265
+ return catalog_obj.impl.table_exists(
266
+ table,
267
+ *args,
268
+ namespace=namespace,
269
+ inner=catalog_obj.inner,
270
+ **kwargs,
271
+ )
221
272
 
222
273
 
223
274
  # namespace functions
224
275
  def list_namespaces(
225
- catalog: Optional[str] = None, *args, **kwargs
276
+ *args, catalog: Optional[str] = None, **kwargs
226
277
  ) -> ListResult[Namespace]:
227
278
  """List a page of table namespaces."""
228
- return _get_catalog(catalog).impl.list_namespaces(*args, **kwargs)
279
+ catalog_obj = get_catalog(catalog)
280
+ return catalog_obj.impl.list_namespaces(
281
+ *args,
282
+ inner=catalog_obj.inner,
283
+ **kwargs,
284
+ )
229
285
 
230
286
 
231
287
  def get_namespace(
232
- namespace: str, catalog: Optional[str] = None, *args, **kwargs
288
+ namespace: str,
289
+ catalog: Optional[str] = None,
290
+ *args,
291
+ **kwargs,
233
292
  ) -> Optional[Namespace]:
234
293
  """Get table namespace metadata for the specified table namespace. Returns
235
294
  None if the given namespace does not exist."""
236
- return _get_catalog(catalog).impl.get_namespace(namespace, *args, **kwargs)
295
+ catalog_obj = get_catalog(catalog)
296
+ return catalog_obj.impl.get_namespace(
297
+ namespace,
298
+ *args,
299
+ inner=catalog_obj.inner,
300
+ **kwargs,
301
+ )
237
302
 
238
303
 
239
304
  def namespace_exists(
240
- namespace: str, catalog: Optional[str] = None, *args, **kwargs
305
+ namespace: str,
306
+ catalog: Optional[str] = None,
307
+ *args,
308
+ **kwargs,
241
309
  ) -> bool:
242
310
  """Returns True if the given table namespace exists, False if not."""
243
- return _get_catalog(catalog).impl.namespace_exists(namespace, *args, **kwargs)
311
+ catalog_obj = get_catalog(catalog)
312
+ return catalog_obj.impl.namespace_exists(
313
+ namespace,
314
+ *args,
315
+ inner=catalog_obj.inner,
316
+ **kwargs,
317
+ )
244
318
 
245
319
 
246
320
  def create_namespace(
247
321
  namespace: str,
248
- permissions: Dict[str, Any],
322
+ properties: Optional[NamespaceProperties] = None,
249
323
  catalog: Optional[str] = None,
250
324
  *args,
251
325
  **kwargs,
252
326
  ) -> Namespace:
253
- """Creates a table namespace with the given name and permissions. Returns
327
+ """Creates a table namespace with the given name and properties. Returns
254
328
  the created namespace. Raises an error if the namespace already exists."""
255
- return _get_catalog(catalog).impl.create_namespace(
256
- namespace, permissions, *args, **kwargs
329
+ catalog_obj = get_catalog(catalog)
330
+ return catalog_obj.impl.create_namespace(
331
+ namespace,
332
+ *args,
333
+ properties=properties,
334
+ inner=catalog_obj.inner,
335
+ **kwargs,
257
336
  )
258
337
 
259
338
 
260
339
  def alter_namespace(
261
340
  namespace: str,
341
+ *args,
262
342
  catalog: Optional[str] = None,
263
- permissions: Optional[Dict[str, Any]] = None,
343
+ properties: Optional[NamespaceProperties] = None,
264
344
  new_namespace: Optional[str] = None,
265
- *args,
266
345
  **kwargs,
267
346
  ) -> None:
268
347
  """Alter table namespace definition."""
269
- _get_catalog(catalog).impl.alter_namespace(
270
- namespace, permissions, new_namespace, *args, **kwargs
348
+ catalog_obj = get_catalog(catalog)
349
+ catalog_obj.impl.alter_namespace(
350
+ namespace,
351
+ *args,
352
+ properties=properties,
353
+ new_namespace=new_namespace,
354
+ inner=catalog_obj.inner,
355
+ **kwargs,
271
356
  )
272
357
 
273
358
 
274
359
  def drop_namespace(
275
- namespace: str, catalog: Optional[str] = None, purge: bool = False, *args, **kwargs
360
+ namespace: str, *args, catalog: Optional[str] = None, purge: bool = False, **kwargs
276
361
  ) -> None:
277
362
  """Drop the given namespace and all of its tables from the catalog,
278
363
  optionally purging them."""
279
- _get_catalog(catalog).impl.drop_namespace(namespace, purge, *args, **kwargs)
364
+ catalog_obj = get_catalog(catalog)
365
+ catalog_obj.impl.drop_namespace(
366
+ namespace,
367
+ *args,
368
+ purge=purge,
369
+ inner=catalog_obj.inner,
370
+ **kwargs,
371
+ )
280
372
 
281
373
 
282
- def default_namespace(catalog: Optional[str] = None) -> str:
374
+ def default_namespace(*args, catalog: Optional[str] = None, **kwargs) -> str:
283
375
  """Returns the default namespace for the catalog."""
284
- return _get_catalog(catalog).impl.default_namespace()
376
+ catalog_obj = get_catalog(catalog)
377
+ return catalog_obj.impl.default_namespace(*args, inner=catalog_obj.inner, **kwargs)
@@ -0,0 +1,4 @@
1
+ from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
2
+ import deltacat.catalog.iceberg.impl as IcebergCatalog
3
+
4
# Public API of this package: the config class and the Iceberg catalog
# implementation module (imported above under the alias `IcebergCatalog`).
__all__ = ["IcebergCatalogConfig", "IcebergCatalog"]
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Dict
3
+
4
+ from attr import dataclass
5
+ from pyiceberg.catalog import CatalogType
6
+
7
+
8
# NOTE: `dataclass` here is the one imported from `attr`, not the stdlib
# `dataclasses.dataclass`.
@dataclass
class IcebergCatalogConfig:
    """
    Configuration properties for Iceberg catalog implementation.

    This class holds the catalog type and the properties needed to load the
    PyIceberg catalog used for interaction with Iceberg tables and metadata.
    It does not hold a catalog instance itself.

    This configuration is passed through to PyIceberg by invoking load_catalog.
    The properties provided must match properties accepted by PyIceberg for
    each catalog type.
    See: :func:`deltacat.catalog.iceberg.initialize`

    Attributes:
        type: The PyIceberg catalog type (a ``CatalogType`` value)
        properties: Dict of properties passed to pyiceberg load_catalog
    """

    type: CatalogType
    properties: Dict[str, Any]