deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,101 @@
1
+ import os
2
+ import ray
3
+ import logging
4
+
5
+ from deltacat import logs
6
+ from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
7
+ from deltacat.examples.common.fixtures import (
8
+ store_cli_args_in_os_environ,
9
+ )
10
+ from deltacat.env import create_ray_runtime_environment
11
+
12
+ # initialize the driver logger
13
+ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
14
+
15
+
16
@ray.remote
def logging_worker(var1, var2):
    """Ray remote task that writes one DEBUG and two INFO application log lines.

    Each log line is also announced on stdout together with the application
    log directory (`DELTACAT_APP_LOG_DIR`).

    Args:
        var1: Value reported in the "Worker Variable 1" log line.
        var2: Value reported in the "Worker Variable 2" log line.
    """
    # for AWS Glue, worker loggers must be initialized within the worker process
    worker_logger = logs.configure_application_logger(logging.getLogger(__name__))

    # (level name shown on stdout, logger method, line label, value to report)
    entries = (
        ("DEBUG", worker_logger.debug, "Worker System Environment", os.environ),
        ("INFO", worker_logger.info, "Worker Variable 1", var1),
        ("INFO", worker_logger.info, "Worker Variable 2", var2),
    )
    for level, emit, label, value in entries:
        message = f"{label}: {value}"
        print(
            f"Writing {level} log line from Worker to {DELTACAT_APP_LOG_DIR}: '{message}'"
        )
        emit(message)
38
+
39
+
40
def run(var1="default1", var2="default2", **kwargs):
    """Log both variables from the driver, then from a Ray worker.

    Args:
        var1: First value to log.
        var2: Second value to log.
        **kwargs: Accepted and ignored, so callers may pass extra keyword
            arguments (the script entry point passes all of `os.environ`).
    """
    for position, value in enumerate((var1, var2), start=1):
        driver_line = f"Driver Variable {position}: {value}"
        print(
            f"Writing INFO log line from Driver to {DELTACAT_APP_LOG_DIR}: '{driver_line}'"
        )
        driver_logger.info(driver_line)

    print("Starting worker...")
    # block until the remote logging task has completed
    pending_task = logging_worker.remote(var1, var2)
    ray.get(pending_task)
    print(
        f"The driver is shutting down. Additional DeltaCAT system logs have been written to {DELTACAT_SYS_LOG_DIR}"
    )
58
+
59
+
60
if __name__ == "__main__":
    # CLI flag -> help text; every flag is parsed as a string
    help_by_flag = {
        "--var1": "First argument to log.",
        "--var2": "Second argument to log.",
        "--STAGE": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
    }
    example_script_args = [
        ([flag], {"help": help_text, "type": str})
        for flag, help_text in help_by_flag.items()
    ]

    # store any CLI args in the runtime environment
    store_cli_args_in_os_environ(example_script_args)

    # create any runtime environment required to run the example,
    # then initialize ray with it
    ray.init(runtime_env=create_ray_runtime_environment())

    # run the example using os.environ as kwargs
    run(**os.environ)
File without changes
@@ -0,0 +1,15 @@
1
+ import os
2
+ import logging
3
+ import argparse
4
+ from deltacat import logs
5
+
6
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
7
+
8
+
9
def store_cli_args_in_os_environ(script_args_list=None):
    """Parse command-line arguments and export them as environment variables.

    Args:
        script_args_list: Iterable of `(args, kwargs)` pairs, each forwarded to
            `argparse.ArgumentParser.add_argument(*args, **kwargs)`. `None`
            (the default) registers no arguments.

    Side effects:
        Prints the parsed namespace and updates `os.environ` in place with one
        entry per argument that received a value. Arguments that were not
        supplied (parsed as `None`) are left out of the environment.
    """
    parser = argparse.ArgumentParser()
    for flag_args, flag_kwargs in script_args_list or ():
        parser.add_argument(*flag_args, **flag_kwargs)
    parsed = parser.parse_args()
    print(f"Command Line Arguments: {parsed}")
    # os.environ only accepts str values. Omitted optional arguments parse to
    # None and typed arguments may parse to non-str values; either one makes
    # os.environ.update() raise TypeError. Skip the former and stringify the
    # latter.
    os.environ.update(
        {
            name: str(value)
            for name, value in vars(parsed).items()
            if value is not None
        }
    )
@@ -0,0 +1,27 @@
1
+ import ray
2
+ import deltacat
3
+ import daft
4
+ import pyiceberg
5
+
6
+
7
def print_package_version_info():
    """Print the DeltaCAT, PyIceberg, Ray, and Daft `__version__` values to stdout."""
    labeled_packages = (
        ("DeltaCAT", deltacat),
        ("PyIceberg", pyiceberg),
        ("Ray", ray),
        ("Daft", daft),
    )
    for label, package in labeled_packages:
        print(f"{label} Version: {package.__version__}")
12
+
13
+
14
@ray.remote
def hello_worker():
    """Ray remote task: print a greeting, then the package version info."""
    print("Hello, Worker!")
    print_package_version_info()
18
+
19
+
20
def run():
    """Print a greeting and package versions on the driver, then on a worker.

    Blocks until the remote `hello_worker` task has finished.
    """
    print("Hello, Driver!")
    print_package_version_info()
    # Wait for the task to finish: a fire-and-forget `.remote()` call lets the
    # driver return (and the script exit) before the worker has run, so its
    # output may never appear.
    ray.get(hello_worker.remote())
24
+
25
+
26
# run the example only when this module is executed as a script
if __name__ == "__main__":
    run()
File without changes
@@ -0,0 +1,139 @@
1
+ import os
2
+ import logging
3
+
4
+ import daft
5
+ import deltacat as dc
6
+
7
+ from deltacat import logs
8
+ from deltacat import IcebergCatalog
9
+ from deltacat.examples.common.fixtures import (
10
+ store_cli_args_in_os_environ,
11
+ )
12
+
13
+ from pyiceberg.schema import (
14
+ Schema,
15
+ NestedField,
16
+ DoubleType,
17
+ StringType,
18
+ )
19
+ from pyiceberg.partitioning import PartitionSpec, PartitionField
20
+ from pyiceberg.transforms import BucketTransform
21
+
22
+ from deltacat.storage.iceberg.model import (
23
+ SchemaMapper,
24
+ PartitionSchemeMapper,
25
+ )
26
+ from deltacat.env import create_ray_runtime_environment
27
+
28
+ # initialize the driver logger
29
+ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
30
+
31
+
32
def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """Write a small Daft dataframe to a bucket-partitioned Iceberg table.

    Registers a single DeltaCAT catalog named "iceberg" (an `IcebergCatalog`
    configured with catalog type "glue" in region "us-east-1"), writes a
    four-row dataframe to `test_namespace.test_table_bucketed`, then fetches
    the table definition and prints it.

    Args:
        warehouse: Value passed as the "warehouse" Iceberg catalog property.
        **kwargs: Accepted and ignored, so callers may pass extra keyword
            arguments (the script entry point passes all of `os.environ`).
    """
    # create any runtime environment required to run the example
    runtime_env = create_ray_runtime_environment()

    # Start by initializing DeltaCAT and registering available Catalogs.
    # Ray will be initialized automatically via `ray.init()`.
    # Only the `iceberg` data catalog is provided so it will become the default.
    # If initializing multiple catalogs, use the `default_catalog_name` param
    # to specify which catalog should be the default.
    dc.init(
        catalogs={
            # the name of the DeltaCAT catalog is "iceberg"
            "iceberg": dc.Catalog(
                # Apache Iceberg implementation of deltacat.catalog.interface
                impl=IcebergCatalog,
                # kwargs for pyiceberg.catalog.load_catalog start here...
                # the name of the Iceberg catalog is "example-iceberg-catalog"
                name="example-iceberg-catalog",
                # for additional properties see:
                # https://py.iceberg.apache.org/configuration/
                properties={
                    "type": "glue",
                    "region_name": "us-east-1",
                    "warehouse": warehouse,
                },
            )
        },
        # pass the runtime environment into ray.init()
        ray_init_args={"runtime_env": runtime_env},
    )

    # define a native Iceberg table schema
    schema = Schema(
        NestedField(field_id=1, name="symbol", field_type=StringType(), required=True),
        NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
        NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
    )

    # define a native Iceberg partition spec
    # (source_id=1 matches the field_id of the `symbol` field declared above)
    partition_spec = PartitionSpec(
        PartitionField(
            source_id=1,
            field_id=1000,
            transform=BucketTransform(2),
            name="symbol_bucket",
        )
    )

    # define a native Iceberg sort order
    # sort_order = SortOrder(SortField(source_id=1, transform=IdentityTransform()))

    # define the Daft dataframe to write
    df = daft.from_pydict(
        {
            "symbol": ["amzn", "goog", "meta", "msft"],
            "bid": [157.16, 150.55, 392.03, 403.25],
            "ask": [157.17, 150.56, 392.09, 403.27],
        }
    )

    # write to a table named `test_namespace.test_table_bucketed`
    # we don't need to specify which catalog to create this table in since
    # only the "iceberg" catalog is available
    table_name = "test_table_bucketed"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    dc.write_to_table(
        data=df,
        # path=warehouse + "/datafiles",
        table=table_name,
        namespace=namespace,
        schema=SchemaMapper.map(schema),
        partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
        # sort_keys=SortSchemeMapper.map(sort_order, schema),
    )

    # read the table definition back and print it
    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")
111
+
112
+
113
if __name__ == "__main__":
    # CLI flag -> help text; every flag is parsed as a string
    help_by_flag = {
        "--warehouse": "S3 path for Iceberg file storage.",
        "--STAGE": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
    }
    example_script_args = [
        ([flag], {"help": help_text, "type": str})
        for flag, help_text in help_by_flag.items()
    ]

    # store any CLI args in the runtime environment
    store_cli_args_in_os_environ(example_script_args)

    # run the example using os.environ as kwargs
    run(**os.environ)
@@ -0,0 +1,149 @@
1
+ import os
2
+ import logging
3
+ import deltacat as dc
4
+
5
+ from deltacat import logs
6
+ from deltacat import IcebergCatalog
7
+ from deltacat.examples.common.fixtures import (
8
+ store_cli_args_in_os_environ,
9
+ )
10
+
11
+ from pyiceberg.schema import (
12
+ Schema,
13
+ NestedField,
14
+ DoubleType,
15
+ StringType,
16
+ TimestampType,
17
+ FloatType,
18
+ StructType,
19
+ )
20
+ from pyiceberg.partitioning import PartitionSpec, PartitionField
21
+ from pyiceberg.transforms import DayTransform, IdentityTransform
22
+ from pyiceberg.table.sorting import SortField, SortOrder
23
+
24
+ from deltacat.exceptions import TableAlreadyExistsError
25
+ from deltacat.storage.iceberg.model import (
26
+ SchemaMapper,
27
+ PartitionSchemeMapper,
28
+ SortSchemeMapper,
29
+ )
30
+ from deltacat.env import create_ray_runtime_environment
31
+
32
+ # initialize the driver logger
33
+ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
34
+
35
+
36
def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """Create (if needed) and fetch a day-partitioned, sorted Iceberg table.

    Registers a single DeltaCAT catalog named "iceberg" (an `IcebergCatalog`
    configured with catalog type "glue" in region "us-east-1"), attempts to
    create `test_namespace.test_table`, tolerates the table already existing,
    then fetches the table definition and prints it.

    Args:
        warehouse: Value passed as the "warehouse" Iceberg catalog property.
        **kwargs: Accepted and ignored, so callers may pass extra keyword
            arguments (the script entry point passes all of `os.environ`).
    """
    # create any runtime environment required to run the example
    runtime_env = create_ray_runtime_environment()

    # Start by initializing DeltaCAT and registering available Catalogs.
    # Ray will be initialized automatically via `ray.init()`.
    # Only the `iceberg` data catalog is provided so it will become the default.
    # If initializing multiple catalogs, use the `default_catalog_name` param
    # to specify which catalog should be the default.
    dc.init(
        catalogs={
            # the name of the DeltaCAT catalog is "iceberg"
            "iceberg": dc.Catalog(
                # Apache Iceberg implementation of deltacat.catalog.interface
                impl=IcebergCatalog,
                # kwargs for pyiceberg.catalog.load_catalog start here...
                # the name of the Iceberg catalog is "example-iceberg-catalog"
                name="example-iceberg-catalog",
                # for additional properties see:
                # https://py.iceberg.apache.org/configuration/
                properties={
                    "type": "glue",
                    "region_name": "us-east-1",
                    "warehouse": warehouse,
                },
            )
        },
        # pass the runtime environment into ray.init()
        ray_init_args={"runtime_env": runtime_env},
    )

    # define a native Iceberg table schema
    schema = Schema(
        NestedField(
            field_id=1, name="datetime", field_type=TimestampType(), required=True
        ),
        NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
        NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
        NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
        NestedField(
            field_id=5,
            name="details",
            field_type=StructType(
                NestedField(
                    field_id=6,
                    name="created_by",
                    field_type=StringType(),
                    required=False,
                ),
            ),
            required=False,
        ),
    )

    # define a native Iceberg partition spec
    # (source_id=1 matches the field_id of the `datetime` field declared above)
    partition_spec = PartitionSpec(
        PartitionField(
            source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
        )
    )

    # define a native Iceberg sort order
    # (source_id=2 matches the field_id of the `symbol` field declared above)
    sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))

    # create a table named `test_namespace.test_table`
    # we don't need to specify which catalog to create this table in since
    # only the "iceberg" catalog is available
    table_name = "test_table"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    try:
        table_definition = dc.create_table(
            table=table_name,
            namespace=namespace,
            schema=SchemaMapper.map(schema),
            partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
            sort_keys=SortSchemeMapper.map(sort_order, schema),
        )
        print(f"Created Glue Table: {table_definition}")
    except TableAlreadyExistsError:
        # an existing table is not an error here; it is fetched below either way
        print(f"Glue Table `{namespace}.{table_name}` already exists.")

    # read the table definition back and print it
    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")
121
+
122
+
123
+ if __name__ == "__main__":
124
+ example_script_args = [
125
+ (
126
+ [
127
+ "--warehouse",
128
+ ],
129
+ {
130
+ "help": "S3 path for Iceberg file storage.",
131
+ "type": str,
132
+ },
133
+ ),
134
+ (
135
+ [
136
+ "--STAGE",
137
+ ],
138
+ {
139
+ "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
140
+ "type": str,
141
+ },
142
+ ),
143
+ ]
144
+
145
+ # store any CLI args in the runtime environment
146
+ store_cli_args_in_os_environ(example_script_args)
147
+
148
+ # run the example using os.environ as kwargs
149
+ run(**os.environ)
deltacat/exceptions.py CHANGED
@@ -1,10 +1,16 @@
1
1
  from __future__ import annotations
2
2
  from enum import Enum
3
- import botocore
4
- import ray
3
+ from typing import Callable
5
4
  import logging
5
+
6
6
  import tenacity
7
- from deltacat import logs
7
+
8
+ from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
9
+
10
+ import botocore
11
+ from botocore.exceptions import BotoCoreError
12
+
13
+ import ray
8
14
  from ray.exceptions import (
9
15
  RayError,
10
16
  RayTaskError,
@@ -13,14 +19,14 @@ from ray.exceptions import (
13
19
  NodeDiedError,
14
20
  OutOfMemoryError,
15
21
  )
16
- from deltacat.storage import interface as DeltaCatStorage
17
- from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
18
- from botocore.exceptions import BotoCoreError
19
- from typing import Callable
22
+
23
+ from daft.exceptions import DaftTransientError, DaftCoreException
24
+
25
+ import deltacat as dc
26
+ from deltacat import logs
20
27
  from deltacat.utils.ray_utils.runtime import (
21
28
  get_current_ray_task_id,
22
29
  )
23
- from daft.exceptions import DaftTransientError, DaftCoreException
24
30
 
25
31
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
32
 
@@ -64,6 +70,14 @@ class DeltaCatErrorNames(str, Enum):
64
70
  UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
65
71
  UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
66
72
 
73
+ NAMESPACE_NOT_FOUND_ERROR = "NamespaceNotFoundError"
74
+ TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
75
+ TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
76
+ STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
77
+ DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
78
+ TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
79
+ NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
80
+
67
81
 
68
82
  class DeltaCatError(Exception):
69
83
  def __init__(self, *args, **kwargs):
@@ -206,6 +220,34 @@ class UnrecognizedRayTaskError(NonRetryableError):
206
220
  error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
207
221
 
208
222
 
223
class NamespaceNotFoundError(NonRetryableError):
    """Non-retryable error whose `error_name` is "NamespaceNotFoundError"."""

    error_name = DeltaCatErrorNames.NAMESPACE_NOT_FOUND_ERROR.value
225
+
226
+
227
class TableNotFoundError(NonRetryableError):
    """Non-retryable error whose `error_name` is "TableNotFoundError"."""

    error_name = DeltaCatErrorNames.TABLE_NOT_FOUND_ERROR.value
229
+
230
+
231
class TableVersionNotFoundError(NonRetryableError):
    """Non-retryable error whose `error_name` is "TableVersionNotFoundError"."""

    error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
233
+
234
+
235
class StreamNotFoundError(NonRetryableError):
    """Non-retryable error whose `error_name` is "StreamNotFoundError"."""

    error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
237
+
238
+
239
class DeltaNotFoundError(NonRetryableError):
    """Non-retryable error whose `error_name` is "DeltaNotFoundError"."""

    error_name = DeltaCatErrorNames.DELTA_NOT_FOUND_ERROR.value
241
+
242
+
243
class TableAlreadyExistsError(NonRetryableError):
    """Non-retryable error whose `error_name` is "TableAlreadyExistsError"."""

    error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
245
+
246
+
247
class NamespaceAlreadyExistsError(NonRetryableError):
    """Non-retryable error whose `error_name` is "NamespaceAlreadyExistsError"."""

    # Use the namespace-specific enum member; the table member here would make
    # this error indistinguishable by name from TableAlreadyExistsError.
    error_name = DeltaCatErrorNames.NAMESPACE_ALREADY_EXISTS_ERROR.value
249
+
250
+
209
251
  def categorize_errors(func: Callable):
210
252
  def wrapper(*args, **kwargs):
211
253
  try:
@@ -238,7 +280,7 @@ def categorize_errors(func: Callable):
238
280
 
239
281
  def categorize_deltacat_exception(
240
282
  e: BaseException,
241
- deltacat_storage: DeltaCatStorage = None,
283
+ deltacat_storage: dc.storage.interface = None,
242
284
  deltacat_storage_kwargs: dict = None,
243
285
  ):
244
286
  if deltacat_storage_kwargs is None:
deltacat/logs.py CHANGED
@@ -18,6 +18,7 @@ from deltacat.constants import (
18
18
  DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
19
19
  DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
20
20
  DELTACAT_LOGGER_CONTEXT,
21
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER,
21
22
  )
22
23
 
23
24
  DEFAULT_LOG_LEVEL = "INFO"
@@ -226,6 +227,7 @@ def _configure_logger(
226
227
  # This maintains log level of rotating file handlers
227
228
  primary_log_level = log_level
228
229
  logger.propagate = False
230
+ needs_handler = True
229
231
  if log_level <= logging.getLevelName("DEBUG"):
230
232
  if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
231
233
  handler = _create_rotating_file_handler(
@@ -235,8 +237,9 @@ def _configure_logger(
235
237
  context_kwargs=context_kwargs,
236
238
  )
237
239
  _add_logger_handler(logger, handler)
240
+ needs_handler = not DELTACAT_LOGGER_USE_SINGLE_HANDLER
238
241
  primary_log_level = logging.getLevelName("INFO")
239
- if not _file_handler_exists(logger, log_dir, log_base_file_name):
242
+ if not _file_handler_exists(logger, log_dir, log_base_file_name) and needs_handler:
240
243
  handler = _create_rotating_file_handler(
241
244
  log_dir,
242
245
  log_base_file_name,