deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,139 @@
1
+ import os
2
+ import logging
3
+
4
+ import daft
5
+ import deltacat as dc
6
+
7
+ from deltacat import logs
8
+ from deltacat import IcebergCatalog
9
+ from deltacat.examples.common.fixtures import (
10
+ store_cli_args_in_os_environ,
11
+ )
12
+
13
+ from pyiceberg.schema import (
14
+ Schema,
15
+ NestedField,
16
+ DoubleType,
17
+ StringType,
18
+ )
19
+ from pyiceberg.partitioning import PartitionSpec, PartitionField
20
+ from pyiceberg.transforms import BucketTransform
21
+
22
+ from deltacat.storage.iceberg.model import (
23
+ SchemaMapper,
24
+ PartitionSchemeMapper,
25
+ )
26
+ from deltacat.env import create_ray_runtime_environment
27
+
28
+ # initialize the driver logger
29
+ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
30
+
31
+
32
def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """Write a small Daft dataframe to a bucket-partitioned Iceberg table.

    Registers a single Glue-backed Iceberg catalog with DeltaCAT, writes four
    rows of quote data to ``test_namespace.test_table_bucketed`` and then reads
    the table definition back.

    Args:
        warehouse: Warehouse location handed to the Iceberg catalog
            properties.
        **kwargs: Accepted and ignored, so the function can be called with
            ``**os.environ``.
    """
    # Build whatever Ray runtime environment the example needs.
    ray_runtime_env = create_ray_runtime_environment()

    # Everything after `impl` is forwarded as kwargs for
    # pyiceberg.catalog.load_catalog. For additional properties see:
    # https://py.iceberg.apache.org/configuration/
    glue_properties = {
        "type": "glue",
        "region_name": "us-east-1",
        "warehouse": warehouse,
    }
    iceberg_catalog = dc.Catalog(
        # Apache Iceberg implementation of deltacat.catalog.interface
        impl=IcebergCatalog,
        # the name of the Iceberg catalog itself
        name="example-iceberg-catalog",
        properties=glue_properties,
    )

    # Initialize DeltaCAT and register the available catalogs. Ray is
    # initialized automatically via `ray.init()`. Because "iceberg" is the
    # only catalog registered it becomes the default; when registering more
    # than one, pick the default with the `default_catalog_name` param.
    dc.init(
        catalogs={"iceberg": iceberg_catalog},
        # handed through to ray.init()
        ray_init_args={"runtime_env": ray_runtime_env},
    )

    # Native Iceberg table schema.
    symbol_field = NestedField(
        field_id=1, name="symbol", field_type=StringType(), required=True
    )
    bid_field = NestedField(
        field_id=2, name="bid", field_type=DoubleType(), required=False
    )
    ask_field = NestedField(
        field_id=3, name="ask", field_type=DoubleType(), required=False
    )
    schema = Schema(symbol_field, bid_field, ask_field)

    # Native Iceberg partition spec: bucket field 1 (`symbol`) into 2 buckets.
    symbol_bucket = PartitionField(
        source_id=1,
        field_id=1000,
        transform=BucketTransform(2),
        name="symbol_bucket",
    )
    partition_spec = PartitionSpec(symbol_bucket)

    # NOTE: no sort order is defined or passed to the write in this example.

    # The Daft dataframe to write.
    quotes = {
        "symbol": ["amzn", "goog", "meta", "msft"],
        "bid": [157.16, 150.55, 392.03, 403.25],
        "ask": [157.17, 150.56, 392.09, 403.27],
    }
    df = daft.from_pydict(quotes)

    # Write to `test_namespace.test_table_bucketed`. No catalog needs to be
    # named because "iceberg" is the only one available.
    table_name = "test_table_bucketed"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    dc.write_to_table(
        data=df,
        table=table_name,
        namespace=namespace,
        schema=SchemaMapper.map(schema),
        partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
    )

    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")
111
+
112
+
113
if __name__ == "__main__":
    # Each entry is (argument flags, keyword options) for one CLI argument.
    warehouse_arg = (
        ["--warehouse"],
        {
            "help": "S3 path for Iceberg file storage.",
            "type": str,
        },
    )
    stage_arg = (
        ["--STAGE"],
        {
            "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
            "type": str,
        },
    )
    example_script_args = [warehouse_arg, stage_arg]

    # Store any CLI args in the runtime environment (os.environ).
    store_cli_args_in_os_environ(example_script_args)

    # Run the example with os.environ as kwargs; `run` ignores the entries it
    # does not name.
    run(**os.environ)
@@ -0,0 +1,149 @@
1
+ import os
2
+ import logging
3
+ import deltacat as dc
4
+
5
+ from deltacat import logs
6
+ from deltacat import IcebergCatalog
7
+ from deltacat.examples.common.fixtures import (
8
+ store_cli_args_in_os_environ,
9
+ )
10
+
11
+ from pyiceberg.schema import (
12
+ Schema,
13
+ NestedField,
14
+ DoubleType,
15
+ StringType,
16
+ TimestampType,
17
+ FloatType,
18
+ StructType,
19
+ )
20
+ from pyiceberg.partitioning import PartitionSpec, PartitionField
21
+ from pyiceberg.transforms import DayTransform, IdentityTransform
22
+ from pyiceberg.table.sorting import SortField, SortOrder
23
+
24
+ from deltacat.exceptions import TableAlreadyExistsError
25
+ from deltacat.storage.iceberg.model import (
26
+ SchemaMapper,
27
+ PartitionSchemeMapper,
28
+ SortSchemeMapper,
29
+ )
30
+ from deltacat.env import create_ray_runtime_environment
31
+
32
+ # initialize the driver logger
33
+ driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
34
+
35
+
36
def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """Create (if needed) and then fetch an Iceberg table through DeltaCAT.

    Registers a single Glue-backed Iceberg catalog with DeltaCAT, tries to
    create ``test_namespace.test_table`` with a day-partitioned, symbol-sorted
    schema, and finally reads the table definition back.

    Args:
        warehouse: Warehouse location handed to the Iceberg catalog
            properties.
        **kwargs: Accepted and ignored, so the function can be called with
            ``**os.environ``.
    """
    # Build whatever Ray runtime environment the example needs.
    ray_runtime_env = create_ray_runtime_environment()

    # Everything after `impl` is forwarded as kwargs for
    # pyiceberg.catalog.load_catalog. For additional properties see:
    # https://py.iceberg.apache.org/configuration/
    glue_properties = {
        "type": "glue",
        "region_name": "us-east-1",
        "warehouse": warehouse,
    }
    iceberg_catalog = dc.Catalog(
        # Apache Iceberg implementation of deltacat.catalog.interface
        impl=IcebergCatalog,
        # the name of the Iceberg catalog itself
        name="example-iceberg-catalog",
        properties=glue_properties,
    )

    # Initialize DeltaCAT and register the available catalogs. Ray is
    # initialized automatically via `ray.init()`. Because "iceberg" is the
    # only catalog registered it becomes the default; when registering more
    # than one, pick the default with the `default_catalog_name` param.
    dc.init(
        catalogs={"iceberg": iceberg_catalog},
        # handed through to ray.init()
        ray_init_args={"runtime_env": ray_runtime_env},
    )

    # Native Iceberg table schema, including one nested struct column.
    details_struct = StructType(
        NestedField(
            field_id=6,
            name="created_by",
            field_type=StringType(),
            required=False,
        ),
    )
    schema = Schema(
        NestedField(
            field_id=1, name="datetime", field_type=TimestampType(), required=True
        ),
        NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
        NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
        NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
        NestedField(
            field_id=5,
            name="details",
            field_type=details_struct,
            required=False,
        ),
    )

    # Native Iceberg partition spec: day transform over field 1 (`datetime`).
    day_partition = PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
    partition_spec = PartitionSpec(day_partition)

    # Native Iceberg sort order: identity transform over field 2 (`symbol`).
    symbol_sort = SortField(source_id=2, transform=IdentityTransform())
    sort_order = SortOrder(symbol_sort)

    # Create `test_namespace.test_table`. No catalog needs to be named because
    # "iceberg" is the only one available.
    table_name = "test_table"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    try:
        table_definition = dc.create_table(
            table=table_name,
            namespace=namespace,
            schema=SchemaMapper.map(schema),
            partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
            sort_keys=SortSchemeMapper.map(sort_order, schema),
        )
    except TableAlreadyExistsError:
        # An existing table is fine; it is fetched below either way.
        print(f"Glue Table `{namespace}.{table_name}` already exists.")
    else:
        print(f"Created Glue Table: {table_definition}")

    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")
121
+
122
+
123
if __name__ == "__main__":
    # (argument flags, keyword options) pairs, one per supported CLI argument.
    example_script_args = [
        (
            ["--warehouse"],
            {"help": "S3 path for Iceberg file storage.", "type": str},
        ),
        (
            ["--STAGE"],
            {
                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
                "type": str,
            },
        ),
    ]

    # Store any CLI args in the runtime environment (os.environ).
    store_cli_args_in_os_environ(example_script_args)

    # Run the example with os.environ as kwargs; `run` ignores the entries it
    # does not name.
    run(**os.environ)
deltacat/exceptions.py CHANGED
@@ -1,10 +1,16 @@
1
1
  from __future__ import annotations
2
2
  from enum import Enum
3
- import botocore
4
- import ray
3
+ from typing import Callable
5
4
  import logging
5
+
6
6
  import tenacity
7
- from deltacat import logs
7
+
8
+ from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
9
+
10
+ import botocore
11
+ from botocore.exceptions import BotoCoreError
12
+
13
+ import ray
8
14
  from ray.exceptions import (
9
15
  RayError,
10
16
  RayTaskError,
@@ -13,14 +19,14 @@ from ray.exceptions import (
13
19
  NodeDiedError,
14
20
  OutOfMemoryError,
15
21
  )
16
- from deltacat.storage import interface as DeltaCatStorage
17
- from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
18
- from botocore.exceptions import BotoCoreError
19
- from typing import Callable
22
+
23
+ from daft.exceptions import DaftTransientError, DaftCoreException
24
+
25
+ import deltacat as dc
26
+ from deltacat import logs
20
27
  from deltacat.utils.ray_utils.runtime import (
21
28
  get_current_ray_task_id,
22
29
  )
23
- from daft.exceptions import DaftTransientError, DaftCoreException
24
30
 
25
31
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
32
 
@@ -64,6 +70,14 @@ class DeltaCatErrorNames(str, Enum):
64
70
  UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
65
71
  UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
66
72
 
73
+ NAMESPACE_NOT_FOUND_ERROR = "NamespaceNotFoundError"
74
+ TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
75
+ TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
76
+ STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
77
+ DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
78
+ TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
79
+ NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
80
+
67
81
 
68
82
  class DeltaCatError(Exception):
69
83
  def __init__(self, *args, **kwargs):
@@ -206,6 +220,34 @@ class UnrecognizedRayTaskError(NonRetryableError):
206
220
  error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
207
221
 
208
222
 
223
class NamespaceNotFoundError(NonRetryableError):
    """Non-retryable error reported as ``NamespaceNotFoundError``."""

    error_name = DeltaCatErrorNames.NAMESPACE_NOT_FOUND_ERROR.value


class TableNotFoundError(NonRetryableError):
    """Non-retryable error reported as ``TableNotFoundError``."""

    error_name = DeltaCatErrorNames.TABLE_NOT_FOUND_ERROR.value


class TableVersionNotFoundError(NonRetryableError):
    """Non-retryable error reported as ``TableVersionNotFoundError``."""

    error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value


class StreamNotFoundError(NonRetryableError):
    """Non-retryable error reported as ``StreamNotFoundError``."""

    error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value


class DeltaNotFoundError(NonRetryableError):
    """Non-retryable error reported as ``DeltaNotFoundError``."""

    error_name = DeltaCatErrorNames.DELTA_NOT_FOUND_ERROR.value


class TableAlreadyExistsError(NonRetryableError):
    """Non-retryable error reported as ``TableAlreadyExistsError``."""

    error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
245
+
246
+
247
class NamespaceAlreadyExistsError(NonRetryableError):
    """Non-retryable error reported as ``NamespaceAlreadyExistsError``."""

    # Use the namespace-specific enum member. The table member was used here
    # before, which made this error report itself as "TableAlreadyExistsError".
    error_name = DeltaCatErrorNames.NAMESPACE_ALREADY_EXISTS_ERROR.value
249
+
250
+
209
251
  def categorize_errors(func: Callable):
210
252
  def wrapper(*args, **kwargs):
211
253
  try:
@@ -238,7 +280,7 @@ def categorize_errors(func: Callable):
238
280
 
239
281
  def categorize_deltacat_exception(
240
282
  e: BaseException,
241
- deltacat_storage: DeltaCatStorage = None,
283
+ deltacat_storage: dc.storage.interface = None,
242
284
  deltacat_storage_kwargs: dict = None,
243
285
  ):
244
286
  if deltacat_storage_kwargs is None:
deltacat/logs.py CHANGED
@@ -18,6 +18,7 @@ from deltacat.constants import (
18
18
  DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
19
19
  DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
20
20
  DELTACAT_LOGGER_CONTEXT,
21
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER,
21
22
  )
22
23
 
23
24
  DEFAULT_LOG_LEVEL = "INFO"
@@ -226,6 +227,7 @@ def _configure_logger(
226
227
  # This maintains log level of rotating file handlers
227
228
  primary_log_level = log_level
228
229
  logger.propagate = False
230
+ needs_handler = True
229
231
  if log_level <= logging.getLevelName("DEBUG"):
230
232
  if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
231
233
  handler = _create_rotating_file_handler(
@@ -235,8 +237,9 @@ def _configure_logger(
235
237
  context_kwargs=context_kwargs,
236
238
  )
237
239
  _add_logger_handler(logger, handler)
240
+ needs_handler = not DELTACAT_LOGGER_USE_SINGLE_HANDLER
238
241
  primary_log_level = logging.getLevelName("INFO")
239
- if not _file_handler_exists(logger, log_dir, log_base_file_name):
242
+ if not _file_handler_exists(logger, log_dir, log_base_file_name) and needs_handler:
240
243
  handler = _create_rotating_file_handler(
241
244
  log_dir,
242
245
  log_base_file_name,
@@ -1,24 +1,63 @@
1
- from deltacat.aws.redshift import (
1
+ from deltacat.storage.model.manifest import (
2
+ EntryType,
3
+ EntryParams,
2
4
  Manifest,
3
5
  ManifestAuthor,
4
6
  ManifestEntry,
5
7
  ManifestEntryList,
6
8
  ManifestMeta,
7
9
  )
8
- from deltacat.storage.model.delta import Delta, DeltaLocator
10
+ from deltacat.storage.model.delta import (
11
+ Delta,
12
+ DeltaLocator,
13
+ DeltaProperties,
14
+ )
9
15
  from deltacat.storage.model.list_result import ListResult
10
16
  from deltacat.storage.model.locator import Locator
11
- from deltacat.storage.model.namespace import Namespace, NamespaceLocator
12
- from deltacat.storage.model.partition import Partition, PartitionLocator
13
- from deltacat.storage.model.stream import Stream, StreamLocator
14
- from deltacat.storage.model.table import Table, TableLocator
15
- from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
16
- from deltacat.storage.model.delete_parameters import DeleteParameters
17
- from deltacat.storage.model.partition_spec import (
18
- PartitionFilter,
17
+ from deltacat.storage.model.metafile import (
18
+ Metafile,
19
+ )
20
+ from deltacat.storage.model.transaction import (
21
+ TransactionOperation,
22
+ Transaction,
23
+ )
24
+ from deltacat.storage.model.namespace import (
25
+ Namespace,
26
+ NamespaceLocator,
27
+ NamespaceProperties,
28
+ )
29
+ from deltacat.storage.model.partition import (
30
+ Partition,
31
+ PartitionLocator,
32
+ PartitionLocatorAlias,
33
+ PartitionKey,
34
+ PartitionScheme,
35
+ PartitionSchemeList,
19
36
  PartitionValues,
20
- DeltaPartitionSpec,
21
- StreamPartitionSpec,
37
+ )
38
+ from deltacat.storage.model.schema import (
39
+ Field,
40
+ FieldId,
41
+ FieldLocator,
42
+ FieldName,
43
+ NestedFieldName,
44
+ Schema,
45
+ SchemaList,
46
+ )
47
+ from deltacat.storage.model.stream import (
48
+ Stream,
49
+ StreamLocator,
50
+ StreamLocatorAlias,
51
+ )
52
+ from deltacat.storage.model.table import (
53
+ Table,
54
+ TableLocator,
55
+ TableProperties,
56
+ )
57
+ from deltacat.storage.model.table_version import (
58
+ TableVersion,
59
+ TableVersionLocator,
60
+ TableVersionProperties,
22
61
  )
23
62
  from deltacat.storage.model.transform import (
24
63
  Transform,
@@ -26,9 +65,17 @@ from deltacat.storage.model.transform import (
26
65
  TransformParameters,
27
66
  BucketingStrategy,
28
67
  BucketTransformParameters,
29
- IdentityTransformParameters,
68
+ TruncateTransformParameters,
69
+ BucketTransform,
70
+ IdentityTransform,
71
+ VoidTransform,
72
+ UnknownTransform,
73
+ HourTransform,
74
+ DayTransform,
75
+ MonthTransform,
76
+ YearTransform,
77
+ TruncateTransform,
30
78
  )
31
-
32
79
  from deltacat.storage.model.types import (
33
80
  CommitState,
34
81
  DeltaType,
@@ -36,18 +83,39 @@ from deltacat.storage.model.types import (
36
83
  LifecycleState,
37
84
  LocalDataset,
38
85
  LocalTable,
86
+ NullOrder,
39
87
  SchemaConsistencyType,
88
+ StreamFormat,
89
+ SortOrder,
90
+ TransactionType,
91
+ TransactionOperationType,
92
+ )
93
+ from deltacat.storage.model.sort_key import (
94
+ SortKey,
95
+ SortScheme,
96
+ SortSchemeList,
40
97
  )
41
- from deltacat.storage.model.sort_key import SortKey, SortOrder
98
+ from deltacat.storage.main import impl as metastore
42
99
 
43
100
  __all__ = [
101
+ "BucketingStrategy",
102
+ "BucketTransform",
103
+ "BucketTransformParameters",
44
104
  "CommitState",
105
+ "DayTransform",
45
106
  "Delta",
46
107
  "DeltaLocator",
47
- "Partition",
48
- "DeleteParameters",
108
+ "DeltaProperties",
49
109
  "DeltaType",
50
110
  "DistributedDataset",
111
+ "EntryType",
112
+ "EntryParams",
113
+ "Field",
114
+ "FieldId",
115
+ "FieldLocator",
116
+ "FieldName",
117
+ "HourTransform",
118
+ "IdentityTransform",
51
119
  "LifecycleState",
52
120
  "ListResult",
53
121
  "LocalDataset",
@@ -56,28 +124,50 @@ __all__ = [
56
124
  "Manifest",
57
125
  "ManifestAuthor",
58
126
  "ManifestEntry",
59
- "ManifestMeta",
60
127
  "ManifestEntryList",
128
+ "ManifestMeta",
129
+ "Metafile",
130
+ "metastore",
131
+ "MonthTransform",
61
132
  "Namespace",
62
133
  "NamespaceLocator",
134
+ "NamespaceProperties",
135
+ "NestedFieldName",
136
+ "NullOrder",
137
+ "Partition",
138
+ "PartitionKey",
63
139
  "PartitionLocator",
64
- "Stream",
140
+ "PartitionLocatorAlias",
141
+ "PartitionScheme",
142
+ "PartitionSchemeList",
143
+ "PartitionValues",
144
+ "Schema",
145
+ "SchemaList",
65
146
  "SchemaConsistencyType",
147
+ "SortKey",
148
+ "SortOrder",
149
+ "SortScheme",
150
+ "SortSchemeList",
151
+ "Stream",
152
+ "StreamFormat",
66
153
  "StreamLocator",
154
+ "StreamLocatorAlias",
67
155
  "Table",
68
156
  "TableLocator",
157
+ "TableProperties",
69
158
  "TableVersion",
70
159
  "TableVersionLocator",
71
- "SortKey",
72
- "SortOrder",
73
- "PartitionFilter",
74
- "PartitionValues",
75
- "DeltaPartitionSpec",
76
- "StreamPartitionSpec",
160
+ "TableVersionProperties",
161
+ "Transaction",
162
+ "TransactionOperation",
163
+ "TransactionOperationType",
164
+ "TransactionType",
77
165
  "Transform",
78
166
  "TransformName",
79
167
  "TransformParameters",
80
- "BucketingStrategy",
81
- "BucketTransformParameters",
82
- "IdentityTransformParameters",
168
+ "TruncateTransform",
169
+ "TruncateTransformParameters",
170
+ "UnknownTransform",
171
+ "VoidTransform",
172
+ "YearTransform",
83
173
  ]
File without changes
@@ -0,0 +1,28 @@
1
+ from typing import Optional
2
+
3
+ from pyiceberg.catalog import Catalog
4
+ from deltacat.storage.model.scan.push_down import Pushdown
5
+ from deltacat.storage.model.scan.scan_plan import ScanPlan
6
+ from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
7
+ from deltacat.storage.util.scan_planner import ScanPlanner
8
+ from deltacat.storage.iceberg.impl import _try_load_iceberg_table
9
+
10
+
11
class IcebergScanPlanner(ScanPlanner):
    """Scan planner that builds scan plans from an Iceberg table's files."""

    def __init__(self, catalog: Catalog):
        """Keep a reference to the PyIceberg catalog used to load tables."""
        self.catalog = catalog

    def create_scan_plan(
        self,
        table_name: str,
        namespace: Optional[str] = None,
        pushdown: Optional[Pushdown] = None,
    ) -> ScanPlan:
        """Build a scan plan with one file scan task per planned Iceberg file.

        Args:
            table_name: Name of the table to load from the catalog.
            namespace: Optional namespace passed to the table loader.
            pushdown: Currently unused; see the TODO below.

        Returns:
            A ``ScanPlan`` whose tasks each wrap a single ``DataFile`` built
            from the planned file's path.
        """
        table = _try_load_iceberg_table(
            self.catalog, namespace=namespace, table_name=table_name
        )
        # TODO: implement predicate pushdown to Iceberg
        planned_files = table.scan().plan_files()
        return ScanPlan(
            [
                FileScanTask([DataFile(planned.file.file_path)])
                for planned in planned_files
            ]
        )