deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. deltacat/__init__.py +19 -15
  2. deltacat/benchmarking/benchmark_engine.py +4 -2
  3. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  4. deltacat/catalog/__init__.py +62 -5
  5. deltacat/catalog/main/impl.py +18 -8
  6. deltacat/catalog/model/catalog.py +111 -73
  7. deltacat/catalog/model/properties.py +25 -22
  8. deltacat/compute/jobs/client.py +7 -5
  9. deltacat/constants.py +1 -2
  10. deltacat/env.py +10 -0
  11. deltacat/examples/basic_logging.py +1 -3
  12. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  13. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  14. deltacat/examples/indexer/indexer.py +2 -2
  15. deltacat/examples/indexer/job_runner.py +1 -2
  16. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  17. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  18. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  19. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  20. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  21. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  22. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  23. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  24. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  25. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  26. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  27. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  28. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  29. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  30. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  31. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  32. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  33. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  34. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  35. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  36. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  37. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  38. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  39. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  40. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  41. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  42. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  43. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  44. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  45. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  46. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  47. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  49. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  50. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  51. deltacat/io/reader/deltacat_read_api.py +1 -1
  52. deltacat/storage/model/shard.py +6 -2
  53. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  54. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  55. deltacat/tests/catalog/model/__init__.py +0 -0
  56. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  57. deltacat/tests/catalog/test_catalogs.py +52 -98
  58. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  59. deltacat/tests/daft/__init__.py +0 -0
  60. deltacat/tests/daft/test_model.py +97 -0
  61. deltacat/tests/experimental/__init__.py +0 -0
  62. deltacat/tests/experimental/catalog/__init__.py +0 -0
  63. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  64. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  65. deltacat/tests/experimental/daft/__init__.py +0 -0
  66. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  67. deltacat/tests/experimental/storage/__init__.py +0 -0
  68. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  69. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  70. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  71. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  72. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  73. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  74. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  75. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  76. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  77. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  78. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  79. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  80. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  81. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  82. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  83. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  84. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  85. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  86. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  87. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  88. deltacat/tests/storage/model/test_shard.py +3 -1
  89. deltacat/types/media.py +3 -3
  90. deltacat/utils/daft.py +530 -4
  91. deltacat/utils/export.py +3 -1
  92. deltacat/utils/url.py +1 -1
  93. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +4 -5
  94. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +120 -100
  95. deltacat/catalog/iceberg/__init__.py +0 -4
  96. deltacat/daft/daft_scan.py +0 -115
  97. deltacat/daft/model.py +0 -258
  98. deltacat/daft/translator.py +0 -126
  99. deltacat/examples/common/fixtures.py +0 -15
  100. deltacat/storage/rivulet/__init__.py +0 -11
  101. deltacat/storage/rivulet/feather/__init__.py +0 -5
  102. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  103. /deltacat/{daft → examples/experimental}/__init__.py +0 -0
  104. /deltacat/examples/{common → experimental/iceberg}/__init__.py +0 -0
  105. /deltacat/{examples/iceberg → experimental/catalog}/__init__.py +0 -0
  106. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  107. /deltacat/{storage/iceberg → experimental/storage}/__init__.py +0 -0
  108. /deltacat/{storage/rivulet/arrow → experimental/storage/iceberg}/__init__.py +0 -0
  109. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  110. /deltacat/{storage/rivulet/fs → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  111. /deltacat/{storage/rivulet/metastore → experimental/storage/rivulet/fs}/__init__.py +0 -0
  112. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  113. /deltacat/{storage/rivulet/reader → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  114. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  115. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  116. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  117. /deltacat/{storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  118. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  119. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  120. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  121. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/shard}/__init__.py +0 -0
  122. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/writer}/__init__.py +0 -0
  123. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  124. /deltacat/tests/{storage/rivulet/schema → catalog/data}/__init__.py +0 -0
  125. /deltacat/tests/{storage/rivulet/writer → catalog/main}/__init__.py +0 -0
  126. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  127. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  128. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -8,7 +8,7 @@ from deltacat.api import (
8
8
  list,
9
9
  put,
10
10
  )
11
- from deltacat.catalog.delegate import (
11
+ from deltacat.catalog import ( # noqa: F401
12
12
  alter_namespace,
13
13
  alter_table,
14
14
  create_namespace,
@@ -27,17 +27,18 @@ from deltacat.catalog.delegate import (
27
27
  table_exists,
28
28
  truncate_table,
29
29
  write_to_table,
30
- )
31
- from deltacat.catalog.model.catalog import ( # noqa: F401
32
- Catalog,
33
- Catalogs,
34
- raise_if_not_initialized,
35
- is_initialized,
36
30
  init,
31
+ is_initialized,
32
+ clear_catalogs,
37
33
  get_catalog,
34
+ get_catalog_properties,
35
+ pop_catalog,
38
36
  put_catalog,
37
+ raise_if_not_initialized,
38
+ Catalog,
39
+ CatalogProperties,
40
+ TableDefinition,
39
41
  )
40
- from deltacat.catalog.model.table_definition import TableDefinition
41
42
  from deltacat.compute import (
42
43
  job_client,
43
44
  local_job_client,
@@ -60,7 +61,6 @@ from deltacat.storage import (
60
61
  SortScheme,
61
62
  NullOrder,
62
63
  )
63
- from deltacat.storage.rivulet import Dataset as RivDataset, Datatype as RivDatatype
64
64
  from deltacat.types.media import (
65
65
  ContentEncoding,
66
66
  ContentType,
@@ -73,7 +73,9 @@ from deltacat.utils.url import DeltaCatUrl
73
73
 
74
74
  __iceberg__ = []
75
75
  if importlib.util.find_spec("pyiceberg") is not None:
76
- from deltacat.catalog.iceberg import impl as IcebergCatalog # noqa: F401
76
+ from deltacat.experimental.catalog.iceberg import ( # noqa: F401
77
+ impl as IcebergCatalog,
78
+ )
77
79
 
78
80
  __iceberg__ = [
79
81
  "IcebergCatalog",
@@ -81,7 +83,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
81
83
 
82
84
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
83
85
 
84
- __version__ = "2.0.0b10"
86
+ __version__ = "2.0.0b11"
85
87
 
86
88
 
87
89
  __all__ = [
@@ -110,12 +112,16 @@ __all__ = [
110
112
  "default_namespace",
111
113
  "write_to_table",
112
114
  "read_table",
115
+ "init",
116
+ "is_initialized",
117
+ "clear_catalogs",
113
118
  "get_catalog",
119
+ "get_catalog_properties",
120
+ "pop_catalog",
114
121
  "put_catalog",
115
122
  "raise_if_not_initialized",
116
- "is_initialized",
117
- "init",
118
123
  "Catalog",
124
+ "CatalogProperties",
119
125
  "ContentType",
120
126
  "ContentEncoding",
121
127
  "Dataset",
@@ -123,8 +129,6 @@ __all__ = [
123
129
  "DatastoreType",
124
130
  "DeltaCatUrl",
125
131
  "DistributedDataset",
126
- "RivDataset",
127
- "RivDatatype",
128
132
  "Field",
129
133
  "LifecycleState",
130
134
  "ListResult",
@@ -4,8 +4,10 @@ from contextlib import contextmanager
4
4
  from typing import Generator, Tuple
5
5
 
6
6
  from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
7
- from deltacat.storage.rivulet.dataset import Dataset
8
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
7
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
8
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
9
+ QueryExpression,
10
+ )
9
11
 
10
12
 
11
13
  @contextmanager
@@ -1,10 +1,12 @@
1
1
  import math
2
2
  from random import shuffle
3
3
  import pytest
4
- from deltacat.storage.rivulet.dataset import Dataset
5
- from deltacat.storage.rivulet.schema.datatype import Datatype
6
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
7
- from deltacat.storage.rivulet.schema.schema import Schema
4
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
5
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
6
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
7
+ QueryExpression,
8
+ )
9
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
8
10
  from deltacat.benchmarking.benchmark_engine import BenchmarkEngine
9
11
  from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkReport
10
12
  from deltacat.benchmarking.benchmark_suite import BenchmarkSuite
@@ -1,14 +1,71 @@
1
+ from deltacat.catalog.delegate import (
2
+ alter_namespace,
3
+ alter_table,
4
+ create_namespace,
5
+ create_table,
6
+ default_namespace,
7
+ drop_namespace,
8
+ drop_table,
9
+ get_namespace,
10
+ get_table,
11
+ list_namespaces,
12
+ list_tables,
13
+ namespace_exists,
14
+ read_table,
15
+ refresh_table,
16
+ rename_table,
17
+ table_exists,
18
+ truncate_table,
19
+ write_to_table,
20
+ )
21
+ from deltacat.catalog.model.catalog import ( # noqa: F401
22
+ all_catalogs,
23
+ init,
24
+ is_initialized,
25
+ clear_catalogs,
26
+ get_catalog,
27
+ pop_catalog,
28
+ put_catalog,
29
+ raise_if_not_initialized,
30
+ Catalog,
31
+ )
1
32
  from deltacat.catalog.model.properties import ( # noqa: F401
2
33
  CatalogProperties,
3
34
  get_catalog_properties,
4
35
  )
5
- from deltacat.catalog.model.catalog import Catalog, Catalogs # noqa: F401
6
- from deltacat.catalog.main import impl as DeltacatCatalog
36
+ from deltacat.catalog.model.table_definition import TableDefinition
37
+ from deltacat.catalog.main import impl as dcat
7
38
 
8
39
  __all__ = [
9
- "CatalogProperties",
40
+ "alter_namespace",
41
+ "alter_table",
42
+ "create_namespace",
43
+ "create_table",
44
+ "default_namespace",
45
+ "drop_namespace",
46
+ "drop_table",
47
+ "get_namespace",
48
+ "get_table",
49
+ "list_namespaces",
50
+ "list_tables",
51
+ "namespace_exists",
52
+ "read_table",
53
+ "refresh_table",
54
+ "rename_table",
55
+ "table_exists",
56
+ "truncate_table",
57
+ "write_to_table",
58
+ "all_catalogs",
59
+ "init",
60
+ "is_initialized",
61
+ "clear_catalogs",
62
+ "get_catalog",
10
63
  "get_catalog_properties",
64
+ "pop_catalog",
65
+ "put_catalog",
66
+ "raise_if_not_initialized",
67
+ "dcat",
11
68
  "Catalog",
12
- "Catalogs",
13
- "DeltacatCatalog",
69
+ "CatalogProperties",
70
+ "TableDefinition",
14
71
  ]
@@ -3,7 +3,7 @@ import logging
3
3
 
4
4
  import deltacat as dc
5
5
 
6
- from deltacat.catalog import CatalogProperties
6
+ from deltacat.catalog.model.properties import CatalogProperties
7
7
  from deltacat.exceptions import (
8
8
  NamespaceAlreadyExistsError,
9
9
  StreamNotFoundError,
@@ -42,20 +42,30 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
42
42
  """
43
43
  Default Catalog interface implementation using DeltaCAT native storage.
44
44
 
45
- When this is used by `delegate.py` the `Catalog` implementation `inner`
46
- property will be set to the value returned from `intialize`.
45
+ The functions here should not be invoked directly, but should instead be
46
+ invoked through `delegate.py` (e.g., to support passing catalog's by name, and
47
+ to ensure that each initialized `Catalog` implementation has its `inner`
48
+ property set to the `CatalogProperties` returned from `initialize()`).
47
49
 
48
- `CatalogProperties` has all state required to implement catalog functions,
49
- such as metastore root URI.
50
+ The `CatalogProperties` instance returned by `initialize()` contains all
51
+ durable state required to deterministically reconstruct the associated DeltaCAT
52
+ native `Catalog` implementation (e.g., the root URI for the catalog metastore).
50
53
  """
51
54
 
52
55
 
53
56
  # catalog functions
54
- def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProperties:
57
+ def initialize(
58
+ config: Optional[CatalogProperties] = None,
59
+ *args,
60
+ **kwargs,
61
+ ) -> CatalogProperties:
55
62
  """
56
- Initializes the data catalog with the given arguments.
63
+ Performs any required one-time initialization and validation of this
64
+ catalog implementation based on the input configuration. If no config
65
+ instance is given, a new `CatalogProperties` instance is constructed
66
+ using the given keyword arguments.
57
67
 
58
- returns CatalogProperties as the "inner" state value for a DC native catalog
68
+ Returns the input config if given, and the newly created config otherwise.
59
69
  """
60
70
  if config is not None:
61
71
  if not isinstance(config, CatalogProperties):
@@ -9,11 +9,8 @@ from functools import partial
9
9
  import ray
10
10
 
11
11
  from deltacat import logs
12
- from deltacat.annotations import ExperimentalAPI
13
- from deltacat.catalog.main import impl as DeltaCatCatalog
14
- from deltacat.catalog.iceberg import impl as IcebergCatalog
15
- from deltacat.catalog import CatalogProperties
16
- from deltacat.catalog.iceberg import IcebergCatalogConfig
12
+ from deltacat.catalog.main import impl as dcat
13
+ from deltacat.catalog.model.properties import CatalogProperties
17
14
  from deltacat.constants import DEFAULT_CATALOG
18
15
 
19
16
  all_catalogs: Optional[ray.actor.ActorHandle] = None
@@ -22,14 +19,20 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
22
19
 
23
20
 
24
21
  class Catalog:
25
- def __init__(self, impl: ModuleType = DeltaCatCatalog, *args, **kwargs):
22
+ def __init__(
23
+ self,
24
+ config: Optional[Union[CatalogProperties, Any]] = None,
25
+ impl: ModuleType = dcat,
26
+ *args,
27
+ **kwargs,
28
+ ):
26
29
  """
27
30
  Constructor for a Catalog.
28
31
 
29
- Invokes `impl.initialize(*args, **kwargs)` and stores its return value
30
- in the `inner` property, which captures all state required to
31
- deterministically reconstruct this Catalog instance on any node (and
32
- must therefore be pickleable by Ray cloudpickle).
32
+ Invokes `impl.initialize(config, *args, **kwargs)` and stores its
33
+ return value in the `inner` property. This captures all state required
34
+ to deterministically reconstruct this Catalog instance on any node, and
35
+ must be pickleable by Ray cloudpickle.
33
36
  """
34
37
  if not isinstance(self, Catalog):
35
38
  # self may contain the tuple returned from __reduce__ (ray pickle bug?)
@@ -40,32 +43,15 @@ class Catalog:
40
43
  err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
41
44
  raise RuntimeError(err_msg)
42
45
 
46
+ self._config = config
43
47
  self._impl = impl
44
- self._inner = self._impl.initialize(*args, **kwargs)
48
+ self._inner = self._impl.initialize(config=config, *args, **kwargs)
45
49
  self._args = args
46
50
  self._kwargs = kwargs
47
51
 
48
- @classmethod
49
- @ExperimentalAPI
50
- def iceberg(cls, config: IcebergCatalogConfig, *args, **kwargs):
51
- """
52
- !!! ICEBERG SUPPORT IS EXPERIMENTAL !!!
53
-
54
- Factory method to construct a catalog from Iceberg catalog params
55
-
56
- This method is just a wrapper around __init__ with stronger typing. You may still call __init__,
57
- plumbing __params__ through as kwargs
58
- """
59
- return cls(impl=IcebergCatalog, *args, **{"config": config, **kwargs})
60
-
61
- @classmethod
62
- def default(cls, config: CatalogProperties, *args, **kwargs):
63
- """
64
- Factory method to construct a catalog with the default implementation
65
-
66
- Uses CatalogProperties as configuration
67
- """
68
- return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})
52
+ @property
53
+ def config(self):
54
+ return self._config
69
55
 
70
56
  @property
71
57
  def impl(self):
@@ -79,7 +65,11 @@ class Catalog:
79
65
  def __reduce__(self):
80
66
  # instantiated catalogs may fail to pickle, so exclude _inner
81
67
  # (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
82
- return partial(self.__class__, **self._kwargs), (self._impl, *self._args)
68
+ return partial(self.__class__, **self._kwargs), (
69
+ self._config,
70
+ self._impl,
71
+ *self._args,
72
+ )
83
73
 
84
74
  def __str__(self):
85
75
  string_rep = f"{self.__class__.__name__}("
@@ -102,38 +92,62 @@ class Catalogs:
102
92
  catalogs: Union[Catalog, Dict[str, Catalog]],
103
93
  default: Optional[str] = None,
104
94
  ):
95
+ self._catalogs = {}
96
+ self._default_catalog_name = None
97
+ self._default_catalog = None
98
+ self.update(catalogs, default)
99
+
100
+ def all(self) -> Dict[str, Catalog]:
101
+ return self._catalogs
102
+
103
+ def update(
104
+ self,
105
+ catalogs: Union[Catalog, Dict[str, Catalog]],
106
+ default: Optional[str] = None,
107
+ ) -> None:
105
108
  if isinstance(catalogs, Catalog):
106
109
  catalogs = {DEFAULT_CATALOG: catalogs}
107
110
  elif not isinstance(catalogs, dict):
108
111
  raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
109
- self.catalogs: Dict[str, Catalog] = catalogs
112
+ self._catalogs.update(catalogs)
110
113
  if default:
111
114
  if default not in catalogs:
112
115
  raise ValueError(
113
116
  f"Default catalog `{default}` not found in: {catalogs}"
114
117
  )
115
- self.default_catalog = self.catalogs[default]
118
+ self._default_catalog = self._catalogs[default]
119
+ self._default_catalog_name = default
116
120
  elif len(catalogs) == 1:
117
- self.default_catalog = list(self.catalogs.values())[0]
121
+ self._default_catalog = list(self._catalogs.values())[0]
118
122
  else:
119
- self.default_catalog = None
120
-
121
- def all(self) -> Dict[str, Catalog]:
122
- return self.catalogs
123
+ self._default_catalog = None
123
124
 
124
125
  def names(self) -> List[str]:
125
- return list(self.catalogs.keys())
126
+ return list(self._catalogs.keys())
126
127
 
127
128
  def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
128
- self.catalogs[name] = catalog
129
+ self._catalogs[name] = catalog
129
130
  if set_default:
130
- self.default_catalog = catalog
131
+ self._default_catalog = catalog
131
132
 
132
133
  def get(self, name) -> Optional[Catalog]:
133
- return self.catalogs.get(name)
134
+ return self._catalogs.get(name)
135
+
136
+ def pop(self, name) -> Optional[Catalog]:
137
+ catalog = self._catalogs.pop(name, None)
138
+ if catalog and self._default_catalog_name == name:
139
+ if len(self._catalogs) == 1:
140
+ self._default_catalog = list(self._catalogs.values())[0]
141
+ else:
142
+ self._default_catalog = None
143
+ return catalog
144
+
145
+ def clear(self) -> None:
146
+ self._catalogs.clear()
147
+ self._default_catalog = None
134
148
 
135
149
  def default(self) -> Optional[Catalog]:
136
- return self.default_catalog
150
+ return self._default_catalog
137
151
 
138
152
 
139
153
  def is_initialized(*args, **kwargs) -> bool:
@@ -142,12 +156,9 @@ def is_initialized(*args, **kwargs) -> bool:
142
156
  """
143
157
  global all_catalogs
144
158
 
145
- # If ray is not initialized, then Catalogs cannot be initialized
146
159
  if not ray.is_initialized():
147
- # Any existing actor reference stored in catalog_module must be stale - reset it
160
+ # Any existing Catalogs actor reference must be stale - reset it
148
161
  all_catalogs = None
149
- return False
150
-
151
162
  return all_catalogs is not None
152
163
 
153
164
 
@@ -168,9 +179,9 @@ def raise_if_not_initialized(
168
179
  def init(
169
180
  catalogs: Union[Dict[str, Catalog], Catalog] = {},
170
181
  default: Optional[str] = None,
171
- ray_init_args: Dict[str, Any] = None,
182
+ ray_init_args: Dict[str, Any] = {},
172
183
  *,
173
- force_reinitialize=False,
184
+ force=False,
174
185
  ) -> None:
175
186
  """
176
187
  Initialize DeltaCAT catalogs.
@@ -180,18 +191,19 @@ def init(
180
191
  :param default: The name of the default Catalog. If only one Catalog is
181
192
  provided, it will always be the default.
182
193
  :param ray_init_args: Keyword arguments to pass to `ray.init()`.
183
- :param force_reinitialize: Whether to force Ray reinitialization.
194
+ :param force: Whether to force DeltaCAT reinitialization. If True, reruns
195
+ ray.init(**ray_init_args) and overwrites all previously registered
196
+ catalogs.
184
197
  """
185
198
  global all_catalogs
186
199
 
187
- if is_initialized() and not force_reinitialize:
200
+ if is_initialized() and not force:
188
201
  logger.warning("DeltaCAT already initialized.")
189
202
  return
190
- else:
191
- if ray_init_args:
192
- ray.init(**ray_init_args)
193
- else:
194
- ray.init()
203
+
204
+ # initialize ray (and ignore reinitialization errors)
205
+ ray_init_args["ignore_reinit_error"] = True
206
+ ray.init(**ray_init_args)
195
207
 
196
208
  # register custom serializer for catalogs since these may contain
197
209
  # unserializable objects like boto3 clients with SSLContext
@@ -203,7 +215,7 @@ def init(
203
215
  all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
204
216
 
205
217
 
206
- def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
218
+ def get_catalog(name: Optional[str] = None) -> Catalog:
207
219
  """
208
220
  Get a catalog by name, or the default catalog if no name is provided.
209
221
 
@@ -241,17 +253,44 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
241
253
  return catalog
242
254
 
243
255
 
256
+ def clear_catalogs() -> None:
257
+ """
258
+ Clear all catalogs from the global map of named catalogs.
259
+ """
260
+ if all_catalogs:
261
+ ray.get(all_catalogs.clear.remote())
262
+
263
+
264
+ def pop_catalog(name: str) -> Optional[Catalog]:
265
+ """
266
+ Remove a named catalog from the global map of named catalogs.
267
+
268
+ Args:
269
+ name: Name of the catalog to remove.
270
+
271
+ Returns:
272
+ The removed catalog, or None if not found.
273
+ """
274
+ global all_catalogs
275
+
276
+ if not all_catalogs:
277
+ return None
278
+ catalog = ray.get(all_catalogs.pop.remote(name))
279
+ return catalog
280
+
281
+
244
282
  def put_catalog(
245
283
  name: str,
246
284
  catalog: Catalog = None,
247
285
  *,
248
286
  default: bool = False,
249
- ray_init_args: Dict[str, Any] = None,
287
+ ray_init_args: Dict[str, Any] = {},
250
288
  fail_if_exists: bool = False,
251
289
  **kwargs,
252
290
  ) -> Catalog:
253
291
  """
254
- Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.
292
+ Add a named catalog to the global map of named catalogs. Initializes
293
+ DeltaCAT if not already initialized.
255
294
 
256
295
  Args:
257
296
  name: Name of the catalog.
@@ -261,8 +300,8 @@ def put_catalog(
261
300
  default: Make this the default catalog if multiple catalogs are
262
301
  available. If only one catalog is available, it will always be the
263
302
  default.
264
- ray_init_args: Ray initialization args (used only if ray not already
265
- initialized)
303
+ ray_init_args: Ray initialization args (used only if ray is not already
304
+ initialized).
266
305
  fail_if_exists: if True, raises an error if a catalog with the given
267
306
  name already exists. If False, inserts or replaces the given
268
307
  catalog name.
@@ -276,6 +315,8 @@ def put_catalog(
276
315
 
277
316
  if not catalog:
278
317
  catalog = Catalog(**kwargs)
318
+ if name is None:
319
+ raise ValueError("Catalog name cannot be None")
279
320
 
280
321
  # Initialize, if necessary
281
322
  if not is_initialized():
@@ -283,25 +324,22 @@ def put_catalog(
283
324
  if not default:
284
325
  logger.info(
285
326
  f"Calling put_catalog with set_as_default=False, "
286
- f"but still setting Catalog {catalog} as default since it is the only catalog."
327
+ f"but still setting Catalog {catalog} as default since it is "
328
+ f"the only catalog."
287
329
  )
288
330
  init({name: catalog}, ray_init_args=ray_init_args)
289
- return
331
+ return catalog
290
332
 
291
333
  # Fail if fail_if_exists and catalog already exists
292
334
  if fail_if_exists:
293
- catalog_already_exists = False
294
335
  try:
295
336
  get_catalog(name)
296
- # Note - need to set state catalog_already_exists and throw ValueError later, or else it will be
297
- # caught in the except block which is meant to catch the ValueError from get_catalog
298
- catalog_already_exists = True
299
337
  except ValueError:
300
338
  pass
301
- if catalog_already_exists:
302
- raise ValueError(
303
- f"Failed to put catalog {name} because it already exists and fail_if_exists={fail_if_exists}"
304
- )
339
+ raise ValueError(
340
+ f"Failed to put catalog {name} because it already exists and "
341
+ f"fail_if_exists={fail_if_exists}"
342
+ )
305
343
 
306
344
  # Add the catalog (which may overwrite existing if fail_if_exists=False)
307
345
  ray.get(all_catalogs.put.remote(name, catalog, default))
@@ -1,6 +1,9 @@
1
1
  from __future__ import annotations
2
+
2
3
  from typing import Optional, Any
3
4
 
5
+ import os
6
+
4
7
  import pyarrow
5
8
  from deltacat.constants import DELTACAT_ROOT
6
9
 
@@ -8,18 +11,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
8
11
 
9
12
 
10
13
  def get_catalog_properties(
11
- *args,
14
+ *,
12
15
  catalog: Optional[CatalogProperties] = None,
13
16
  inner: Optional[CatalogProperties] = None,
14
17
  **kwargs,
15
18
  ) -> CatalogProperties:
16
19
  """
17
- Helper function to fetch CatalogProperties instance. You are meant to call this by providing your functions
18
- kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
20
+ Helper function to fetch CatalogProperties instance.
19
21
 
20
- This will look for a CatalogProperty value in the kwargs "catalog" or "inner". If these are found, it returns
21
- the CatalogProperty value under that kwarg. Otherwise, it will pass through kwargs to the CatalogProperties
22
- constructor.
22
+ This will look first look for CatalogProperties in either "catalog"
23
+ or "inner" and otherwise passes all keyword arguments to the
24
+ CatalogProperties constructor.
23
25
  """
24
26
  properties = catalog if catalog is not None else inner
25
27
  if properties is not None and isinstance(properties, CatalogProperties):
@@ -39,21 +41,22 @@ class CatalogProperties:
39
41
  DeltaCAT catalog instance. Properties are set from system environment
40
42
  variables unless explicit overrides are provided during initialization.
41
43
 
42
- Catalog and storage APIs rely on the property catalog to retrieve durable state about the catalog they're
43
- working against.
44
+ Catalog and storage APIs rely on the property catalog to retrieve durable
45
+ state about the catalog they're working against.
44
46
 
45
47
  Attributes:
46
- root (str): URI string The root path where catalog metadata and data
47
- files are stored. Root is determined (in prededence order) by:
48
- 1. check "root" input argument
49
- 2. check env variable "DELTACAT_ROOT"
50
- 3. default to ${cwd}/.deltacat
48
+ root: The root path for catalog metadata and data storage. Resolved by
49
+ searching for the root path in the following order:
50
+ 1. "root" constructor input argument
51
+ 2. "DELTACAT_ROOT" system environment variable
52
+ 3. default to "./.deltacat/"
51
53
 
52
54
  filesystem: The filesystem implementation that should be used for
53
55
  reading/writing files. If None, a filesystem will be inferred from
54
56
  the catalog root path.
55
57
 
56
- storage: Storage class implementation (overrides default filesystem storage impl)
58
+ storage: Storage class implementation (overrides default filesystem
59
+ storage impl)
57
60
  """
58
61
 
59
62
  def __init__(
@@ -66,21 +69,21 @@ class CatalogProperties:
66
69
  Initialize a CatalogProperties instance.
67
70
 
68
71
  Args:
69
- root: A single directory path that serves as the catalog root dir.
72
+ root: Catalog root directory path. Uses the "DELTACAT_ROOT"
73
+ system environment variable if not set, and defaults to
74
+ "./.deltacat/" if this environment variable is not set.
70
75
  filesystem: The filesystem implementation that should be used for
71
76
  reading these files. If None, a filesystem will be inferred.
72
- If not None, the provided filesystem will still be validated
73
- against the provided path to ensure compatibility.
77
+ If provided, this will be validated for compatibility with the
78
+ catalog root path.
74
79
  """
75
80
  # set root, using precedence rules described in pydoc
76
81
  if root is None:
77
82
  # Check environment variables
78
- # This is set or defaulted in constants.py
79
83
  root = DELTACAT_ROOT
80
- if root is None:
81
- raise ValueError(
82
- "Expected environment variable DELTACAT_ROOT to be set or defaulted"
83
- )
84
+ if not root:
85
+ # Default to "./.deltacat/"
86
+ root = os.path.join(os.getcwd(), ".deltacat")
84
87
 
85
88
  resolved_root, resolved_filesystem = resolve_path_and_filesystem(
86
89
  path=root,