demo_uc_setup 0.1.0__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,62 @@
1
+ # Created by https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=macos,visualstudiocode
3
+
4
+ ### macOS ###
5
+ # General
6
+ .DS_Store
7
+ .AppleDouble
8
+ .LSOverride
9
+
10
+ # Icon must end with two \r
11
+ Icon
12
+
13
+
14
+ # Thumbnails
15
+ ._*
16
+
17
+ # Files that might appear in the root of a volume
18
+ .DocumentRevisions-V100
19
+ .fseventsd
20
+ .Spotlight-V100
21
+ .TemporaryItems
22
+ .Trashes
23
+ .VolumeIcon.icns
24
+ .com.apple.timemachine.donotpresent
25
+
26
+ # Directories potentially created on remote AFP share
27
+ .AppleDB
28
+ .AppleDesktop
29
+ Network Trash Folder
30
+ Temporary Items
31
+ .apdisk
32
+
33
+ ### macOS Patch ###
34
+ # iCloud generated files
35
+ *.icloud
36
+
37
+ ### VisualStudioCode ###
38
+ .vscode/*
39
+ !.vscode/settings.json
40
+ !.vscode/tasks.json
41
+ !.vscode/launch.json
42
+ !.vscode/extensions.json
43
+ !.vscode/*.code-snippets
44
+
45
+ # Local History for Visual Studio Code
46
+ .history/
47
+
48
+ # Built Visual Studio Code Extensions
49
+ *.vsix
50
+
51
+ ### VisualStudioCode Patch ###
52
+ # Ignore all local history of files
53
+ .history
54
+ .ionide
55
+
56
+ # End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
57
+
58
+ .databricks
59
+ *.pdf
60
+ __pycache__
61
+ .env
62
+ .venv
@@ -0,0 +1 @@
1
+ 3.11
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: demo_uc_setup
3
+ Version: 0.1.0
4
+ Summary: A reusable task-based framework for managing Unity Catalog in Databricks
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: databricks-connect>=16.1.1
7
+ Requires-Dist: databricks-sdk>=0.44.1
8
+ Requires-Dist: pydantic-settings>=2.8.0
9
+ Requires-Dist: pydantic>=2.10.6
10
+ Description-Content-Type: text/markdown
11
+
12
+ # Databricks Unity Catalog Setup Demo
13
+
14
+ A Python package that demonstrates automated setup and teardown of Databricks Unity Catalog resources using the Databricks SDK. This package provides a reusable framework for managing Unity Catalog resources programmatically, both from local environments and within Databricks notebooks.
15
+
16
+ ## Features
17
+
18
+ - Automated creation of Unity Catalog resources:
19
+ - Catalogs
20
+ - Schemas
21
+ - Volumes
22
+ - Configurable resource naming via environment variables
23
+ - Support for both local execution and Databricks notebook execution
24
+ - Type-safe configuration management using Pydantic
25
+ - Clean teardown functionality
26
+
27
+ ## Prerequisites
28
+
29
+ - Python 3.11+
30
+ - A Databricks workspace with Unity Catalog enabled
31
+ - Appropriate permissions to create/manage Unity Catalog resources
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install demo-uc-setup
37
+ ```
38
+
39
+ ## Configuration
40
+
41
+ The package uses environment variables for configuration. You can set these either in your environment or in a `.env` file:
42
+
43
+ ```env
44
+ # Required for local execution (optional in Databricks notebooks)
45
+ DATABRICKS_HOST=your-workspace-url
46
+ DATABRICKS_TOKEN=your-pat-token
47
+
48
+ # Optional - override default resource names
49
+ DEMO_CATALOG_NAME=custom_catalog_name
50
+ DEMO_SCHEMAS=["schema1", "schema2"]
51
+ DEMO_VOLUME_NAME=custom_volume_name
52
+ ```
53
+
54
+ ## Usage
55
+
56
+ ### Local Execution
57
+
58
+ ```python
59
+ from demo_uc_setup.unity_catalog_setup import UnityCatalogSetupTask
60
+ from demo_uc_setup.unity_catalog_teardown import UnityCatalogTeardownTask
61
+
62
+ # Setup Unity Catalog resources
63
+ UnityCatalogSetupTask.entrypoint()
64
+
65
+ # Teardown Unity Catalog resources
66
+ UnityCatalogTeardownTask.entrypoint()
67
+ ```
68
+
69
+ ### Databricks Notebook Execution
70
+
71
+ ```python
72
+ %pip install demo-uc-setup
73
+
74
+ from demo_uc_setup.unity_catalog_setup import UnityCatalogSetupTask
75
+ UnityCatalogSetupTask.entrypoint()
76
+ ```
77
+
78
+ ## Default Resource Names
79
+
80
+ If not overridden via environment variables, the package will create:
81
+ - A catalog named `demo_catalog`
82
+ - Two schemas: `demo_schema_1` and `demo_schema_2`
83
+ - A volume named `demo_volume` in each schema
84
+
85
+ ## Extending the Framework
86
+
87
+ The package provides a reusable `Task` base class that can be extended for custom Unity Catalog operations:
88
+
89
+ ```python
90
+ from demo_uc_setup.common import Task
91
+ from demo_uc_setup.config import Config
92
+
93
+ class CustomTask(Task[Config]):
94
+ def run(self):
95
+ self.logger.info("Starting custom task...")
96
+ # Your custom logic here
97
+ ```
98
+
99
+ ## Contributing
100
+
101
+ Contributions are welcome! Please feel free to submit a Pull Request.
102
+
103
+ ## License
104
+
105
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,94 @@
1
+ # Databricks Unity Catalog Setup Demo
2
+
3
+ A Python package that demonstrates automated setup and teardown of Databricks Unity Catalog resources using the Databricks SDK. This package provides a reusable framework for managing Unity Catalog resources programmatically, both from local environments and within Databricks notebooks.
4
+
5
+ ## Features
6
+
7
+ - Automated creation of Unity Catalog resources:
8
+ - Catalogs
9
+ - Schemas
10
+ - Volumes
11
+ - Configurable resource naming via environment variables
12
+ - Support for both local execution and Databricks notebook execution
13
+ - Type-safe configuration management using Pydantic
14
+ - Clean teardown functionality
15
+
16
+ ## Prerequisites
17
+
18
+ - Python 3.11+
19
+ - A Databricks workspace with Unity Catalog enabled
20
+ - Appropriate permissions to create/manage Unity Catalog resources
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install demo-uc-setup
26
+ ```
27
+
28
+ ## Configuration
29
+
30
+ The package uses environment variables for configuration. You can set these either in your environment or in a `.env` file:
31
+
32
+ ```env
33
+ # Required for local execution (optional in Databricks notebooks)
34
+ DATABRICKS_HOST=your-workspace-url
35
+ DATABRICKS_TOKEN=your-pat-token
36
+
37
+ # Optional - override default resource names
38
+ DEMO_CATALOG_NAME=custom_catalog_name
39
+ DEMO_SCHEMAS=["schema1", "schema2"]
40
+ DEMO_VOLUME_NAME=custom_volume_name
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ### Local Execution
46
+
47
+ ```python
48
+ from demo_uc_setup.unity_catalog_setup import UnityCatalogSetupTask
49
+ from demo_uc_setup.unity_catalog_teardown import UnityCatalogTeardownTask
50
+
51
+ # Setup Unity Catalog resources
52
+ UnityCatalogSetupTask.entrypoint()
53
+
54
+ # Teardown Unity Catalog resources
55
+ UnityCatalogTeardownTask.entrypoint()
56
+ ```
57
+
58
+ ### Databricks Notebook Execution
59
+
60
+ ```python
61
+ %pip install demo-uc-setup
62
+
63
+ from demo_uc_setup.unity_catalog_setup import UnityCatalogSetupTask
64
+ UnityCatalogSetupTask.entrypoint()
65
+ ```
66
+
67
+ ## Default Resource Names
68
+
69
+ If not overridden via environment variables, the package will create:
70
+ - A catalog named `demo_catalog`
71
+ - Two schemas: `demo_schema_1` and `demo_schema_2`
72
+ - A volume named `demo_volume` in each schema
73
+
74
+ ## Extending the Framework
75
+
76
+ The package provides a reusable `Task` base class that can be extended for custom Unity Catalog operations:
77
+
78
+ ```python
79
+ from demo_uc_setup.common import Task
80
+ from demo_uc_setup.config import Config
81
+
82
+ class CustomTask(Task[Config]):
83
+ def run(self):
84
+ self.logger.info("Starting custom task...")
85
+ # Your custom logic here
86
+ ```
87
+
88
+ ## Contributing
89
+
90
+ Contributions are welcome! Please feel free to submit a Pull Request.
91
+
92
+ ## License
93
+
94
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,38 @@
1
+ # Pythonic Config Management Approach
2
+ Type variables are a key concept in generic programming, allowing for the creation of flexible and reusable code that can work with multiple data types while maintaining type safety. In the context of Python configuration management, type variables play a crucial role in implementing generic classes and methods, as demonstrated in the common.py file of the chatten project, where they are used to create a versatile Task class that can work with different configuration types.
3
+
4
+ ## Pythonic Configuration with Pydantic
5
+ Pydantic's BaseSettings class forms the foundation of a flexible, type-safe configuration management system in the chatten project. This approach centralizes configuration parameters, leveraging Pydantic's automatic type validation and easy integration with environment variables and command-line arguments[1][2]. The configuration can be seamlessly imported into various tasks, as demonstrated in the loader.py and indexer.py files, while the databricks.yml file showcases parameterization for cloud deployment[3]. This method offers a balance between flexibility and type safety, particularly suitable for Python projects in cloud environments like Databricks, though it may not be ideal for scenarios requiring language-agnostic configurations.
6
+
7
+ Citations:
8
+ [1] https://docs.pydantic.dev/latest/api/pydantic_settings/
9
+ [2] https://docs.pydantic.dev/2.4/concepts/pydantic_settings/
10
+ [3] https://dzone.com/articles/order-in-chaos-python-configuration-management-for
11
+
12
+ ## Task Class Structure Explained
13
+ The `Task` class in `common.py` serves as a foundation for various tasks within the `chatten_rag` package, reducing boilerplate code and providing a reusable structure. It utilizes a generic type `T`, bound to the `Config` class, allowing each task to have its own specific configuration while maintaining type safety. The class initializes common components such as SparkSession, logger, and Databricks WorkspaceClient[1]. A key feature is the dynamic creation of configuration instances in the `__init__` method, where `self.config: T = self.config_class()` instantiates the task-specific configuration[1]. The `entrypoint` class method offers a standardized way to execute tasks, handling logging, configuration setup, and Spark environment initialization[1].
14
+
15
+ Citations:
16
+ [1] https://github.com/renardeinside/chatten/blob/main/packages/chatten_rag/chatten_rag/common.py
17
+
18
+ ## Understanding Type Variables in Python
19
+ Type variables serve as placeholders for specific types in generic programming, allowing for the creation of flexible and reusable code. They are typically denoted by single uppercase letters like T, U, or V, and can represent any non-primitive type, including class types, interface types, array types, or even other type variables[1][2]. In Python, type variables are defined using the TypeVar construct from the typing module, enabling the specification of generic types in type hints[3].
20
+
21
+ * Enhance code reusability by enabling functions or classes to work with multiple types
22
+ * Provide compile-time type checking, reducing runtime errors
23
+ * Eliminate the need for type casting, potentially improving performance
24
+ * Distinct from type parameters, which are formal declarations in class or method signatures[4]
25
+
26
+ Citations:
27
+ [1] https://stackoverflow.com/questions/42847287/what-is-type-variable-in-haskell-java
28
+ [2] https://docs.oracle.com/javase/tutorial/java/generics/types.html
29
+ [3] https://docs.python.org/3/library/typing.html#typing.TypeVar
30
+ [4] https://stackoverflow.com/questions/7075363/definition-of-type-variable-and-parameter
31
+
32
+ ## Example of Configuration Management in the Chatten Project
33
+ The chatten project demonstrates an efficient approach to configuration management using Pydantic's BaseSettings. In the `config.py` file, a `Config` class is defined that inherits from `BaseSettings`, allowing for easy configuration sharing across multiple workflows and applications[1]. This class includes various settings such as database configurations, model parameters, and API keys, all with type annotations for improved safety and clarity.
34
+
35
+ The configuration is then seamlessly integrated into task files like `loader.py` and `indexer.py`[1]. These tasks import the `Config` class and utilize its properties, demonstrating how easily the shared configuration can be referenced and used across different components of the project. This approach not only centralizes configuration management but also leverages Pydantic's built-in validation and environment variable integration, making it a flexible and maintainable solution for complex Python projects, particularly those deployed in cloud environments like Databricks.
36
+
37
+ Citations:
38
+ [1] https://gist.github.com/renardeinside
File without changes
@@ -0,0 +1,49 @@
1
+ import logging
2
+ from typing import TypeVar, Generic
3
+ from databricks.sdk import WorkspaceClient
4
+
5
+ from demo_uc_setup.config import Config
6
+
7
# Type variable for the task-specific configuration. It is bound to Config,
# so Task[T] subclasses can substitute any Config subclass while keeping
# type-safe access to its fields.
T = TypeVar("T", bound=Config)
9
+
10
class Task(Generic[T]):
    """
    A reusable Task base class that works both locally and in Databricks notebooks.
    When running locally, requires databricks_host and databricks_token.
    When running in a notebook, these parameters are optional.
    """

    def __init__(self, config_class: type[T]):
        # Build the typed configuration instance for this task.
        self.config: T = config_class()

        # Per-class logger; basicConfig makes INFO output visible by default.
        self.logger = logging.getLogger(self.__class__.__name__)
        logging.basicConfig(level=logging.INFO)

        # Explicit credentials imply local execution; otherwise rely on the
        # ambient authentication available inside a Databricks notebook.
        host = self.config.databricks_host
        token = self.config.databricks_token
        if host and token:
            self.workspace_client = WorkspaceClient(host=host, token=token)
        else:
            self.workspace_client = WorkspaceClient()

    @classmethod
    def entrypoint(cls, *args, **kwargs):
        """
        Creates an instance of the task and runs it. If you
        want a consistent run pattern, place it here.
        """
        cls(*args, **kwargs).run()

    def run(self):
        """
        The main entrypoint for the task's execution.
        Override this in subclasses to implement custom logic.
        """
        self.logger.info("Base Task run method. Override in subclasses.")
@@ -0,0 +1,23 @@
1
+ from pydantic_settings import BaseSettings
2
+ from typing import Optional
3
+
4
class Config(BaseSettings):
    """
    Configuration class using Pydantic BaseSettings.
    By default, each field can be overridden by environment
    variables matching the field name (in uppercase).
    For example, DATABRICKS_HOST, DATABRICKS_TOKEN, etc.
    """

    # Databricks connection settings - optional for notebook execution
    databricks_host: Optional[str] = None
    databricks_token: Optional[str] = None

    # Default names for Unity Catalog demo objects
    demo_catalog_name: str = "demo_catalog"
    demo_schemas: list[str] = ["demo_schema_1", "demo_schema_2"]  # List of schemas
    demo_volume_name: str = "demo_volume"  # This could also be a list if needed

    # Pydantic v2 / pydantic-settings 2.x style: the nested `class Config`
    # settings block is deprecated, so declare model_config instead
    # (a plain dict is accepted where SettingsConfigDict is expected).
    model_config = {
        "env_file": ".env",  # or any custom file, if desired
        "env_file_encoding": "utf-8",
    }
@@ -0,0 +1,82 @@
1
+ from typing import Type
2
+ from databricks.sdk.service import catalog
3
+ from demo_uc_setup.common import Task, T
4
+ from demo_uc_setup.config import Config
5
+
6
class UnityCatalogSetupTask(Task[Config]):
    """
    A task to ensure catalogs, schemas, and volumes exist
    in the Databricks workspace. Uses typed config for
    resource names, credentials, etc.
    """

    def __init__(self, config_class: Type[T] = Config):
        super().__init__(config_class)

    def run(self):
        """Idempotently create the configured catalog, schemas, and volumes."""
        self.logger.info("Starting Unity Catalog setup...")

        # 1) Ensure the catalog exists
        catalog_name = self.config.demo_catalog_name
        self._ensure_catalog(catalog_name)

        # 2) Ensure all schemas exist and create volumes within each schema
        for schema_name in self.config.demo_schemas:
            self._ensure_schema(catalog_name, schema_name)
            self._ensure_volume(catalog_name, schema_name, self.config.demo_volume_name)

        self.logger.info("Unity Catalog setup complete!")

    def _ensure_catalog(self, catalog_name: str) -> None:
        """Create the catalog if it does not already exist."""
        self.logger.info(f"Ensuring catalog '{catalog_name}'")
        try:
            self.workspace_client.catalogs.get(name=catalog_name)
            self.logger.info(f"Catalog '{catalog_name}' already exists.")
        except Exception:
            # NOTE(review): the broad except treats any failure (e.g. a
            # permission error) as "not found"; narrowing to the SDK's
            # NotFound error would be stricter -- confirm SDK version.
            try:
                self.logger.info(f"Catalog '{catalog_name}' not found; creating it.")
                self.workspace_client.catalogs.create(
                    name=catalog_name,
                    comment="Demo Catalog for Databricks demos"
                )
            except Exception as e:
                # Consistent with the schema/volume paths below: tolerate a
                # concurrent creation racing between get() and create().
                if "already exists" in str(e):
                    self.logger.info(f"Catalog '{catalog_name}' already exists (caught during creation).")
                else:
                    raise e

    def _ensure_schema(self, catalog_name: str, schema_name: str) -> None:
        """Create the schema inside the catalog if it does not already exist."""
        self.logger.info(f"Ensuring schema '{catalog_name}.{schema_name}'")
        try:
            self.workspace_client.schemas.get(
                name=schema_name,
                catalog_name=catalog_name
            )
            self.logger.info(f"Schema '{catalog_name}.{schema_name}' already exists.")
        except Exception:
            try:
                self.logger.info(f"Schema '{catalog_name}.{schema_name}' not found; creating it.")
                self.workspace_client.schemas.create(
                    name=schema_name,
                    catalog_name=catalog_name,
                    comment=f"Demo Schema {schema_name} for Databricks demos"
                )
            except Exception as e:
                if "already exists" in str(e):
                    self.logger.info(f"Schema '{catalog_name}.{schema_name}' already exists (caught during creation).")
                else:
                    raise e

    def _ensure_volume(self, catalog_name: str, schema_name: str, volume_name: str) -> None:
        """Create a managed volume inside the schema if it does not already exist."""
        self.logger.info(f"Ensuring volume '{catalog_name}.{schema_name}.{volume_name}'")
        try:
            self.workspace_client.volumes.get(
                name=volume_name,
                catalog_name=catalog_name,
                schema_name=schema_name
            )
            self.logger.info(f"Volume '{catalog_name}.{schema_name}.{volume_name}' already exists.")
        except Exception:
            try:
                self.logger.info(f"Volume '{catalog_name}.{schema_name}.{volume_name}' not found; creating it.")
                self.workspace_client.volumes.create(
                    name=volume_name,
                    catalog_name=catalog_name,
                    schema_name=schema_name,
                    volume_type=catalog.VolumeType.MANAGED,
                    comment=f"Demo Volume for schema {schema_name}"
                )
            except Exception as e:
                if "already exists" in str(e):
                    self.logger.info(f"Volume '{catalog_name}.{schema_name}.{volume_name}' already exists (caught during creation).")
                else:
                    raise e
@@ -0,0 +1,28 @@
1
+ from typing import Type
2
+ from databricks.sdk import WorkspaceClient
3
+ from demo_uc_setup.common import Task, T
4
+ from demo_uc_setup.config import Config
5
+
6
class UnityCatalogTeardownTask(Task[Config]):
    """
    A task to delete (teardown) the Unity Catalog resources in
    the configured Databricks workspace. Uses typed config for
    resource names, credentials, etc.
    """

    def __init__(self, config_class: Type[T] = Config):
        super().__init__(config_class)

    def run(self):
        """Force-delete the demo catalog together with everything nested in it."""
        self.logger.info("Starting teardown of Unity Catalog resources...")

        target = self.config.demo_catalog_name
        self.logger.info(f"Deleting catalog '{target}' and its dependencies (force=True).")

        try:
            # force=True drops the catalog's schemas and volumes as well.
            self.workspace_client.catalogs.delete(name=target, force=True)
        except Exception as e:
            # Best-effort teardown: report the failure but do not abort.
            self.logger.error(f"Failed to delete catalog '{target}'. Reason: {e}")
        else:
            self.logger.info(f"Catalog '{target}' (and its contents) successfully deleted.")

        self.logger.info("Unity Catalog teardown complete!")
@@ -0,0 +1,31 @@
1
+ """
2
+ Main entrypoint to run the Unity Catalog resource creation using
3
+ the Databricks Python SDK and the flexible Pydantic-based config.
4
+
5
+ Usage:
6
+ python main.py setup # Run setup
7
+ python main.py teardown # Run teardown
8
+ """
9
+
10
+ import sys
11
+ from demo_uc_setup.unity_catalog_setup import UnityCatalogSetupTask
12
+ from demo_uc_setup.unity_catalog_teardown import UnityCatalogTeardownTask
13
+
14
+ def print_usage():
15
+ print("Usage: python main.py [setup|teardown]")
16
+ print(" setup - Run Unity Catalog setup")
17
+ print(" teardown - Run Unity Catalog teardown")
18
+
19
+ if __name__ == "__main__":
20
+ if len(sys.argv) != 2 or sys.argv[1] not in ["setup", "teardown"]:
21
+ print_usage()
22
+ sys.exit(1)
23
+
24
+ if sys.argv[1] == "setup":
25
+ UnityCatalogSetupTask.entrypoint()
26
+ else:
27
+ UnityCatalogTeardownTask.entrypoint()
28
+
29
+ # Option B) Or instantiate directly:
30
+ # task = UnityCatalogSetupTask()
31
+ # task.run()
@@ -0,0 +1,21 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "demo_uc_setup"
7
+ version = "0.1.0"
8
+ description = "A reusable task-based framework for managing Unity Catalog in Databricks"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "databricks-connect>=16.1.1",
13
+ "databricks-sdk>=0.44.1",
14
+ "pydantic>=2.10.6",
15
+ "pydantic-settings>=2.8.0",
16
+ ]
17
+
18
+ [dependency-groups]
19
+ dev = [
20
+ "hatch>=1.14.0",
21
+ ]
@@ -0,0 +1,7 @@
1
+ """
2
+ Script to run the Unity Catalog setup process.
3
+ """
4
+ from demo_uc_setup.unity_catalog_setup import UnityCatalogSetupTask
5
+
6
+ if __name__ == "__main__":
7
+ UnityCatalogSetupTask.entrypoint()
@@ -0,0 +1,7 @@
1
+ """
2
+ Script to run the Unity Catalog teardown process.
3
+ """
4
+ from demo_uc_setup.unity_catalog_teardown import UnityCatalogTeardownTask
5
+
6
+ if __name__ == "__main__":
7
+ UnityCatalogTeardownTask.entrypoint()