PyPI - databricks-dbt-factory - Versions diffs - 0.0.1__tar.gz - Mend

databricks-dbt-factory 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

databricks_dbt_factory-0.0.1/.gitignore ADDED Viewed

@@ -0,0 +1,156 @@
+# macos
+.DS_Store
+*.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+*.out
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env.admin
+.venv
+.env.*
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+.env
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+# ruff
+.ruff_cache
+/scratch
+# dev files and scratches
+dev/cleanup.py
+.databricks
+.vscode
+.python-version
+.databricks-login.json
+.local-dev

databricks_dbt_factory-0.0.1/LICENSE.txt ADDED Viewed

@@ -0,0 +1,9 @@
+MIT License
+Copyright (c) 2024-present mwojtyczka <wojtyczka.marcin@gmail.com>
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

databricks_dbt_factory-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,113 @@
+Metadata-Version: 2.3
+Name: databricks-dbt-factory
+Version: 0.0.1
+Summary: Databricks dbt factory library for creating Databricks Job definition where individual models are run as separate tasks.
+Project-URL: Documentation, https://github.com/mwojtyczka/databricks-dbt-factory#readme
+Project-URL: Issues, https://github.com/mwojtyczka/databricks-dbt-factory/issues
+Project-URL: Source, https://github.com/mwojtyczka/databricks-dbt-factory
+Author-email: Marcin Wojtyczka <marcin.wojtyczka@databricks.com>
+License: MIT
+Keywords: Databricks,dbt
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Framework :: Pytest
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: System Administrators
+Classifier: Operating System :: MacOS
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Utilities
+Requires-Python: >=3.10
+Requires-Dist: pyyaml~=6.0.1
+Description-Content-Type: text/markdown
+Databricks dbt factory
+===
+Databricks dbt factory is a simple library to generate Databricks Job tasks based on dbt manifest.
+The tool can overwrite tasks in the existing Databricks job definition (in-place update, or creating new definition).
+[![PyPI - Version](https://img.shields.io/pypi/v/databricks-dbt-factory.svg)](https://pypi.org/project/databricks-dbt-factory)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/databricks-dbt-factory.svg)](https://pypi.org/project/databricks-dbt-factory)
+-----
+**Table of Contents**
+- [Installation](#installation)
+- [Usage](#usage)
+- [Contribution](#contribution)
+- [License](#license)
+# Motivation
+The current integration of dbt with Databricks Workflows treats the entire dbt project as a single execution unit (black box), limiting flexibility and debugging options.
+This project breaks down each dbt object (seed/snapshot/model/test) into a separate Workflow task, offering several key benefits:
+* Simplified Troubleshooting: Isolating tasks makes it easier to identify and resolve issues specific to a single model
+* Enhanced Logging and Notifications: Provides more detailed logs and precise error alerts, improving debugging efficiency
+* Better Retriability: Enables retrying only the failed model tasks, saving time and resources compared to rerunning the entire project
+* Seamless Testing: Allows running dbt data tests on tables immediately after a model completes, ensuring faster validation and feedback
+### Databricks Workflows run all dbt objects at once:
+![before](docs/before.png?)
+![dbt_task](docs/dbt_task.png?)
+### The tool generates workflows where dbt objects are run as individual Databricks Workflow tasks:
+![after](docs/after.png?)
+![workflow](docs/workflow.png?)
+# Installation
+```shell
+pip install databricks-dbt-factory
+```
+# Usage
+Update tasks in the existing Databricks job definition and write the results to `job_definition_new.yaml`:
+```shell
+databricks_dbt_factory  \
+  --dbt-manifest-path tests/test_data/manifest.json \
+  --input-job-spec-path tests/test_data/job_definition_template.yaml \
+  --target-job-spec-path job_definition_new.yaml \
+  --source GIT \
+  --target dev
+```
+**Arguments:**
+- `--new-job-name` (type: str, optional, default: None): Optional job name. If provided, the existing job name in the job spec is updated.
+- `--dbt-manifest-path` (type: str, required): Path to the manifest file.
+- `--input-job-spec-path` (type: str, required): Path to the input job spec file.
+- `--target-job-spec-path` (type: str, required): Path to the target job spec file.
+- `--target` (type: str, required): dbt target to use.
+- `--source` (type: str, optional, default: None): Optional project source. If not provided, WORKSPACE will be used.
+- `--warehouse_id` (type: str, optional, default: None): Optional SQL Warehouse to run dbt models on.
+- `--schema` (type: str, optional, default: None): Optional schema to write to.
+- `--catalog` (type: str, optional, default: None): Optional catalog to write to.
+- `--profiles-directory` (type: str, optional, default: None): Optional (relative) path to the profiles directory.
+- `--project-directory` (type: str, optional, default: None): Optional (relative) path to the project directory.
+- `--environment-key` (type: str, optional, default: Default): Optional key of an environment.
+- `--extra-dbt-command-options` (type: str, optional, default: ""): Optional additional dbt command options.
+- `--run-tests` (type: bool, optional, default: True): Whether to run data tests after the model. Enabled by default.
+- `--dry-run` (type: bool, optional, default: False): Print generated tasks without updating the job spec file. Disabled by default.
+You can also check all input arguments by running `databricks_dbt_factory --help`.
+Demo of the tool can be found [here](https://github.com/mwojtyczka/dbt-demo).
+# Contribution
+See contribution guidance [here](CONTRIBUTING.md).
+# License
+`databricks-dbt-factory` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.

databricks_dbt_factory-0.0.1/README.md ADDED Viewed

@@ -0,0 +1,84 @@
+Databricks dbt factory
+===
+Databricks dbt factory is a simple library to generate Databricks Job tasks based on dbt manifest.
+The tool can overwrite tasks in the existing Databricks job definition (in-place update, or creating new definition).
+[![PyPI - Version](https://img.shields.io/pypi/v/databricks-dbt-factory.svg)](https://pypi.org/project/databricks-dbt-factory)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/databricks-dbt-factory.svg)](https://pypi.org/project/databricks-dbt-factory)
+-----
+**Table of Contents**
+- [Installation](#installation)
+- [Usage](#usage)
+- [Contribution](#contribution)
+- [License](#license)
+# Motivation
+The current integration of dbt with Databricks Workflows treats the entire dbt project as a single execution unit (black box), limiting flexibility and debugging options.
+This project breaks down each dbt object (seed/snapshot/model/test) into a separate Workflow task, offering several key benefits:
+* Simplified Troubleshooting: Isolating tasks makes it easier to identify and resolve issues specific to a single model
+* Enhanced Logging and Notifications: Provides more detailed logs and precise error alerts, improving debugging efficiency
+* Better Retriability: Enables retrying only the failed model tasks, saving time and resources compared to rerunning the entire project
+* Seamless Testing: Allows running dbt data tests on tables immediately after a model completes, ensuring faster validation and feedback
+### Databricks Workflows run all dbt objects at once:
+![before](docs/before.png?)
+![dbt_task](docs/dbt_task.png?)
+### The tool generates workflows where dbt objects are run as individual Databricks Workflow tasks:
+![after](docs/after.png?)
+![workflow](docs/workflow.png?)
+# Installation
+```shell
+pip install databricks-dbt-factory
+```
+# Usage
+Update tasks in the existing Databricks job definition and write the results to `job_definition_new.yaml`:
+```shell
+databricks_dbt_factory  \
+  --dbt-manifest-path tests/test_data/manifest.json \
+  --input-job-spec-path tests/test_data/job_definition_template.yaml \
+  --target-job-spec-path job_definition_new.yaml \
+  --source GIT \
+  --target dev
+```
+**Arguments:**
+- `--new-job-name` (type: str, optional, default: None): Optional job name. If provided, the existing job name in the job spec is updated.
+- `--dbt-manifest-path` (type: str, required): Path to the manifest file.
+- `--input-job-spec-path` (type: str, required): Path to the input job spec file.
+- `--target-job-spec-path` (type: str, required): Path to the target job spec file.
+- `--target` (type: str, required): dbt target to use.
+- `--source` (type: str, optional, default: None): Optional project source. If not provided, WORKSPACE will be used.
+- `--warehouse_id` (type: str, optional, default: None): Optional SQL Warehouse to run dbt models on.
+- `--schema` (type: str, optional, default: None): Optional schema to write to.
+- `--catalog` (type: str, optional, default: None): Optional catalog to write to.
+- `--profiles-directory` (type: str, optional, default: None): Optional (relative) path to the profiles directory.
+- `--project-directory` (type: str, optional, default: None): Optional (relative) path to the project directory.
+- `--environment-key` (type: str, optional, default: Default): Optional key of an environment.
+- `--extra-dbt-command-options` (type: str, optional, default: ""): Optional additional dbt command options.
+- `--run-tests` (type: bool, optional, default: True): Whether to run data tests after the model. Enabled by default.
+- `--dry-run` (type: bool, optional, default: False): Print generated tasks without updating the job spec file. Disabled by default.
+You can also check all input arguments by running `databricks_dbt_factory --help`.
+Demo of the tool can be found [here](https://github.com/mwojtyczka/dbt-demo).
+# Contribution
+See contribution guidance [here](CONTRIBUTING.md).
+# License
+`databricks-dbt-factory` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.

databricks_dbt_factory-0.0.1/__init__.py ADDED Viewed

File without changes

databricks_dbt_factory-0.0.1/databricks_dbt_factory/DbtFactory.py ADDED Viewed

@@ -0,0 +1,84 @@
+from databricks_dbt_factory import TaskFactory
+from databricks_dbt_factory.SpecsHandler import SpecsHandler
+from databricks_dbt_factory.DbtTask import DbtTask
+class DbtFactory:
+    """A factory for generating Databricks job definitions from DBT manifests."""
+    def __init__(self, file_handler: SpecsHandler, task_factories: dict[str, TaskFactory]):
+        """
+        Initializes the DatabricksDbtFactory.
+        Args:
+            file_handler (SpecsHandler): An instance of FileHandler to handle file operations.
+            task_factories (dict[str, TaskFactory]): A dictionary mapping resource types to their respective TaskFactory.
+        """
+        self.file_handler = file_handler
+        self.task_factories = task_factories
+    def create_tasks_and_update_job_spec(
+        self,
+        dbt_manifest_path: str,
+        input_job_spec_path: str,
+        target_job_spec_path: str,
+        new_job_name: str | None = None,
+        dry_run: bool = False,
+    ):
+        """
+        Generates tasks for Databricks Job from a DBT manifest and updates the existing job definition file
+        either in place, or to a new file if target_job_spec_path is provided.
+        Args:
+            dbt_manifest_path (str): Path to the DBT manifest file.
+            input_job_spec_path (str): Path to the input job specification YAML file.
+            target_job_spec_path (str): Path to save the updated job specification file.
+            new_job_name (str, optional): The name of the job to update. Defaults to None.
+            dry_run (bool, optional): If True, the tasks will be printed to the console instead of writing to a file. Defaults to False.
+        """
+        manifest = self.file_handler.read_dbt_manifest(dbt_manifest_path)
+        tasks = self.create_tasks(manifest)
+        if dry_run:
+            print(tasks)
+        else:
+            self.file_handler.replace_tasks_in_job_spec(input_job_spec_path, tasks, target_job_spec_path, new_job_name)
+    def create_tasks(self, dbt_manifest: dict) -> list[dict]:
+        """
+        Generates tasks for Databricks Job from a DBT manifest.
+        Args:
+            dbt_manifest (dict): The DBT manifest content.
+        Returns:
+            list[dict]: A list of task dictionaries suitable for the job definition.
+        """
+        tasks = self._create_tasks(dbt_manifest)
+        return [task.to_dict() for task in tasks]
+    def _create_tasks(self, dbt_manifest: dict) -> list[DbtTask]:
+        """
+        Generates a list of Databricks job tasks based on the DBT manifest.
+        Args:
+            dbt_manifest (dict): The DBT manifest content.
+        Returns:
+            list[DbtTask]: A list of Task instances.
+        """
+        dbt_nodes = dbt_manifest.get('nodes', {})
+        tasks = []
+        for node_full_name, node_info in dbt_nodes.items():
+            resource_type = node_info['resource_type']
+            if resource_type not in self.task_factories:
+                continue
+            node_name = node_info['name']
+            task_key = node_full_name.replace('.', '_')  # make sure it can be used as a task key
+            factory = self.task_factories[resource_type]
+            task = factory.create_task(node_name, node_info, task_key)
+            tasks.append(task)
+        return tasks

databricks_dbt_factory-0.0.1/databricks_dbt_factory/DbtTask.py ADDED Viewed

@@ -0,0 +1,80 @@
+from dataclasses import dataclass
+from typing import Any
+@dataclass(frozen=True)
+class DbtTaskOptions:
+    environment_key: str = "Default"  # serverless env
+    """The key of an environment. It has to be unique within a job."""
+    catalog: str | None = None
+    """Optional name of the catalog to use. The value is the top level in the 3-level namespace of
+    Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a
+    warehouse_id is specified. Requires dbt-databricks >= 1.1.1."""
+    profiles_directory: str | None = None
+    """Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is
+    specified. If no warehouse_id is specified and this folder is unset, the root directory is used."""
+    project_directory: str | None = None
+    """Path to the project directory. Optional for Git sourced tasks, in which case if no value is
+    provided, the root of the Git repository is used."""
+    schema: str | None = None
+    """Optional schema to write to. This parameter is only used when a warehouse_id is also provided.
+    If not provided, the `default` schema is used."""
+    source: str | None = None
+    """Optional location type of the project directory. When set to `WORKSPACE`, the project will be
+    retrieved from the local Databricks workspace. When set to `GIT`, the project will be retrieved
+    from a Git repository defined in `git_source`. If the value is empty, the task will use `GIT` if
+    `git_source` is defined and `WORKSPACE` otherwise.
+    * `WORKSPACE`: Project is located in Databricks workspace. * `GIT`: Project is located in cloud
+    Git provider."""
+    warehouse_id: str | None = None
+    """ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the
+    profile and connection details to dbt. It can be overridden on a per-command basis by using the
+    `--profiles-dir` command line argument."""
+@dataclass(frozen=True)
+class DbtTask:
+    """Represents a dbt task in the Databricks job definition."""
+    task_key: str
+    commands: list[str]
+    options: DbtTaskOptions
+    depends_on: list[str] | None = None
+    def to_dict(self) -> dict:
+        """Converts the Task to a dictionary suitable for the job definition."""
+        spec: dict[str, Any] = {
+            'task_key': self.task_key,
+            'dbt_task': {
+                'commands': self.commands,
+            },
+            'environment_key': self.options.environment_key,
+            'depends_on': [{'task_key': dep} for dep in (self.depends_on or [])],
+        }
+        if self.options.source:
+            spec['dbt_task']['source'] = self.options.source
+        if self.options.project_directory:
+            spec['dbt_task']['project_directory'] = self.options.project_directory
+        if self.options.schema:
+            spec['dbt_task']['schema'] = self.options.schema
+        if self.options.warehouse_id:  # not required if using "None (Manual) / Serverless"
+            spec['dbt_task']['warehouse_id'] = self.options.warehouse_id
+        if self.options.catalog:  # catalog can only be specified if warehouse_id is specified
+            spec['dbt_task']['catalog'] = self.options.catalog
+        if self.options.profiles_directory:  # only if no warehouse_id is specified
+            spec['dbt_task']['profiles_directory'] = self.options.profiles_directory
+        return spec

databricks_dbt_factory-0.0.1/databricks_dbt_factory/SpecsHandler.py ADDED Viewed

@@ -0,0 +1,68 @@
+import json
+import yaml
+class SpecsHandler:
+    """Handles reading and writing files for dbt manifests and databricks job definitions."""
+    @staticmethod
+    def read_dbt_manifest(path: str) -> dict:
+        """
+        Reads a JSON manifest file and returns its content as a dictionary.
+        Args:
+            path (str): Path to the manifest file.
+        Returns:
+            dict: Parsed content of the manifest file.
+        Raises:
+            FileNotFoundError: If the file does not exist.
+            ValueError: If the file is not a valid manifest file.
+        """
+        try:
+            with open(path, 'r', encoding="utf-8") as file:
+                return json.load(file)
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"Manifest file not found: {path}. Details: {e}") from e
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error parsing JSON from manifest file: {path}. Details: {e}") from e
+    @staticmethod
+    def replace_tasks_in_job_spec(
+        input_job_spec_path: str,
+        new_tasks: list[dict],
+        target_job_spec_path: str,
+        new_job_name: str | None = None,
+    ) -> None:
+        """Replace the tasks field in a Databricks job definition YAML file. The first job is only updated.
+        Args:
+            input_job_spec_path (str): Path to the job definition YAML file.
+            new_tasks (dict): New tasks to replace the existing tasks in the job definition file.
+            target_job_spec_path (str): Path to save the updated job definition file.
+            new_job_name (str, optional): The name of the job to update. Defaults to None.
+        Raises:
+        KeyError: If no jobs are found in the provided YAML file.
+        """
+        with open(input_job_spec_path, 'r', encoding="utf-8") as file:
+            job_definition = yaml.safe_load(file)
+        jobs = job_definition.get('resources', {}).get('jobs', {})
+        if jobs is None:
+            raise KeyError("No jobs found in the provided YAML file.")
+        # replaces the first job only!
+        first_job_key = next(iter(jobs))
+        if new_job_name:
+            jobs[new_job_name] = jobs.pop(first_job_key)
+            first_job_key = new_job_name
+        first_job = jobs[first_job_key]
+        if new_job_name:
+            first_job['name'] = new_job_name
+        first_job['tasks'] = new_tasks  # Replace tasks field
+        with open(target_job_spec_path, 'w', encoding="utf-8") as file:
+            yaml.dump(job_definition, file, sort_keys=False, width=1000)