pydataframer-databricks 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydataframer_databricks-0.1.0/.github/workflows/python-publish.yml +70 -0
- pydataframer_databricks-0.1.0/.gitignore +11 -0
- pydataframer_databricks-0.1.0/PKG-INFO +43 -0
- pydataframer_databricks-0.1.0/README.md +28 -0
- pydataframer_databricks-0.1.0/pydataframer_databricks/__init__.py +11 -0
- pydataframer_databricks-0.1.0/pydataframer_databricks/connectors.py +251 -0
- pydataframer_databricks-0.1.0/pyproject.toml +25 -0
- pydataframer_databricks-0.1.0/tests/__init__.py +0 -0
- pydataframer_databricks-0.1.0/tests/test_connectors.py +231 -0
pydataframer_databricks-0.1.0/.github/workflows/python-publish.yml
@@ -0,0 +1,70 @@
+# This workflow will upload a Python Package to PyPI when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      - name: Build release distributions
+        run: |
+          # NOTE: put your own distribution build steps here.
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          uv build
+
+      - name: Upload distributions
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+  pypi-publish:
+    runs-on: ubuntu-latest
+    needs:
+      - release-build
+    permissions:
+      # IMPORTANT: this permission is mandatory for trusted publishing
+      id-token: write
+
+    # Dedicated environments with protections for publishing are strongly recommended.
+    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
+    environment:
+      name: pypi
+      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
+      url: https://pypi.org/p/pydataframer-databricks
+      #
+      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
+      # ALTERNATIVE: exactly, uncomment the following line instead:
+      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
+
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: dist/
pydataframer_databricks-0.1.0/PKG-INFO
@@ -0,0 +1,43 @@
+Metadata-Version: 2.4
+Name: pydataframer-databricks
+Version: 0.1.0
+Summary: Databricks connector for Dataframer
+Author-email: Dataframer <info@dataframer.ai>
+License: MIT
+Requires-Python: >=3.9
+Requires-Dist: databricks-sdk>=0.81.0
+Requires-Dist: databricks-sql-connector>=4.2.4
+Requires-Dist: pandas>=2.0.0
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
+Requires-Dist: pytest>=7.4.0; extra == 'dev'
+Description-Content-Type: text/markdown
+
+# pydataframer-databricks
+
+Databricks connector package for Dataframer, providing seamless integration with Databricks SQL and data operations.
+
+## Installation
+
+```bash
+pip install pydataframer-databricks
+```
+
+## Building
+
+Requires [uv](https://docs.astral.sh/uv/) installed in your environment.
+
+```bash
+uv build
+```
+
+## Development
+
+```bash
+# Install with dev dependencies
+uv pip install -e ".[dev]"
+
+# Run tests
+pytest
+```
+
pydataframer_databricks-0.1.0/README.md
@@ -0,0 +1,28 @@
+# pydataframer-databricks
+
+Databricks connector package for Dataframer, providing seamless integration with Databricks SQL and data operations.
+
+## Installation
+
+```bash
+pip install pydataframer-databricks
+```
+
+## Building
+
+Requires [uv](https://docs.astral.sh/uv/) installed in your environment.
+
+```bash
+uv build
+```
+
+## Development
+
+```bash
+# Install with dev dependencies
+uv pip install -e ".[dev]"
+
+# Run tests
+pytest
+```
+
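For orientation, here is a minimal usage sketch based on the docstring examples in `connectors.py` below. It assumes it runs inside a Databricks notebook where `dbutils` is available and a `dataframer` secret scope holds the `DATABRICKS_*` credentials that `get_connection()` reads; the sample table name is the one used in the package's own docstrings and may not exist in your workspace.

```python
# Minimal sketch, not part of the packaged files: fetch a small sample into pandas.
from pydataframer_databricks import DatabricksConnector

connector = DatabricksConnector(dbutils)  # dbutils comes from the notebook context

df = connector.fetch_sample_data(
    num_items_to_select=25,
    table_name="samples.bakehouse.media_customer_reviews",  # docstring example table
)
print(df.head())
```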
pydataframer_databricks-0.1.0/pydataframer_databricks/connectors.py
@@ -0,0 +1,251 @@
+from enum import Enum
+
+
+class DatasetType(Enum):
+    """Dataset type enumeration matching Dataframer backend."""
+    SINGLE_FILE = "SINGLE_FILE"
+    MULTI_FILE = "MULTI_FILE"
+    MULTI_FOLDER = "MULTI_FOLDER"
+
+
+class FileType(Enum):
+    """File type enumeration matching Dataframer backend."""
+    MD = "md"
+    TXT = "txt"
+    CSV = "csv"
+    PDF = "pdf"
+    JSON = "json"
+    JSONL = "jsonl"
+
+
+class DatabricksConnector:
+    """
+    Databricks connector for Dataframer workflows.
+
+    This class provides methods to interact with Databricks SQL, fetch sample data,
+    and load generated data into Databricks tables.
+
+    Parameters
+    ----------
+    dbutils : DBUtils
+        The dbutils object from your Databricks notebook context.
+        This is automatically available in Databricks notebooks.
+
+    Examples
+    --------
+    >>> databricks_connector = DatabricksConnector(dbutils)
+    >>> df = databricks_connector.fetch_sample_data(
+    ...     num_items_to_select=25,
+    ...     table_name="samples.bakehouse.media_customer_reviews"
+    ... )
+    >>> df.head()
+    """
+
+    def __init__(self, dbutils):
+        """
+        Initialize the Databricks connector.
+
+        Parameters
+        ----------
+        dbutils : DBUtils
+            The dbutils object from your Databricks notebook context.
+        """
+        self.dbutils = dbutils
+
+    def get_connection(self):
+        """
+        Return an authenticated Databricks SQL connection.
+
+        Returns
+        -------
+        Connection
+            A Databricks SQL connection object.
+        """
+        from databricks import sql
+        from databricks.sdk.core import Config, oauth_service_principal
+
+        server_hostname = self.dbutils.secrets.get("dataframer", "DATABRICKS_SERVER_HOSTNAME")
+        http_path = self.dbutils.secrets.get("dataframer", "DATABRICKS_HTTP_PATH")
+
+        def credential_provider():
+            config = Config(
+                host=f"https://{server_hostname}",
+                client_id=self.dbutils.secrets.get("dataframer", "DATABRICKS_CLIENT_ID"),
+                client_secret=self.dbutils.secrets.get("dataframer", "DATABRICKS_CLIENT_SECRET"),
+            )
+            return oauth_service_principal(config)
+
+        return sql.connect(
+            server_hostname=server_hostname,
+            http_path=http_path,
+            credentials_provider=credential_provider,
+            user_agent_entry="dataframer_user_agent",
+        )
+
+    def fetch_sample_data(self, num_items_to_select, table_name):
+        """
+        Fetch sample data from a Databricks table and return it as a Pandas DataFrame.
+
+        Parameters
+        ----------
+        num_items_to_select : int
+            Number of rows to fetch from the table.
+        table_name : str
+            Fully qualified table name (e.g., "catalog.schema.table").
+
+        Returns
+        -------
+        pd.DataFrame
+            A Pandas DataFrame containing the sample data.
+
+        Examples
+        --------
+        >>> databricks_connector = DatabricksConnector(dbutils)
+        >>> df = databricks_connector.fetch_sample_data(
+        ...     num_items_to_select=25,
+        ...     table_name="samples.bakehouse.media_customer_reviews"
+        ... )
+        >>> df.head()
+        """
+        import pandas as pd
+
+        query = f"""
+            SELECT *
+            FROM {table_name}
+            LIMIT {num_items_to_select}
+        """
+
+        try:
+            with self.get_connection() as connection:
+                with connection.cursor() as cursor:
+                    cursor.execute(query)
+                    rows = cursor.fetchall()
+                    columns = [desc[0] for desc in cursor.description]
+        except Exception as e:
+            error_msg = f"Failed to fetch data from table `{table_name}`"
+            print(f"{error_msg}: {str(e)}")
+            print("Verify table exists, is accessible, and you have SELECT permissions")
+            raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+        return pd.DataFrame(rows, columns=columns)
+
+    def load_generated_data(self, table_name, downloaded_zip, dataset_type, file_type):
+        """
+        Load generated samples from a ZIP file into a Databricks table using Databricks SQL.
+
+        Parameters
+        ----------
+        table_name : str
+            Target table name (e.g., "catalog.schema.table")
+        downloaded_zip : file-like
+            ZIP file object containing the generated data file
+        dataset_type : DatasetType
+            Type of dataset structure (DatasetType.SINGLE_FILE, DatasetType.MULTI_FILE, or DatasetType.MULTI_FOLDER)
+        file_type : FileType
+            Type of file in the ZIP (FileType.CSV, FileType.JSON, FileType.JSONL, etc.)
+
+        Examples
+        --------
+        >>> databricks_connector = DatabricksConnector(dbutils)
+        >>> with open("samples.zip", "rb") as f:
+        ...     databricks_connector.load_generated_data(
+        ...         table_name="my_catalog.my_schema.my_table",
+        ...         downloaded_zip=f,
+        ...         dataset_type=DatasetType.SINGLE_FILE,
+        ...         file_type=FileType.CSV
+        ...     )
+        """
+        import zipfile
+        import pandas as pd
+        from io import BytesIO
+
+        zip_buffer = BytesIO(downloaded_zip.read())
+
+        if dataset_type == DatasetType.SINGLE_FILE:
+            try:
+                with zipfile.ZipFile(zip_buffer) as z:
+                    file_list = z.namelist()
+
+                    generated_data_files = [f for f in file_list if f.lower().endswith(f'.{file_type.value}')]
+
+                    if len(generated_data_files) != 1:
+                        error_msg = f"Expected exactly one .{file_type.value} file in ZIP"
+                        print(f"{error_msg}. Available files: {file_list}")
+                        raise ValueError(error_msg)
+
+                    data_filename = generated_data_files[0]
+                    data_bytes = z.read(data_filename)
+                    print(f"Found {file_type.value} file: {data_filename}")
+
+            except zipfile.BadZipFile as e:
+                error_msg = "Invalid or corrupted ZIP file"
+                print(f"{error_msg}: {str(e)}")
+                raise ValueError(f"{error_msg}: {str(e)}") from e
+            except ValueError:
+                raise
+            except Exception as e:
+                error_msg = "Failed to extract file from ZIP"
+                print(f"{error_msg}: {str(e)}")
+                raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+            if file_type == FileType.CSV:
+                pandas_df = pd.read_csv(BytesIO(data_bytes))
+            elif file_type == FileType.JSON:
+                # TODO: Implement JSON file handling
+                pass
+            elif file_type == FileType.JSONL:
+                # TODO: Implement JSONL file handling
+                pass
+            else:
+                raise ValueError(f"Unsupported file_type: {file_type}. Supported: CSV, JSON, JSONL for SINGLE_FILE datasets")
+
+            with self.get_connection() as connection:
+                cursor = connection.cursor()
+
+                columns_sql = ", ".join(
+                    f"`{col}` STRING" for col in pandas_df.columns
+                )
+
+                try:
+                    cursor.execute(f"""
+                        CREATE OR REPLACE TABLE {table_name} (
+                            {columns_sql}
+                        )
+                    """)
+                except Exception as e:
+                    error_msg = f"Failed to create table `{table_name}`"
+                    print(f"{error_msg}: {str(e)}")
+                    print("Verify table name format (catalog.schema.table), permissions, and warehouse is running")
+                    cursor.close()
+                    raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+                insert_sql = f"""
+                    INSERT INTO {table_name}
+                    VALUES ({", ".join(["?"] * len(pandas_df.columns))})
+                """
+
+                try:
+                    cursor.executemany(
+                        insert_sql,
+                        pandas_df.values.tolist()
+                    )
+                except Exception as e:
+                    error_msg = f"Failed to insert data into table `{table_name}`"
+                    print(f"{error_msg}: {str(e)} | Rows attempted: {len(pandas_df)}")
+                    cursor.close()
+                    raise RuntimeError(f"{error_msg}: {str(e)}") from e
+
+                cursor.close()
+
+            print(f"✅ Table `{table_name}` saved successfully using Databricks SQL")
+
+        elif dataset_type == DatasetType.MULTI_FILE:
+            # TODO: Implement MULTI_FILE handling
+            pass
+
+        elif dataset_type == DatasetType.MULTI_FOLDER:
+            # TODO: Implement MULTI_FOLDER handling
+            pass
+
+        else:
+            raise ValueError(f"Invalid dataset_type: {dataset_type}. Expected DatasetType enum")
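To round out the docstrings above, the sketch below shows the write path: loading generated samples from a ZIP back into a table with `load_generated_data`. The ZIP path, catalog, schema, and table names are placeholders, and only the SINGLE_FILE/CSV path is implemented in this release; JSON, JSONL, MULTI_FILE, and MULTI_FOLDER remain TODO stubs in the code above.

```python
# Sketch only, not part of the packaged files: write a generated CSV (inside a ZIP)
# to a table via DatabricksConnector.load_generated_data.
from pydataframer_databricks import DatabricksConnector, DatasetType, FileType

connector = DatabricksConnector(dbutils)  # dbutils comes from the notebook context

with open("samples.zip", "rb") as downloaded_zip:  # placeholder local ZIP
    connector.load_generated_data(
        table_name="my_catalog.my_schema.my_table",  # placeholder target table
        downloaded_zip=downloaded_zip,
        dataset_type=DatasetType.SINGLE_FILE,  # only this path is implemented in 0.1.0
        file_type=FileType.CSV,                # JSON/JSONL handling is still a TODO
    )
```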
pydataframer_databricks-0.1.0/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "pydataframer-databricks"
+version = "0.1.0"
+description = "Databricks connector for Dataframer"
+readme = "README.md"
+requires-python = ">=3.9"
+license = { text = "MIT" }
+authors = [
+    { name = "Dataframer", email = "info@dataframer.ai" }
+]
+dependencies = [
+    "pandas>=2.0.0",
+    "databricks-sdk>=0.81.0",
+    "databricks-sql-connector>=4.2.4",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.4.0",
+    "pytest-cov>=4.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
pydataframer_databricks-0.1.0/tests/__init__.py
File without changes
pydataframer_databricks-0.1.0/tests/test_connectors.py
@@ -0,0 +1,231 @@
+import pytest
+from unittest.mock import Mock, MagicMock, patch
+import pandas as pd
+from io import BytesIO
+import zipfile
+from pydataframer_databricks import FileType, DatasetType
+from pydataframer_databricks import FileType
+
+
+class TestDatabricksConnector:
+    """Test suite for DatabricksConnector class"""
+
+    @pytest.fixture
+    def mock_dbutils(self):
+        """Create a mock dbutils object"""
+        dbutils = Mock()
+        dbutils.secrets.get = Mock(side_effect=lambda scope, key: {
+            ("dataframer", "DATABRICKS_SERVER_HOSTNAME"): "test.databricks.com",
+            ("dataframer", "DATABRICKS_HTTP_PATH"): "/sql/1.0/warehouses/abc123",
+            ("dataframer", "DATABRICKS_CLIENT_ID"): "test-client-id",
+            ("dataframer", "DATABRICKS_CLIENT_SECRET"): "test-client-secret",
+        }.get((scope, key), "default-value"))
+        return dbutils
+
+    @pytest.fixture
+    def connector(self, mock_dbutils):
+        """Create a DatabricksConnector instance with mocked dbutils"""
+        from pydataframer_databricks import DatabricksConnector
+        return DatabricksConnector(mock_dbutils)
+
+    def test_init(self, mock_dbutils):
+        """Test connector initialization"""
+        from pydataframer_databricks import DatabricksConnector
+        connector = DatabricksConnector(mock_dbutils)
+        assert connector.dbutils == mock_dbutils
+
+    def test_get_connection(self, connector, mock_dbutils):
+        """Test get_connection establishes connection with correct parameters"""
+        with patch('databricks.sql.connect') as mock_sql_connect:
+            mock_connection = Mock()
+            mock_sql_connect.return_value = mock_connection
+
+            with patch('databricks.sdk.core.oauth_service_principal') as mock_oauth:
+                result = connector.get_connection()
+
+                mock_dbutils.secrets.get.assert_called()
+
+                mock_sql_connect.assert_called_once()
+                call_kwargs = mock_sql_connect.call_args.kwargs
+                assert call_kwargs['server_hostname'] == "test.databricks.com"
+                assert call_kwargs['http_path'] == "/sql/1.0/warehouses/abc123"
+                assert call_kwargs['user_agent_entry'] == "dataframer_user_agent"
+
+                assert result == mock_connection
+
+    @patch('pydataframer_databricks.connectors.DatabricksConnector.get_connection')
+    def test_fetch_sample_data_success(self, mock_get_connection, connector):
+        """Test fetch_sample_data successfully fetches and returns DataFrame"""
+        mock_cursor = Mock()
+        mock_cursor.description = [("id",), ("name",), ("value",)]
+        mock_cursor.fetchall.return_value = [
+            (1, "test1", 100),
+            (2, "test2", 200),
+        ]
+
+        mock_connection = MagicMock()
+        mock_connection.__enter__.return_value = mock_connection
+        mock_connection.cursor.return_value.__enter__.return_value = mock_cursor
+
+        mock_get_connection.return_value = mock_connection
+
+        result = connector.fetch_sample_data(
+            num_items_to_select=25,
+            table_name="test_catalog.test_schema.test_table"
+        )
+
+        assert isinstance(result, pd.DataFrame)
+        assert len(result) == 2
+        assert list(result.columns) == ["id", "name", "value"]
+        mock_cursor.execute.assert_called_once()
+
+    @patch('pydataframer_databricks.connectors.DatabricksConnector.get_connection')
+    def test_fetch_sample_data_query_failure(self, mock_get_connection, connector):
+        """Test fetch_sample_data handles query execution errors"""
+        mock_cursor = Mock()
+        mock_cursor.execute.side_effect = Exception("Table not found")
+
+        mock_connection = MagicMock()
+        mock_connection.__enter__.return_value = mock_connection
+        mock_connection.cursor.return_value.__enter__.return_value = mock_cursor
+
+        mock_get_connection.return_value = mock_connection
+
+        with pytest.raises(RuntimeError) as exc_info:
+            connector.fetch_sample_data(
+                num_items_to_select=10,
+                table_name="nonexistent.table"
+            )
+
+        assert "Failed to fetch data from table" in str(exc_info.value)
+
+    def test_load_generated_data_single_file_csv_success(self, connector):
+        """Test load_generated_data with SINGLE_FILE CSV dataset"""
+        csv_content = b"id,name,value\n1,test1,100\n2,test2,200\n"
+        zip_buffer = BytesIO()
+        with zipfile.ZipFile(zip_buffer, 'w') as z:
+            z.writestr("generated_samples.csv", csv_content)
+        zip_buffer.seek(0)
+
+        mock_cursor = Mock()
+        mock_connection = MagicMock()
+        mock_connection.__enter__.return_value = mock_connection
+        mock_connection.cursor.return_value = mock_cursor
+
+        with patch.object(connector, 'get_connection', return_value=mock_connection):
+            connector.load_generated_data(
+                table_name="test_catalog.test_schema.test_table",
+                downloaded_zip=zip_buffer,
+                dataset_type=DatasetType.SINGLE_FILE,
+                file_type=FileType.CSV
+            )
+
+        assert mock_cursor.execute.call_count == 1
+        create_table_call = mock_cursor.execute.call_args[0][0]
+        assert "CREATE OR REPLACE TABLE" in create_table_call
+        assert "test_catalog.test_schema.test_table" in create_table_call
+
+        assert mock_cursor.executemany.call_count == 1
+        assert mock_cursor.close.call_count == 1
+
+    def test_load_generated_data_invalid_zip(self, connector):
+        """Test load_generated_data handles invalid ZIP files"""
+        invalid_zip = BytesIO(b"not a zip file")
+
+        with pytest.raises(ValueError) as exc_info:
+            connector.load_generated_data(
+                table_name="test_table",
+                downloaded_zip=invalid_zip,
+                dataset_type=DatasetType.SINGLE_FILE,
+                file_type=FileType.CSV
+            )
+
+        assert "Invalid or corrupted ZIP file" in str(exc_info.value)
+
+    def test_load_generated_data_no_csv_file(self, connector):
+        """Test load_generated_data when ZIP has no CSV file"""
+        zip_buffer = BytesIO()
+        with zipfile.ZipFile(zip_buffer, 'w') as z:
+            z.writestr("data.txt", b"some text")
+        zip_buffer.seek(0)
+
+        with pytest.raises(ValueError) as exc_info:
+            connector.load_generated_data(
+                table_name="test_table",
+                downloaded_zip=zip_buffer,
+                dataset_type=DatasetType.SINGLE_FILE,
+                file_type=FileType.CSV
+            )
+
+        assert "Expected exactly one .csv file in ZIP" in str(exc_info.value)
+
+    def test_load_generated_data_multiple_csv_files(self, connector):
+        """Test load_generated_data when ZIP has multiple CSV files"""
+        zip_buffer = BytesIO()
+        with zipfile.ZipFile(zip_buffer, 'w') as z:
+            z.writestr("data1.csv", b"id,name\n1,test1")
+            z.writestr("data2.csv", b"id,name\n2,test2")
+        zip_buffer.seek(0)
+
+        with pytest.raises(ValueError) as exc_info:
+            connector.load_generated_data(
+                table_name="test_table",
+                downloaded_zip=zip_buffer,
+                dataset_type=DatasetType.SINGLE_FILE,
+                file_type=FileType.CSV
+            )
+
+        assert "Expected exactly one .csv file in ZIP" in str(exc_info.value)
+
+    def test_load_generated_data_create_table_failure(self, connector):
+        """Test load_generated_data handles CREATE TABLE errors"""
+        csv_content = b"id,name\n1,test1"
+        zip_buffer = BytesIO()
+        with zipfile.ZipFile(zip_buffer, 'w') as z:
+            z.writestr("data.csv", csv_content)
+        zip_buffer.seek(0)
+
+        mock_cursor = Mock()
+        mock_cursor.execute.side_effect = Exception("Permission denied")
+        mock_connection = MagicMock()
+        mock_connection.__enter__.return_value = mock_connection
+        mock_connection.cursor.return_value = mock_cursor
+
+        with patch.object(connector, 'get_connection', return_value=mock_connection):
+            with pytest.raises(RuntimeError) as exc_info:
+                connector.load_generated_data(
+                    table_name="test_table",
+                    downloaded_zip=zip_buffer,
+                    dataset_type=DatasetType.SINGLE_FILE,
+                    file_type=FileType.CSV
+                )
+
+        assert "Failed to create table" in str(exc_info.value)
+        assert mock_cursor.close.call_count == 1
+
+    def test_load_generated_data_insert_failure(self, connector):
+        """Test load_generated_data handles INSERT errors"""
+        csv_content = b"id,name\n1,test1"
+        zip_buffer = BytesIO()
+        with zipfile.ZipFile(zip_buffer, 'w') as z:
+            z.writestr("data.csv", csv_content)
+        zip_buffer.seek(0)
+
+        mock_cursor = Mock()
+        mock_cursor.execute.return_value = None
+        mock_cursor.executemany.side_effect = Exception("Constraint violation")
+        mock_connection = MagicMock()
+        mock_connection.__enter__.return_value = mock_connection
+        mock_connection.cursor.return_value = mock_cursor
+
+        with patch.object(connector, 'get_connection', return_value=mock_connection):
+            with pytest.raises(RuntimeError) as exc_info:
+                connector.load_generated_data(
+                    table_name="test_table",
+                    downloaded_zip=zip_buffer,
+                    dataset_type=DatasetType.SINGLE_FILE,
+                    file_type=FileType.CSV
+                )
+
+        assert "Failed to insert data" in str(exc_info.value)
+        assert mock_cursor.close.call_count == 1