fabrictools 0.1.0__tar.gz

This diff shows the contents of publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -0,0 +1,197 @@
Metadata-Version: 2.4
Name: fabrictools
Version: 0.1.0
Summary: User-friendly PySpark helpers for Microsoft Fabric Lakehouses and Warehouses
Author-email: Willy Kinfoussia <willy.kinfoussia@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/willykinfoussia/FabricPackage
Project-URL: Repository, https://github.com/willykinfoussia/FabricPackage
Project-URL: Issues, https://github.com/willykinfoussia/FabricPackage/issues
Keywords: microsoft-fabric,pyspark,delta,lakehouse,warehouse,azure
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Database
Classifier: Topic :: Scientific/Engineering
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Provides-Extra: spark
Requires-Dist: pyspark>=3.3; extra == "spark"
Requires-Dist: delta-spark>=2.4; extra == "spark"
Provides-Extra: dev
Requires-Dist: pyspark>=3.3; extra == "dev"
Requires-Dist: delta-spark>=2.4; extra == "dev"
Requires-Dist: pytest>=7.4; extra == "dev"
Requires-Dist: pytest-mock>=3.12; extra == "dev"
Requires-Dist: build>=1.0; extra == "dev"
Requires-Dist: twine>=5.0; extra == "dev"

# fabrictools

> User-friendly PySpark helpers for **Microsoft Fabric** — read, write, and merge Lakehouses and Warehouses with a single function call.

---

## Features

- **Auto-resolved paths** — pass a Lakehouse or Warehouse *name*, no ABFS URL configuration required
- **Auto-detected SparkSession** — uses `SparkSession.builder.getOrCreate()`, works seamlessly inside Fabric notebooks
- **Auto-detected format** on read — tries Delta → Parquet → CSV automatically
- **Delta merge (upsert)** — one-liner upsert into any Lakehouse Delta table
- **Built-in logging** — every operation logs its resolved path, detected format, and row/column count

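
The Delta → Parquet → CSV fallback amounts to trying readers in order until one succeeds. A minimal, Spark-free sketch of that control flow (`read_with_fallback` is an illustrative name, not part of the package API; inside `read_lakehouse` the readers would presumably be `spark.read` calls):

```python
def read_with_fallback(path, readers):
    """Try each (format_name, read_fn) pair in order; return the first success."""
    errors = []
    for fmt, read_fn in readers:
        try:
            return fmt, read_fn(path)
        except Exception as exc:  # this reader could not handle the dataset
            errors.append(f"{fmt}: {exc}")
    raise ValueError(f"No supported format found at '{path}': {errors}")
```

The readers are plain callables here so the ordering logic stays visible; the first format that parses wins, and only a total failure raises.
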
---

## Requirements

- Microsoft Fabric Spark runtime (provides `notebookutils`, `pyspark`, and `delta-spark`)
- Python >= 3.9

> **Local development:** install the `spark` extras to get PySpark and delta-spark.
> `notebookutils` is only available inside Fabric — functions that resolve paths will raise a clear `ValueError` outside Fabric.

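
Outside Fabric, that `ValueError` comes from catching the failed import. The guard can be reproduced in a few lines (`require_notebookutils` is an illustrative name, not part of the API):

```python
def require_notebookutils():
    """Return the notebookutils module, or raise ValueError outside Fabric."""
    try:
        import notebookutils  # injected by the Fabric notebook runtime
    except ImportError as exc:
        raise ValueError(
            f"notebookutils is not available - are you running inside Microsoft Fabric? ({exc})"
        ) from exc
    return notebookutils
```

Raising `ValueError` instead of letting the `ImportError` escape gives callers one consistent exception type to handle in local tests.
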
---

## Installation

```bash
# Inside a Fabric notebook or pipeline
pip install fabrictools

# Local development (includes PySpark + delta-spark)
pip install "fabrictools[spark]"
```

---

## Quick start

```python
import fabrictools as ft
```

### Read a Lakehouse dataset

```python
# Auto-detects Delta → Parquet → CSV
df = ft.read_lakehouse("BronzeLakehouse", "sales/2024")
```

### Write to a Lakehouse

```python
ft.write_lakehouse(
    df,
    lakehouse_name="SilverLakehouse",
    relative_path="sales_clean",
    mode="overwrite",
    partition_by=["year", "month"],  # optional
)
```

### Merge (upsert) into a Delta table

```python
ft.merge_lakehouse(
    source_df=new_df,
    lakehouse_name="SilverLakehouse",
    relative_path="sales_clean",
    merge_condition="src.id = tgt.id",
    # update_set and insert_set are optional:
    # omit them to update/insert all columns automatically
)
```

With explicit column mappings:

```python
ft.merge_lakehouse(
    source_df=new_df,
    lakehouse_name="SilverLakehouse",
    relative_path="sales_clean",
    merge_condition="src.id = tgt.id",
    update_set={"amount": "src.amount", "updated_at": "src.updated_at"},
    insert_set={"id": "src.id", "amount": "src.amount", "updated_at": "src.updated_at"},
)
```

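
When `update_set` and `insert_set` are omitted, the natural default is to map every column one-to-one from the source alias. A sketch of how such defaults could be derived (assumed behavior for illustration, not taken from the package source):

```python
def default_merge_mappings(columns):
    """Map each target column to the same-named source column under the 'src' alias."""
    mapping = {col: f"src.{col}" for col in columns}
    # update_set and insert_set get the same shape; copies keep them independent
    return dict(mapping), dict(mapping)

update_set, insert_set = default_merge_mappings(["id", "amount", "updated_at"])
# update_set == {"id": "src.id", "amount": "src.amount", "updated_at": "src.updated_at"}
```

Passing explicit dictionaries, as in the example above, simply overrides these one-to-one defaults for the columns you name.
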
### Read from a Warehouse

```python
df = ft.read_warehouse("MyWarehouse", "SELECT * FROM dbo.sales WHERE year = 2024")
```

### Write to a Warehouse

```python
ft.write_warehouse(
    df,
    warehouse_name="MyWarehouse",
    table="dbo.sales_clean",
    mode="overwrite",   # or "append"
    batch_size=10_000,  # optional, default 10_000
)
```
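
`batch_size` controls how many rows go into each write batch. The chunking itself is easy to picture in plain Python (an illustrative sketch with a hypothetical `batches` helper; the real batching happens inside Spark's JDBC writer):

```python
def batches(rows, batch_size=10_000):
    """Yield successive lists of at most batch_size rows."""
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # emit the final, possibly short, batch
        yield batch
```

Larger batches mean fewer round trips to the SQL endpoint at the cost of more memory per request.
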
---

## API reference

### Lakehouse

| Function | Description |
|---|---|
| `read_lakehouse(lakehouse_name, relative_path, spark=None)` | Read a dataset — auto-detects Delta / Parquet / CSV |
| `write_lakehouse(df, lakehouse_name, relative_path, mode, partition_by, format, spark=None)` | Write a DataFrame (default: Delta, overwrite) |
| `merge_lakehouse(source_df, lakehouse_name, relative_path, merge_condition, update_set, insert_set, spark=None)` | Upsert via Delta merge |

### Warehouse

| Function | Description |
|---|---|
| `read_warehouse(warehouse_name, query, spark=None)` | Run a SQL query, return a DataFrame |
| `write_warehouse(df, warehouse_name, table, mode, batch_size, spark=None)` | Write to a Warehouse table via JDBC |

---

## How path resolution works

```
lakehouse_name="BronzeLakehouse"
        ↓
notebookutils.lakehouse.get("BronzeLakehouse")
        ↓
lh.properties.abfsPath
  = "abfss://bronze@<account>.dfs.core.windows.net"
        ↓
full_path = abfsPath + "/" + relative_path
```
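
The final step above is plain string concatenation. A defensive variant that tolerates stray slashes could look like this (`join_abfs_path` is a hypothetical helper, not part of the package):

```python
def join_abfs_path(abfs_path, relative_path):
    """Join an ABFS base path and a relative path with exactly one slash."""
    return abfs_path.rstrip("/") + "/" + relative_path.lstrip("/")
```

Normalizing both sides avoids the classic `base//rel` and `baserel` failure modes when callers pass paths with or without trailing slashes.
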
---

## Running the tests

```bash
pip install "fabrictools[dev]"
pytest
```

---

## Publishing to PyPI

See [docs/PYPI_PUBLISH.md](docs/PYPI_PUBLISH.md) for a step-by-step guide.

---

## License

MIT
@@ -0,0 +1,34 @@
"""
fabrictools — User-friendly PySpark helpers for Microsoft Fabric.

Public API
----------
Lakehouse
~~~~~~~~~
read_lakehouse(lakehouse_name, relative_path, spark=None)
    Read a dataset (auto-detects Delta / Parquet / CSV).
write_lakehouse(df, lakehouse_name, relative_path, mode, partition_by, format, spark=None)
    Write a DataFrame to a Lakehouse (defaults to Delta format).
merge_lakehouse(source_df, lakehouse_name, relative_path, merge_condition, ...)
    Upsert (merge) a DataFrame into an existing Delta table.

Warehouse
~~~~~~~~~
read_warehouse(warehouse_name, query, spark=None)
    Run a SQL query and return the result as a DataFrame.
write_warehouse(df, warehouse_name, table, mode, batch_size, spark=None)
    Write a DataFrame to a Warehouse table via JDBC.
"""

from fabrictools.lakehouse import merge_lakehouse, read_lakehouse, write_lakehouse
from fabrictools.warehouse import read_warehouse, write_warehouse

__all__ = [
    "read_lakehouse",
    "write_lakehouse",
    "merge_lakehouse",
    "read_warehouse",
    "write_warehouse",
]

__version__ = "0.1.0"
@@ -0,0 +1,36 @@
"""Internal logging utility for fabrictools."""

from __future__ import annotations

import datetime
import logging

_LEVELS = {
    "info": logging.INFO,
    "warning": logging.WARNING,
    "error": logging.ERROR,
    "debug": logging.DEBUG,
}

logging.basicConfig(
    format="%(message)s",
    level=logging.INFO,
)
_logger = logging.getLogger("fabrictools")


def log(message: str, level: str = "info") -> None:
    """
    Emit a timestamped log message.

    Parameters
    ----------
    message:
        Text to log.
    level:
        One of ``"info"``, ``"warning"``, ``"error"``, ``"debug"``.
        Defaults to ``"info"``.
    """
    ts = datetime.datetime.now().strftime("%H:%M:%S")
    lvl = _LEVELS.get(level.lower(), logging.INFO)
    _logger.log(lvl, "[%s] %s", ts, message)
@@ -0,0 +1,104 @@
"""
Path resolution helpers for Microsoft Fabric resources.

These functions rely on ``notebookutils``, which is injected automatically
into the Python environment by the Fabric notebook runtime. They will raise
a clear ``ValueError`` when called outside Fabric (e.g. local tests) so that
callers can handle the missing dependency gracefully.
"""

from __future__ import annotations

from fabrictools._logger import log


def get_lakehouse_abfs_path(lakehouse_name: str) -> str:
    """
    Return the full ABFS path for a Fabric Lakehouse.

    Internally calls ``notebookutils.lakehouse.get(lakehouse_name)`` which is
    available in every Fabric Spark notebook.

    Parameters
    ----------
    lakehouse_name:
        Display name of the Lakehouse as it appears in the Fabric workspace
        (e.g. ``"BronzeLakehouse"``).

    Returns
    -------
    str
        ABFS path of the form
        ``abfss://<container>@<account>.dfs.core.windows.net``.

    Raises
    ------
    ValueError
        When ``notebookutils`` is not available (outside Fabric).
    """
    try:
        import notebookutils  # type: ignore[import-untyped]  # noqa: PLC0415

        lh = notebookutils.lakehouse.get(lakehouse_name)
        path: str = lh.properties.abfsPath
        log(f"Resolved Lakehouse '{lakehouse_name}' → {path}")
        return path
    except ImportError as exc:
        raise ValueError(
            f"notebookutils is not available — are you running inside "
            f"Microsoft Fabric? ({exc})"
        ) from exc
    except Exception as exc:
        raise ValueError(
            f"Could not resolve Lakehouse '{lakehouse_name}': {exc}"
        ) from exc


def get_warehouse_jdbc_url(warehouse_name: str) -> str:
    """
    Return the JDBC connection URL for a Fabric Warehouse.

    Internally calls ``notebookutils.warehouse.get(warehouse_name)`` to
    retrieve the SQL endpoint and builds a standard JDBC URL from it.

    Parameters
    ----------
    warehouse_name:
        Display name of the Warehouse as it appears in the Fabric workspace
        (e.g. ``"MyWarehouse"``).

    Returns
    -------
    str
        JDBC URL suitable for use with ``spark.read.format("jdbc")``.

    Raises
    ------
    ValueError
        When ``notebookutils`` is not available or the warehouse cannot be
        found.
    """
    try:
        import notebookutils  # type: ignore[import-untyped]  # noqa: PLC0415

        wh = notebookutils.warehouse.get(warehouse_name)
        sql_endpoint: str = wh.properties.connectionString
        database: str = wh.properties.databaseName
        jdbc_url = (
            f"jdbc:sqlserver://{sql_endpoint};"
            f"database={database};"
            "encrypt=true;"
            "trustServerCertificate=false;"
            "loginTimeout=30;"
        )
        log(f"Resolved Warehouse '{warehouse_name}' → {sql_endpoint}/{database}")
        return jdbc_url
    except ImportError as exc:
        raise ValueError(
            f"notebookutils is not available — are you running inside "
            f"Microsoft Fabric? ({exc})"
        ) from exc
    except Exception as exc:
        raise ValueError(
            f"Could not resolve Warehouse '{warehouse_name}': {exc}"
        ) from exc
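
Because the URL assembly in `get_warehouse_jdbc_url` is pure string formatting, it can be exercised without Fabric or Spark. A standalone restatement of that logic:

```python
def build_jdbc_url(sql_endpoint: str, database: str) -> str:
    """Build a SQL Server JDBC URL with the same fixed options as above."""
    return (
        f"jdbc:sqlserver://{sql_endpoint};"
        f"database={database};"
        "encrypt=true;"
        "trustServerCertificate=false;"
        "loginTimeout=30;"
    )
```

This makes the connection options easy to unit-test locally, while the `notebookutils` lookup itself still requires a Fabric runtime.
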
@@ -0,0 +1,25 @@
"""SparkSession accessor for fabrictools."""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pyspark.sql import SparkSession


def get_spark() -> "SparkSession":
    """
    Return the active SparkSession, creating one if necessary.

    Inside a Microsoft Fabric notebook the runtime already has an active
    session, so ``getOrCreate()`` simply returns it. Outside Fabric (e.g.
    local development) a new local session is started automatically.

    Returns
    -------
    SparkSession
    """
    from pyspark.sql import SparkSession  # noqa: PLC0415

    return SparkSession.builder.getOrCreate()