PyPI - phlo-dlt - Versions diffs - 0.1.0__tar.gz - Mend

phlo-dlt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

phlo_dlt-0.1.0/PKG-INFO +21 -0
phlo_dlt-0.1.0/README.md +72 -0
phlo_dlt-0.1.0/pyproject.toml +46 -0
phlo_dlt-0.1.0/setup.cfg +4 -0
phlo_dlt-0.1.0/src/phlo_dlt/__init__.py +3 -0
phlo_dlt-0.1.0/src/phlo_dlt/cli_plugin.py +21 -0
phlo_dlt-0.1.0/src/phlo_dlt/cli_workflow.py +89 -0
phlo_dlt-0.1.0/src/phlo_dlt/converter.py +213 -0
phlo_dlt-0.1.0/src/phlo_dlt/decorator.py +316 -0
phlo_dlt-0.1.0/src/phlo_dlt/dlt_helpers.py +256 -0
phlo_dlt-0.1.0/src/phlo_dlt/executor.py +187 -0
phlo_dlt-0.1.0/src/phlo_dlt/plugin.py +28 -0
phlo_dlt-0.1.0/src/phlo_dlt/registry.py +21 -0
phlo_dlt-0.1.0/src/phlo_dlt/scaffold.py +242 -0
phlo_dlt-0.1.0/src/phlo_dlt.egg-info/PKG-INFO +21 -0
phlo_dlt-0.1.0/src/phlo_dlt.egg-info/SOURCES.txt +23 -0
phlo_dlt-0.1.0/src/phlo_dlt.egg-info/dependency_links.txt +1 -0
phlo_dlt-0.1.0/src/phlo_dlt.egg-info/entry_points.txt +5 -0
phlo_dlt-0.1.0/src/phlo_dlt.egg-info/requires.txt +12 -0
phlo_dlt-0.1.0/src/phlo_dlt.egg-info/top_level.txt +1 -0
phlo_dlt-0.1.0/tests/test_ingestion_decorator.py +554 -0
phlo_dlt-0.1.0/tests/test_integration_dlt.py +97 -0
phlo_dlt-0.1.0/tests/test_schema_converter.py +397 -0
phlo_dlt-0.1.0/tests/test_strict_validation.py +61 -0
phlo_dlt-0.1.0/tests/test_validate_with_pandera_datetime_coercion.py +35 -0

phlo_dlt-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,21 @@
+Metadata-Version: 2.4
+Name: phlo-dlt
+Version: 0.1.0
+Summary: DLT ingestion engine capability plugin for Phlo
+Author-email: Phlo Team <team@phlo.dev>
+License: MIT
+Requires-Python: >=3.11
+Description-Content-Type: text/plain
+Requires-Dist: phlo>=0.1.0
+Requires-Dist: phlo-iceberg>=0.1.0
+Requires-Dist: phlo-lineage>=0.1.0
+Requires-Dist: phlo-quality>=0.1.0
+Requires-Dist: dlt>=1.18.2
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pandera>=0.26.1
+Requires-Dist: pyarrow>=21.0.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+DLT ingestion engine capability plugin for Phlo.

phlo_dlt-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,72 @@
+# phlo-dlt
+DLT (Data Load Tool) ingestion engine for Phlo.
+## Description
+Provides the `@phlo_ingestion` decorator for defining data ingestion pipelines using DLT. Automatically materializes data into Iceberg tables with schema evolution and lineage tracking.
+## Installation
+```bash
+pip install phlo-dlt
+# or
+phlo plugin install dlt
+```
+## Configuration
+| Variable                 | Default               | Description                |
+| ------------------------ | --------------------- | -------------------------- |
+| `ICEBERG_WAREHOUSE_PATH` | `s3://lake/warehouse` | Iceberg warehouse S3 path  |
+| `ICEBERG_STAGING_PATH`   | `s3://lake/stage`     | Staging path for ingestion |
+| `NESSIE_HOST`            | `nessie`              | Nessie catalog host        |
+| `NESSIE_PORT`            | `19120`               | Nessie catalog port        |
+## Auto-Configuration
+This package is **fully auto-configured**:
+| Feature                | How It Works                                                         |
+| ---------------------- | -------------------------------------------------------------------- |
+| **Asset Registration** | Ingestion assets are published as capability specs via asset provider entry points |
+| **Lineage Events**     | Emits `ingestion.start`, `ingestion.end` events for lineage tracking |
+| **Schema Evolution**   | Automatically handles schema changes during ingestion                |
+| **Hook Integration**   | Events captured by alerting, metrics, and OpenMetadata plugins       |
+### Event Flow
+```
+@phlo_ingestion → IngestionEventEmitter → HookBus → [Alerting, Metrics, Lineage plugins]
+```
+## Usage
+### Defining Ingestion
+```python
+from phlo.ingestion import phlo_ingestion
+@phlo_ingestion(
+    name="github_events",
+    source="rest_api",
+    destination="bronze.github_events"
+)
+def ingest_github_events():
+    return {
+        "client": {"base_url": "https://api.github.com"},
+        "resources": ["events"]
+    }
+```
+### Running Ingestion
+Ingestion assets are automatically discovered and can be materialized via the active orchestrator:
+```bash
+phlo materialize dlt_github_events
+```
+## Entry Points
+- `phlo.plugins.assets` - Provides `DltAssetProvider` for ingestion asset specs
+- `phlo.plugins.cli` - Provides `DltCliPlugin` for workflow scaffolding commands

phlo_dlt-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,46 @@
+[build-system]
+# The [project] table below (PEP 621 metadata) is only understood by setuptools >= 61.
+requires = ["setuptools>=61", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "phlo-dlt"
+version = "0.1.0"
+description = "DLT ingestion engine capability plugin for Phlo"
+readme = {text = "DLT ingestion engine capability plugin for Phlo.", content-type = "text/plain"}
+requires-python = ">=3.11"
+authors = [
+    {name = "Phlo Team", email = "team@phlo.dev"},
+]
+license = {text = "MIT"}
+dependencies = [
+    "phlo>=0.1.0",
+    "phlo-iceberg>=0.1.0",
+    "phlo-lineage>=0.1.0",
+    "phlo-quality>=0.1.0",
+    "dlt>=1.18.2",
+    "pandas>=2.3.3",
+    "pandera>=0.26.1",
+    "pyarrow>=21.0.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0",
+    "ruff>=0.1.0",
+]
+[project.entry-points."phlo.plugins.assets"]
+dlt = "phlo_dlt.plugin:DltAssetProvider"
+[project.entry-points."phlo.plugins.cli"]
+dlt = "phlo_dlt.cli_plugin:DltCliPlugin"
+[tool.setuptools]
+package-dir = {"" = "src"}
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.ruff]
+line-length = 100
+target-version = "py311"

phlo_dlt-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

phlo_dlt-0.1.0/src/phlo_dlt/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from phlo_dlt.decorator import get_ingestion_assets, phlo_ingestion
+__all__ = ["get_ingestion_assets", "phlo_ingestion"]

phlo_dlt-0.1.0/src/phlo_dlt/cli_plugin.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""CLI plugin for DLT workflow scaffolding."""
+from __future__ import annotations
+import click
+from phlo.plugins.base import CliCommandPlugin, PluginMetadata
+from phlo_dlt.cli_workflow import workflow_group
+class DltCliPlugin(CliCommandPlugin):
+    @property
+    def metadata(self) -> PluginMetadata:
+        return PluginMetadata(
+            name="dlt",
+            version="0.1.0",
+            description="Workflow scaffolding commands for DLT ingestion",
+        )
+    def get_cli_commands(self) -> list[click.Command]:
+        return [workflow_group]

phlo_dlt-0.1.0/src/phlo_dlt/cli_workflow.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""Workflow management commands."""
+from __future__ import annotations
+import sys
+import click
+# Root ``workflow`` click group; subcommands attach via ``@workflow_group.command``.
+# The docstring below is what click shows as the group's help text, so keep it short.
+@click.group(name="workflow")
+def workflow_group() -> None:
+    """Manage workflows."""
+@workflow_group.command("create")
+@click.option(
+    "--type",
+    "workflow_type",
+    type=click.Choice(["ingestion"]),
+    prompt="Workflow type",
+    help="Type of workflow to create (ingestion only)",
+)
+@click.option("--domain", prompt="Domain name", help="Domain name (e.g., weather, stripe, github)")
+@click.option("--table", prompt="Table name", help="Table name for ingestion")
+@click.option(
+    "--unique-key",
+    prompt="Unique key field",
+    help="Field name for deduplication (e.g., id, _id)",
+)
+@click.option(
+    "--cron",
+    default="0 */1 * * *",
+    prompt="Cron schedule",
+    help="Cron schedule expression",
+)
+@click.option(
+    "--api-base-url",
+    prompt="API base URL (optional)",
+    default="",
+    help="REST API base URL",
+)
+@click.option(
+    "--field",
+    "fields",
+    multiple=True,
+    help="Additional schema field (name:type, name:type?, name:type!)",
+)
+def create_workflow_cmd(
+    workflow_type: str,
+    domain: str,
+    table: str,
+    unique_key: str,
+    cron: str,
+    api_base_url: str,
+    fields: tuple[str, ...],
+) -> None:
+    """Create a workflow scaffold."""
+    from phlo_dlt.scaffold import create_ingestion_workflow
+    click.echo(f"\nCreating {workflow_type} workflow for {domain}.{table}...\n")
+    try:
+        if workflow_type == "ingestion":
+            files = create_ingestion_workflow(
+                domain=domain,
+                table_name=table,
+                unique_key=unique_key,
+                cron=cron,
+                api_base_url=api_base_url or None,
+                fields=list(fields),
+            )
+            click.echo("Created files:\n")
+            for file_path in files:
+                click.echo(f"  - {file_path}")
+            click.echo("\nNext steps:")
+            click.echo(f"  1. Edit schema: {files[0]}")
+            click.echo(f"  2. Configure API: {files[1]}")
+            click.echo("  3. Restart Dagster: docker restart dagster-webserver")
+            click.echo(f"  4. Test: phlo test {domain}")
+            click.echo(f"  5. Materialize: phlo materialize dlt_{table}")
+        else:
+            click.echo(f"Error: Workflow type '{workflow_type}' not yet implemented", err=True)
+            click.echo("Currently supported: ingestion", err=True)
+            sys.exit(1)
+    except Exception as exc:
+        click.echo(f"Error creating workflow: {exc}", err=True)
+        sys.exit(1)

phlo_dlt-0.1.0/src/phlo_dlt/converter.py ADDED Viewed

@@ -0,0 +1,213 @@
+from __future__ import annotations
+from datetime import date, datetime
+from decimal import Decimal
+from typing import Any, get_args, get_origin, get_type_hints
+from pandera.pandas import DataFrameModel
+from pyiceberg.schema import Schema
+from pyiceberg.types import (
+    BinaryType,
+    BooleanType,
+    DateType,
+    DoubleType,
+    LongType,
+    NestedField,
+    StringType,
+    TimestamptzType,
+)
+class SchemaConversionError(Exception):
+    """Raised when a Pandera schema cannot be converted to an Iceberg schema."""
+    pass
+def pandera_to_iceberg(
+    pandera_schema: type[DataFrameModel],
+    start_field_id: int = 1,
+    add_dlt_metadata: bool = True,
+    add_phlo_metadata: bool = True,
+) -> Schema:
+    reserved_field_ids: dict[str, int] = {
+        "_dlt_load_id": 100,
+        "_dlt_id": 101,
+        "_phlo_ingested_at": 102,
+        "_phlo_row_id": 103,
+        "_phlo_partition_date": 104,
+        "_phlo_run_id": 105,
+    }
+    fields: list[NestedField] = []
+    next_field_id = start_field_id
+    user_field_count = 0
+    try:
+        annotations = get_type_hints(pandera_schema)
+    except Exception as e:
+        raise SchemaConversionError(
+            f"Failed to get type hints from Pandera schema {pandera_schema.__name__}: {e}"
+        ) from e
+    if not annotations:
+        raise SchemaConversionError(
+            f"Pandera schema {pandera_schema.__name__} has no field annotations"
+        )
+    try:
+        pandera_schema_obj = pandera_schema.to_schema()
+    except Exception as e:
+        raise SchemaConversionError(
+            f"Failed to instantiate Pandera schema {pandera_schema.__name__}: {e}"
+        ) from e
+    for field_name, field_type in annotations.items():
+        if field_name.startswith("__") or field_name == "Config":
+            continue
+        user_field_count += 1
+        description = ""
+        nullable = True
+        if field_name in pandera_schema_obj.columns:
+            column = pandera_schema_obj.columns[field_name]
+            nullable = column.nullable
+            description = column.description or ""
+        try:
+            iceberg_type = _map_type(field_name, field_type)
+        except SchemaConversionError as e:
+            raise SchemaConversionError(
+                f"Cannot map Pandera type for field {field_name}: {e}"
+            ) from e
+        field_id = reserved_field_ids.get(field_name, next_field_id)
+        if field_name not in reserved_field_ids:
+            next_field_id += 1
+        fields.append(
+            NestedField(
+                field_id=field_id,
+                name=field_name,
+                field_type=iceberg_type,
+                required=not nullable,
+                doc=description,
+            )
+        )
+    if user_field_count == 0:
+        raise SchemaConversionError(f"No fields found in Pandera schema {pandera_schema.__name__}")
+    if add_dlt_metadata:
+        existing_names = {f.name for f in fields}
+        if "_dlt_load_id" not in existing_names:
+            fields.append(
+                NestedField(
+                    field_id=100,
+                    name="_dlt_load_id",
+                    field_type=StringType(),
+                    required=True,
+                    doc="DLT load identifier",
+                )
+            )
+        if "_dlt_id" not in existing_names:
+            fields.append(
+                NestedField(
+                    field_id=101,
+                    name="_dlt_id",
+                    field_type=StringType(),
+                    required=True,
+                    doc="DLT record identifier",
+                )
+            )
+    if add_phlo_metadata:
+        existing_names = {f.name for f in fields}
+        if "_phlo_row_id" not in existing_names:
+            fields.append(
+                NestedField(
+                    field_id=103,
+                    name="_phlo_row_id",
+                    field_type=StringType(),
+                    required=True,
+                    doc="Phlo row-level lineage identifier (ULID)",
+                )
+            )
+        if "_phlo_ingested_at" not in existing_names:
+            fields.append(
+                NestedField(
+                    field_id=102,
+                    name="_phlo_ingested_at",
+                    field_type=TimestamptzType(),
+                    required=True,
+                    doc="UTC timestamp when phlo processed this record",
+                )
+            )
+        if "_phlo_partition_date" not in existing_names:
+            fields.append(
+                NestedField(
+                    field_id=104,
+                    name="_phlo_partition_date",
+                    field_type=StringType(),
+                    required=True,
+                    doc="Partition date used for ingestion (YYYY-MM-DD)",
+                )
+            )
+        if "_phlo_run_id" not in existing_names:
+            fields.append(
+                NestedField(
+                    field_id=105,
+                    name="_phlo_run_id",
+                    field_type=StringType(),
+                    required=True,
+                    doc="Dagster run ID for traceability",
+                )
+            )
+    return Schema(*fields)
+def _map_type(field_name: str, pandera_type: Any) -> Any:
+    origin = get_origin(pandera_type)
+    if origin is None:
+        return _map_scalar(field_name, pandera_type)
+    if origin is list:
+        raise SchemaConversionError(f"Lists are not supported for field {field_name}")
+    if origin is dict:
+        raise SchemaConversionError(f"Dicts are not supported for field {field_name}")
+    if origin is Any:
+        return StringType()
+    # Optional[T] / Union[T, None]
+    if origin is type(None):
+        return StringType()
+    args = get_args(pandera_type)
+    for arg in args:
+        if arg is type(None):
+            continue
+        return _map_type(field_name, arg)
+    return StringType()
+def _map_scalar(field_name: str, t: Any) -> Any:
+    if t in (str,):
+        return StringType()
+    if t in (int,):
+        return LongType()
+    if t in (float,):
+        return DoubleType()
+    if t in (bool,):
+        return BooleanType()
+    if t in (datetime,):
+        return TimestamptzType()
+    if t in (date,):
+        return DateType()
+    if t in (bytes,):
+        return BinaryType()
+    if t in (Decimal,):
+        return DoubleType()
+    raise SchemaConversionError(f"Unsupported type for field {field_name}: {t}")