phlo-dlt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: phlo-dlt
3
+ Version: 0.1.0
4
+ Summary: DLT ingestion engine capability plugin for Phlo
5
+ Author-email: Phlo Team <team@phlo.dev>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/plain
9
+ Requires-Dist: phlo>=0.1.0
10
+ Requires-Dist: phlo-iceberg>=0.1.0
11
+ Requires-Dist: phlo-lineage>=0.1.0
12
+ Requires-Dist: phlo-quality>=0.1.0
13
+ Requires-Dist: dlt>=1.18.2
14
+ Requires-Dist: pandas>=2.3.3
15
+ Requires-Dist: pandera>=0.26.1
16
+ Requires-Dist: pyarrow>=21.0.0
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest>=7.0; extra == "dev"
19
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
20
+
21
+ DLT ingestion engine capability plugin for Phlo.
@@ -0,0 +1,72 @@
1
+ # phlo-dlt
2
+
3
+ DLT (Data Load Tool) ingestion engine for Phlo.
4
+
5
+ ## Description
6
+
7
+ Provides the `@phlo_ingestion` decorator for defining data ingestion pipelines using DLT. Automatically materializes data into Iceberg tables with schema evolution and lineage tracking.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install phlo-dlt
13
+ # or
14
+ phlo plugin install dlt
15
+ ```
16
+
17
+ ## Configuration
18
+
19
+ | Variable | Default | Description |
20
+ | ------------------------ | --------------------- | -------------------------- |
21
+ | `ICEBERG_WAREHOUSE_PATH` | `s3://lake/warehouse` | Iceberg warehouse S3 path |
22
+ | `ICEBERG_STAGING_PATH` | `s3://lake/stage` | Staging path for ingestion |
23
+ | `NESSIE_HOST` | `nessie` | Nessie catalog host |
24
+ | `NESSIE_PORT` | `19120` | Nessie catalog port |
25
+
26
+ ## Auto-Configuration
27
+
28
+ This package is **fully auto-configured**:
29
+
30
+ | Feature | How It Works |
31
+ | ---------------------- | -------------------------------------------------------------------- |
32
+ | **Asset Registration** | Ingestion assets published as capability specs via asset provider entry points |
33
+ | **Lineage Events** | Emits `ingestion.start`, `ingestion.end` events for lineage tracking |
34
+ | **Schema Evolution** | Automatically handles schema changes during ingestion |
35
+ | **Hook Integration** | Events captured by alerting, metrics, and OpenMetadata plugins |
36
+
37
+ ### Event Flow
38
+
39
+ ```
40
+ @phlo_ingestion → IngestionEventEmitter → HookBus → [Alerting, Metrics, Lineage plugins]
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ### Defining Ingestion
46
+
47
+ ```python
48
+ from phlo.ingestion import phlo_ingestion
49
+
50
+ @phlo_ingestion(
51
+ name="github_events",
52
+ source="rest_api",
53
+ destination="bronze.github_events"
54
+ )
55
+ def ingest_github_events():
56
+ return {
57
+ "client": {"base_url": "https://api.github.com"},
58
+ "resources": ["events"]
59
+ }
60
+ ```
61
+
62
+ ### Running Ingestion
63
+
64
+ Ingestion assets are automatically discovered and can be materialized via the active orchestrator:
65
+
66
+ ```bash
67
+ phlo materialize dlt_github_events
68
+ ```
69
+
70
+ ## Entry Points
71
+
72
+ - `phlo.plugins.assets` - Provides `DltAssetProvider` for ingestion asset specs
73
+ - `phlo.plugins.cli` - Provides `DltCliPlugin` for the `workflow` scaffolding commands
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "phlo-dlt"
7
+ version = "0.1.0"
8
+ description = "DLT ingestion engine capability plugin for Phlo"
9
+ readme = {text = "DLT ingestion engine capability plugin for Phlo.", content-type = "text/plain"}
10
+ requires-python = ">=3.11"
11
+ authors = [
12
+ {name = "Phlo Team", email = "team@phlo.dev"},
13
+ ]
14
+ license = {text = "MIT"}
15
+ dependencies = [
16
+ "phlo>=0.1.0",
17
+ "phlo-iceberg>=0.1.0",
18
+ "phlo-lineage>=0.1.0",
19
+ "phlo-quality>=0.1.0",
20
+ "dlt>=1.18.2",
21
+ "pandas>=2.3.3",
22
+ "pandera>=0.26.1",
23
+ "pyarrow>=21.0.0",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ dev = [
28
+ "pytest>=7.0",
29
+ "ruff>=0.1.0",
30
+ ]
31
+
32
+ [project.entry-points."phlo.plugins.assets"]
33
+ dlt = "phlo_dlt.plugin:DltAssetProvider"
34
+
35
+ [project.entry-points."phlo.plugins.cli"]
36
+ dlt = "phlo_dlt.cli_plugin:DltCliPlugin"
37
+
38
+ [tool.setuptools]
39
+ package-dir = {"" = "src"}
40
+
41
+ [tool.setuptools.packages.find]
42
+ where = ["src"]
43
+
44
+ [tool.ruff]
45
+ line-length = 100
46
+ target-version = "py311"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ from phlo_dlt.decorator import get_ingestion_assets, phlo_ingestion
2
+
3
+ __all__ = ["get_ingestion_assets", "phlo_ingestion"]
@@ -0,0 +1,21 @@
1
+ """CLI plugin for DLT workflow scaffolding."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import click
6
+
7
+ from phlo.plugins.base import CliCommandPlugin, PluginMetadata
8
+ from phlo_dlt.cli_workflow import workflow_group
9
+
10
+
11
class DltCliPlugin(CliCommandPlugin):
    """CLI plugin that contributes the ``workflow`` command group."""

    @property
    def metadata(self) -> PluginMetadata:
        """Return the name, version and description of this plugin."""
        plugin_info = PluginMetadata(
            name="dlt",
            version="0.1.0",
            description="Workflow scaffolding commands for DLT ingestion",
        )
        return plugin_info

    def get_cli_commands(self) -> list[click.Command]:
        """Return the click commands contributed by this plugin."""
        commands: list[click.Command] = [workflow_group]
        return commands
@@ -0,0 +1,89 @@
1
+ """Workflow management commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ import click
8
+
9
+
10
# Root click group for the ``workflow`` subcommands. The docstring below is
# what click shows as the group's help text, so it is user-facing output.
@click.group(name="workflow")
def workflow_group() -> None:
    """Manage workflows."""
13
+
14
+
15
@workflow_group.command("create")
@click.option(
    "--type",
    "workflow_type",
    type=click.Choice(["ingestion"]),
    prompt="Workflow type",
    help="Type of workflow to create (ingestion only)",
)
@click.option("--domain", prompt="Domain name", help="Domain name (e.g., weather, stripe, github)")
@click.option("--table", prompt="Table name", help="Table name for ingestion")
@click.option(
    "--unique-key",
    prompt="Unique key field",
    help="Field name for deduplication (e.g., id, _id)",
)
@click.option(
    "--cron",
    default="0 */1 * * *",
    prompt="Cron schedule",
    help="Cron schedule expression",
)
@click.option(
    "--api-base-url",
    prompt="API base URL (optional)",
    default="",
    help="REST API base URL",
)
@click.option(
    "--field",
    "fields",
    multiple=True,
    help="Additional schema field (name:type, name:type?, name:type!)",
)
def create_workflow_cmd(
    workflow_type: str,
    domain: str,
    table: str,
    unique_key: str,
    cron: str,
    api_base_url: str,
    fields: tuple[str, ...],
) -> None:
    """Create a workflow scaffold."""
    # NOTE: click renders the docstring above as this command's --help text,
    # so maintainer documentation is kept in comments instead.
    #
    # Scaffolds an ingestion workflow through
    # ``phlo_dlt.scaffold.create_ingestion_workflow``, prints the created files
    # followed by suggested next steps, and exits with status 1 on failure or
    # when the workflow type is unsupported. An empty ``api_base_url`` is
    # forwarded as ``None``; ``fields`` is forwarded as a list.

    # Imported here rather than at module level, so the scaffold module is only
    # loaded when the command actually runs.
    from phlo_dlt.scaffold import create_ingestion_workflow

    click.echo(f"\nCreating {workflow_type} workflow for {domain}.{table}...\n")

    # Guard clause: click.Choice already restricts the option, so this only
    # triggers if the callback is invoked with an unexpected value.
    if workflow_type != "ingestion":
        click.echo(f"Error: Workflow type '{workflow_type}' not yet implemented", err=True)
        click.echo("Currently supported: ingestion", err=True)
        sys.exit(1)

    # Keep the try body limited to the scaffold call so that only genuine
    # creation failures are reported as "Error creating workflow".
    try:
        files = list(
            create_ingestion_workflow(
                domain=domain,
                table_name=table,
                unique_key=unique_key,
                cron=cron,
                api_base_url=api_base_url or None,
                fields=list(fields),
            )
        )
    except Exception as exc:
        click.echo(f"Error creating workflow: {exc}", err=True)
        sys.exit(1)

    click.echo("Created files:\n")
    for file_path in files:
        click.echo(f" - {file_path}")

    # The first two returned paths are presented as the schema and API files;
    # those steps are skipped (instead of raising IndexError) if the scaffold
    # returned fewer files than expected.
    next_steps: list[str] = []
    if len(files) > 0:
        next_steps.append(f"Edit schema: {files[0]}")
    if len(files) > 1:
        next_steps.append(f"Configure API: {files[1]}")
    next_steps.append("Restart Dagster: docker restart dagster-webserver")
    next_steps.append(f"Test: phlo test {domain}")
    next_steps.append(f"Materialize: phlo materialize dlt_{table}")

    click.echo("\nNext steps:")
    for step_number, step in enumerate(next_steps, start=1):
        click.echo(f" {step_number}. {step}")
@@ -0,0 +1,213 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import date, datetime
4
+ from decimal import Decimal
5
+ from typing import Any, get_args, get_origin, get_type_hints
6
+
7
+ from pandera.pandas import DataFrameModel
8
+ from pyiceberg.schema import Schema
9
+ from pyiceberg.types import (
10
+ BinaryType,
11
+ BooleanType,
12
+ DateType,
13
+ DoubleType,
14
+ LongType,
15
+ NestedField,
16
+ StringType,
17
+ TimestamptzType,
18
+ )
19
+
20
+
21
class SchemaConversionError(Exception):
    """Raised when a Pandera schema cannot be converted to an Iceberg schema."""
23
+
24
+
25
def pandera_to_iceberg(
    pandera_schema: type[DataFrameModel],
    start_field_id: int = 1,
    add_dlt_metadata: bool = True,
    add_phlo_metadata: bool = True,
) -> Schema:
    """Convert a Pandera ``DataFrameModel`` class into an Iceberg ``Schema``.

    Each annotated field becomes a ``NestedField``. Nullability and the
    description come from the matching Pandera column when one exists;
    otherwise the field is nullable with an empty description.

    Args:
        pandera_schema: The Pandera model class to convert.
        start_field_id: First Iceberg field ID handed out to user fields.
            IDs increase by one per field and skip the reserved metadata IDs.
        add_dlt_metadata: Append ``_dlt_load_id`` and ``_dlt_id`` when the
            schema does not already declare them.
        add_phlo_metadata: Append the ``_phlo_*`` metadata fields when the
            schema does not already declare them.

    Returns:
        The Iceberg schema: user fields in annotation order, followed by any
        metadata fields that were added.

    Raises:
        SchemaConversionError: If type hints cannot be resolved, the Pandera
            schema cannot be built, the model has no fields, or a field type
            cannot be mapped.
    """
    # Metadata columns always use these fixed IDs, whether the user declares
    # them or they are appended below.
    reserved_field_ids: dict[str, int] = {
        "_dlt_load_id": 100,
        "_dlt_id": 101,
        "_phlo_ingested_at": 102,
        "_phlo_row_id": 103,
        "_phlo_partition_date": 104,
        "_phlo_run_id": 105,
    }
    reserved_id_values = frozenset(reserved_field_ids.values())

    try:
        annotations = get_type_hints(pandera_schema)
    except Exception as e:
        raise SchemaConversionError(
            f"Failed to get type hints from Pandera schema {pandera_schema.__name__}: {e}"
        ) from e

    if not annotations:
        raise SchemaConversionError(
            f"Pandera schema {pandera_schema.__name__} has no field annotations"
        )

    try:
        pandera_schema_obj = pandera_schema.to_schema()
    except Exception as e:
        raise SchemaConversionError(
            f"Failed to instantiate Pandera schema {pandera_schema.__name__}: {e}"
        ) from e

    fields: list[NestedField] = []
    next_field_id = start_field_id
    user_field_count = 0

    for field_name, field_type in annotations.items():
        # Skip dunder attributes and the nested Pandera ``Config`` class.
        if field_name.startswith("__") or field_name == "Config":
            continue
        user_field_count += 1

        description = ""
        nullable = True
        if field_name in pandera_schema_obj.columns:
            column = pandera_schema_obj.columns[field_name]
            nullable = column.nullable
            description = column.description or ""

        try:
            iceberg_type = _map_type(field_name, field_type)
        except SchemaConversionError as e:
            raise SchemaConversionError(
                f"Cannot map Pandera type for field {field_name}: {e}"
            ) from e

        if field_name in reserved_field_ids:
            field_id = reserved_field_ids[field_name]
        else:
            # Never hand a reserved ID to a user field: wide schemas (or a
            # high start_field_id) would otherwise produce duplicate IDs.
            while next_field_id in reserved_id_values:
                next_field_id += 1
            field_id = next_field_id
            next_field_id += 1

        fields.append(
            NestedField(
                field_id=field_id,
                name=field_name,
                field_type=iceberg_type,
                required=not nullable,
                doc=description,
            )
        )

    if user_field_count == 0:
        raise SchemaConversionError(f"No fields found in Pandera schema {pandera_schema.__name__}")

    # (name, Iceberg type, doc) for each metadata column, in output order.
    metadata_specs: list[tuple[str, Any, str]] = []
    if add_dlt_metadata:
        metadata_specs += [
            ("_dlt_load_id", StringType(), "DLT load identifier"),
            ("_dlt_id", StringType(), "DLT record identifier"),
        ]
    if add_phlo_metadata:
        metadata_specs += [
            ("_phlo_row_id", StringType(), "Phlo row-level lineage identifier (ULID)"),
            (
                "_phlo_ingested_at",
                TimestamptzType(),
                "UTC timestamp when phlo processed this record",
            ),
            (
                "_phlo_partition_date",
                StringType(),
                "Partition date used for ingestion (YYYY-MM-DD)",
            ),
            ("_phlo_run_id", StringType(), "Dagster run ID for traceability"),
        ]

    # Only add metadata columns the user schema did not declare itself.
    existing_names = {f.name for f in fields}
    fields.extend(
        NestedField(
            field_id=reserved_field_ids[name],
            name=name,
            field_type=metadata_type,
            required=True,
            doc=doc,
        )
        for name, metadata_type, doc in metadata_specs
        if name not in existing_names
    )

    return Schema(*fields)
166
+
167
+
168
def _map_type(field_name: str, pandera_type: Any) -> Any:
    """Map a (possibly parameterised) type annotation to an Iceberg type.

    Args:
        field_name: Name of the field, used only in error messages.
        pandera_type: The resolved annotation for the field.

    Returns:
        An Iceberg type instance. A bare ``Any`` maps to ``StringType``, as
        does a parameterised type with no usable type argument.

    Raises:
        SchemaConversionError: For list or dict annotations, or when the
            underlying scalar type is unsupported.
    """
    # ``get_origin(Any)`` is None, so a bare ``Any`` has to be recognised on
    # the annotation itself; otherwise it would be rejected as a scalar.
    if pandera_type is Any:
        return StringType()

    origin = get_origin(pandera_type)
    if origin is None:
        return _map_scalar(field_name, pandera_type)

    if origin is list:
        raise SchemaConversionError(f"Lists are not supported for field {field_name}")

    if origin is dict:
        raise SchemaConversionError(f"Dicts are not supported for field {field_name}")

    # Any other parameterised annotation (Optional[T] / Union[T, None] and
    # generic wrappers): map the first type argument that is not NoneType.
    for arg in get_args(pandera_type):
        if arg is not type(None):
            return _map_type(field_name, arg)

    return StringType()
193
+
194
+
195
def _map_scalar(field_name: str, t: Any) -> Any:
    """Return the Iceberg type instance for a plain Python scalar type.

    Raises ``SchemaConversionError`` when ``t`` is not one of the supported
    scalar types. ``field_name`` is used only in the error message.
    """
    # Checked in order; Decimal maps to DoubleType, the same as float.
    scalar_mappings = (
        (str, StringType),
        (int, LongType),
        (float, DoubleType),
        (bool, BooleanType),
        (datetime, TimestamptzType),
        (date, DateType),
        (bytes, BinaryType),
        (Decimal, DoubleType),
    )
    for python_type, iceberg_type in scalar_mappings:
        if t in (python_type,):
            return iceberg_type()

    raise SchemaConversionError(f"Unsupported type for field {field_name}: {t}")