metaxy-0.0.1.dev3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/ext/dagster/selection.py
@@ -0,0 +1,73 @@
+"""Asset selection helpers for Metaxy assets."""
+
+import dagster as dg
+
+import metaxy as mx
+from metaxy.ext.dagster.constants import (
+    DAGSTER_METAXY_FEATURE_METADATA_KEY,
+    DAGSTER_METAXY_PROJECT_TAG_KEY,
+)
+
+
+def select_metaxy_assets(
+    *,
+    project: str | None = None,
+    feature: mx.CoercibleToFeatureKey | None = None,
+) -> dg.AssetSelection:
+    """Select Metaxy assets by project and/or feature.
+
+    This helper creates an `AssetSelection` that filters assets tagged by `@metaxify`.
+
+    Args:
+        project: Filter by project name. If None, uses `MetaxyConfig.get().project`.
+        feature: Filter by specific feature key. If provided, further narrows the selection.
+
+    Returns:
+        An `AssetSelection` that can be used with `dg.define_asset_job`,
+        `dg.materialize`, or `AssetSelection` operations like `|` and `&`.
+
+    Example: Select all Metaxy assets in current project
+        ```python
+        import metaxy.ext.dagster as mxd
+
+        all_metaxy = mxd.select_metaxy_assets()
+        ```
+
+    Example: Select assets for a specific project
+        ```python
+        prod_assets = mxd.select_metaxy_assets(project="production")
+        ```
+
+    Example: Select a specific feature's assets
+        ```python
+        feature_assets = mxd.select_metaxy_assets(feature="my/feature/key")
+        ```
+
+    Example: Use with asset jobs
+        ```python
+        metaxy_job = dg.define_asset_job(
+            name="materialize_metaxy",
+            selection=mxd.select_metaxy_assets(),
+        )
+        ```
+
+    Example: Combine with other selections
+        ```python
+        # All metaxy assets plus some other assets
+        combined = mxd.select_metaxy_assets() | dg.AssetSelection.keys("other_asset")
+
+        # Metaxy assets that are also in a specific group
+        filtered = mxd.select_metaxy_assets() & dg.AssetSelection.groups("my_group")
+        ```
+    """
+    resolved_project = project if project is not None else mx.MetaxyConfig.get().project
+
+    selection = dg.AssetSelection.tag(DAGSTER_METAXY_PROJECT_TAG_KEY, resolved_project)
+
+    if feature is not None:
+        feature_key = mx.coerce_to_feature_key(feature)
+        selection = selection & dg.AssetSelection.tag(
+            DAGSTER_METAXY_FEATURE_METADATA_KEY, str(feature_key)
+        )
+
+    return selection
metaxy/ext/dagster/table_metadata.py
@@ -0,0 +1,417 @@
+"""Table metadata utilities for Dagster integration.
+
+This module provides utilities for building Dagster table metadata
+(column schema, column lineage, table previews, etc.) from Metaxy feature definitions.
+"""
+
+import types
+from typing import Any, Union, get_args, get_origin
+
+import dagster as dg
+import narwhals as nw
+import polars as pl
+
+import metaxy as mx
+from metaxy.ext.dagster.utils import get_asset_key_for_metaxy_feature_spec
+from metaxy.models.constants import ALL_SYSTEM_COLUMNS, SYSTEM_COLUMNS_WITH_LINEAGE
+
+
+def build_column_schema(feature_cls: type[mx.BaseFeature]) -> dg.TableSchema:
+    """Build a Dagster TableSchema from a Metaxy feature class.
+
+    Creates column definitions from Pydantic model fields, including inherited
+    system columns. Field types are converted to strings and field descriptions
+    are used as column descriptions.
+
+    Args:
+        feature_cls: The Metaxy feature class to extract schema from.
+
+    Returns:
+        A TableSchema with columns derived from Pydantic model fields,
+        sorted alphabetically by name.
+
+    !!! tip
+        This is automatically injected by [`@metaxify`][metaxy.ext.dagster.metaxify.metaxify]
+    """
+    columns: list[dg.TableColumn] = []
+    for field_name, field_info in feature_cls.model_fields.items():
+        columns.append(
+            dg.TableColumn(
+                name=field_name,
+                type=_get_type_string(field_info.annotation),
+                description=field_info.description,
+            )
+        )
+
+    # Sort columns alphabetically by name
+    columns.sort(key=lambda col: col.name)
+    return dg.TableSchema(columns=columns)
+
+
+def _get_type_string(annotation: Any) -> str:
+    """Get a clean string representation of a type annotation.
+
+    For generic types (list[str], dict[str, int], etc.), str() works well.
+    For simple types (str, int, etc.), use __name__ to avoid "<class 'str'>" output.
+
+    Special handling:
+    - Pydantic datetime types show cleaner representations
+    - None is stripped from union types (nullability is handled separately via DB constraints)
+    """
+    from pydantic import AwareDatetime, NaiveDatetime
+
+    # Map Pydantic datetime types to cleaner representations
+    pydantic_type_names = {
+        AwareDatetime: "datetime (UTC)",
+        NaiveDatetime: "datetime (naive)",
+    }
+
+    # For generic types (list[str], dict[str, int], Union, etc.), handle args recursively
+    origin = get_origin(annotation)
+    if origin is not None:
+        args = get_args(annotation)
+        if args:
+            # Handle Union types (X | Y syntax uses types.UnionType, typing.Union is different)
+            if origin is Union or isinstance(annotation, types.UnionType):
+                # Filter out None - nullability is handled via DB constraints, not Pydantic types
+                non_none_args = [arg for arg in args if arg is not type(None)]
+                if len(non_none_args) == 1:
+                    # Simple optional type like `str | None` -> just return the base type
+                    return _get_type_string(non_none_args[0])
+                # Multiple non-None types in union
+                clean_args = [_get_type_string(arg) for arg in non_none_args]
+                return " | ".join(clean_args)
+            # Handle other generic types
+            clean_args = [_get_type_string(arg) for arg in args]
+            origin_name = getattr(origin, "__name__", str(origin))
+            return f"{origin_name}[{', '.join(clean_args)}]"
+        return str(annotation)
+
+    # Check for Pydantic special types
+    if annotation in pydantic_type_names:
+        return pydantic_type_names[annotation]
+
+    # For simple types, use __name__ if available
+    if hasattr(annotation, "__name__"):
+        return annotation.__name__
+
+    # Fallback to str()
+    return str(annotation)
+
+
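For orientation, here is how `_get_type_string` resolves a few representative annotations, hand-traced from the logic above (a sketch with expected values, not verified output; the helper is private and imported here purely for illustration):

```python
# Hand-traced expectations for _get_type_string (illustrative only).
from pydantic import AwareDatetime

from metaxy.ext.dagster.table_metadata import _get_type_string

_get_type_string(str)             # "str"            (simple type -> __name__)
_get_type_string(str | None)      # "str"            (None stripped from the union)
_get_type_string(int | str)       # "int | str"      (multi-type union joined with " | ")
_get_type_string(list[str])       # "list[str]"      (origin name + recursed args)
_get_type_string(dict[str, int])  # "dict[str, int]"
_get_type_string(AwareDatetime)   # "datetime (UTC)" (per the pydantic_type_names map)
```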
+def build_column_lineage(
+    feature_cls: type[mx.BaseFeature],
+    feature_spec: mx.FeatureSpec | None = None,
+) -> dg.TableColumnLineage | None:
+    """Build column-level lineage from feature dependencies.
+
+    Tracks column provenance by analyzing:
+    - `FeatureDep.rename` mappings: renamed columns trace back to their upstream source
+    - `FeatureSpec.lineage`: ID column relationships between features
+    - Direct pass-through: columns with same name in both upstream and downstream
+    - System columns: `metaxy_provenance_by_field` and `metaxy_provenance` have lineage
+      from corresponding upstream columns
+
+    Args:
+        feature_cls: The downstream feature class.
+        feature_spec: The downstream feature specification. If None, uses feature_cls.spec().
+
+    Returns:
+        TableColumnLineage mapping downstream columns to their upstream sources,
+        or None if no column lineage can be determined.
+
+    !!! tip
+        This is automatically injected by [`@metaxify`][metaxy.ext.dagster.metaxify.metaxify]
+    """
+    if feature_spec is None:
+        feature_spec = feature_cls.spec()
+
+    if not feature_spec.deps:
+        return None
+
+    deps_by_column: dict[str, list[dg.TableColumnDep]] = {}
+    downstream_columns = set(feature_cls.model_fields.keys())
+
+    for dep in feature_spec.deps:
+        upstream_feature_cls = mx.get_feature_by_key(dep.feature)
+        upstream_feature_spec = upstream_feature_cls.spec()
+        upstream_asset_key = get_asset_key_for_metaxy_feature_spec(
+            upstream_feature_spec
+        )
+        upstream_columns = set(upstream_feature_cls.model_fields.keys())
+
+        # Build reverse rename map: downstream_name -> upstream_name
+        # FeatureDep.rename is {old_upstream_name: new_downstream_name}
+        reverse_rename: dict[str, str] = {}
+        if dep.rename:
+            reverse_rename = {v: k for k, v in dep.rename.items()}
+
+        # Track columns based on lineage relationship
+        lineage = feature_spec.lineage
+
+        # Get ID column mappings based on lineage type
+        id_column_mapping = _get_id_column_mapping(
+            downstream_id_columns=feature_spec.id_columns,
+            upstream_id_columns=upstream_feature_spec.id_columns,
+            lineage=lineage,
+            rename=reverse_rename,
+        )
+
+        # Process ID columns
+        for downstream_col, upstream_col in id_column_mapping.items():
+            if downstream_col in downstream_columns:
+                if downstream_col not in deps_by_column:
+                    deps_by_column[downstream_col] = []
+                deps_by_column[downstream_col].append(
+                    dg.TableColumnDep(
+                        asset_key=upstream_asset_key,
+                        column_name=upstream_col,
+                    )
+                )
+
+        # Process renamed columns (that aren't ID columns)
+        for downstream_col, upstream_col in reverse_rename.items():
+            if (
+                downstream_col in downstream_columns
+                and downstream_col not in id_column_mapping
+            ):
+                if upstream_col in upstream_columns:
+                    if downstream_col not in deps_by_column:
+                        deps_by_column[downstream_col] = []
+                    deps_by_column[downstream_col].append(
+                        dg.TableColumnDep(
+                            asset_key=upstream_asset_key,
+                            column_name=upstream_col,
+                        )
+                    )
+
+        # Process direct pass-through columns (same name in both, not renamed, ID, or system)
+        # System columns are handled separately below since only some have lineage
+        handled_columns = (
+            set(id_column_mapping.keys())
+            | set(reverse_rename.keys())
+            | ALL_SYSTEM_COLUMNS
+        )
+        for col in downstream_columns - handled_columns:
+            if col in upstream_columns:
+                if col not in deps_by_column:
+                    deps_by_column[col] = []
+                deps_by_column[col].append(
+                    dg.TableColumnDep(
+                        asset_key=upstream_asset_key,
+                        column_name=col,
+                    )
+                )
+
+        # Process system columns with lineage (metaxy_provenance_by_field, metaxy_provenance)
+        # These columns are always present in both upstream and downstream features
+        # and have a direct lineage relationship (downstream values are computed from upstream)
+        for sys_col in SYSTEM_COLUMNS_WITH_LINEAGE:
+            if sys_col not in deps_by_column:
+                deps_by_column[sys_col] = []
+            deps_by_column[sys_col].append(
+                dg.TableColumnDep(
+                    asset_key=upstream_asset_key,
+                    column_name=sys_col,
+                )
+            )
+
+    if not deps_by_column:
+        return None
+
+    # Sort columns alphabetically
+    sorted_deps = {k: deps_by_column[k] for k in sorted(deps_by_column)}
+    return dg.TableColumnLineage(deps_by_column=sorted_deps)
+
+
+def _get_id_column_mapping(
+    downstream_id_columns: tuple[str, ...],
+    upstream_id_columns: tuple[str, ...],
+    lineage: mx.LineageRelationship,
+    rename: dict[str, str],
+) -> dict[str, str]:
+    """Get mapping of downstream ID columns to upstream ID columns.
+
+    Args:
+        downstream_id_columns: ID columns of the downstream feature.
+        upstream_id_columns: ID columns of the upstream feature.
+        lineage: The lineage relationship between features.
+        rename: Reverse rename map (downstream_name -> upstream_name).
+
+    Returns:
+        Mapping of downstream ID column names to upstream ID column names.
+    """
+    from metaxy.models.lineage import (
+        AggregationRelationship,
+        ExpansionRelationship,
+        IdentityRelationship,
+    )
+
+    mapping: dict[str, str] = {}
+    rel = lineage.relationship
+
+    if isinstance(rel, IdentityRelationship):
+        # 1:1 - downstream ID columns map to same-named upstream ID columns
+        # (accounting for any renames)
+        for downstream_col in downstream_id_columns:
+            # Check if this column was renamed from upstream
+            upstream_col = rename.get(downstream_col, downstream_col)
+            if upstream_col in upstream_id_columns:
+                mapping[downstream_col] = upstream_col
+
+    elif isinstance(rel, AggregationRelationship):
+        # N:1 - aggregation columns map to upstream
+        # Use `on` columns if specified, otherwise use all downstream ID columns
+        agg_columns = rel.on if rel.on is not None else downstream_id_columns
+        for downstream_col in agg_columns:
+            if downstream_col in downstream_id_columns:
+                upstream_col = rename.get(downstream_col, downstream_col)
+                if upstream_col in upstream_id_columns:
+                    mapping[downstream_col] = upstream_col
+
+    elif isinstance(rel, ExpansionRelationship):
+        # 1:N - `on` columns (parent ID columns) map to upstream ID columns
+        for downstream_col in rel.on:
+            if downstream_col in downstream_id_columns:
+                upstream_col = rename.get(downstream_col, downstream_col)
+                if upstream_col in upstream_id_columns:
+                    mapping[downstream_col] = upstream_col
+
+    return mapping
+
+
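To make the rename handling concrete, here is a hand-traced sketch of the identity (1:1) branch in plain Python, with no Metaxy objects; the feature's column names are hypothetical:

```python
# Hypothetical dep: the upstream feature has ID column "user_id", and the
# downstream feature renames it to "customer_id" via
# FeatureDep.rename = {"user_id": "customer_id"}.
rename = {"user_id": "customer_id"}                 # old upstream -> new downstream
reverse_rename = {v: k for k, v in rename.items()}  # downstream -> upstream

downstream_id_columns = ("customer_id",)
upstream_id_columns = ("user_id",)

# Identity (1:1) branch of _get_id_column_mapping:
mapping = {}
for downstream_col in downstream_id_columns:
    upstream_col = reverse_rename.get(downstream_col, downstream_col)
    if upstream_col in upstream_id_columns:
        mapping[downstream_col] = upstream_col

assert mapping == {"customer_id": "user_id"}
```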
+def build_table_preview_metadata(
+    lazy_df: nw.LazyFrame[Any],
+    schema: dg.TableSchema,
+    *,
+    n_rows: int = 5,
+) -> dg.TableMetadataValue:
+    """Build a Dagster table preview from the last N rows of a LazyFrame.
+
+    Collects the last `n_rows` from the LazyFrame and converts them to
+    Dagster TableRecord objects suitable for display in the Dagster UI.
+    Complex types (Struct, List, Array) are converted to JSON strings;
+    primitive types (str, int, float, bool, None) are kept as-is.
+
+    Args:
+        lazy_df: A narwhals LazyFrame to preview.
+        schema: The TableSchema for the table. Use `build_column_schema()` to
+            create this from a Metaxy feature class.
+        n_rows: Number of rows to include in the preview (from the end). Defaults to 5.
+
+    Returns:
+        A TableMetadataValue containing the preview rows as TableRecord objects.
+        Returns an empty table if the DataFrame is empty.
+
+    !!! tip
+
+        This is automatically injected by [`MetaxyIOManager`][metaxy.ext.dagster.io_manager.MetaxyIOManager]
+    """
+    # Collect the last n_rows from the LazyFrame
+    collected_df = lazy_df.tail(n_rows).collect()
+    df_polars: pl.DataFrame = collected_df.to_native()  # pyright: ignore[reportAssignmentType]
+
+    # Handle empty DataFrames
+    if df_polars.is_empty():
+        return dg.MetadataValue.table(records=[], schema=schema)
+
+    # Convert complex types to strings, keep primitives as-is
+    df_processed = _prepare_dataframe_for_table_record(df_polars)
+
+    # Convert to TableRecord objects
+    records = [dg.TableRecord(data=row) for row in df_processed.to_dicts()]
+
+    return dg.MetadataValue.table(records=records, schema=schema)
+
+
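A minimal usage sketch with hypothetical column names; it assumes a polars-backed narwhals LazyFrame, which is what the `to_native()` cast above implies:

```python
import dagster as dg
import narwhals as nw
import polars as pl

from metaxy.ext.dagster.table_metadata import build_table_preview_metadata

# Wrap a polars LazyFrame in narwhals, matching the function's signature.
lf = nw.from_native(pl.LazyFrame({"user_id": [1, 2, 3], "score": [0.1, 0.2, 0.3]}))

# Normally produced by build_column_schema(); built by hand here for brevity.
schema = dg.TableSchema(
    columns=[
        dg.TableColumn(name="score", type="float"),
        dg.TableColumn(name="user_id", type="int"),
    ]
)

preview = dg.MetadataValue  # type hint aid only; the call below returns a TableMetadataValue
preview = build_table_preview_metadata(lf, schema, n_rows=2)  # last 2 rows
```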
+def _prepare_dataframe_for_table_record(df: pl.DataFrame) -> pl.DataFrame:
+    """Prepare a Polars DataFrame for conversion to Dagster TableRecord objects.
+
+    Complex types (Struct, List, Array) and temporal types are converted to strings.
+    Lists/Arrays with more than `max_items` entries (2 by default) are truncated
+    to show the first and last items with ".." in between.
+    Primitive types (str, int, float, bool, None) are kept as-is since
+    Dagster's TableRecord accepts them directly.
+
+    Args:
+        df: The Polars DataFrame to prepare.
+
+    Returns:
+        A DataFrame with complex/temporal types converted to strings.
+    """
+    exprs: list[pl.Expr] = []
+
+    for col_name in df.columns:
+        dtype = df[col_name].dtype
+        if isinstance(dtype, pl.Struct):
+            # Struct types: use json_encode for a clean JSON representation
+            exprs.append(pl.col(col_name).struct.json_encode())
+        elif isinstance(dtype, pl.List):
+            # List types: truncate and convert to string
+            exprs.append(_truncate_list_expr(pl.col(col_name), alias=col_name))
+        elif isinstance(dtype, pl.Array):
+            # Array types: convert to list first, then truncate
+            exprs.append(
+                _truncate_list_expr(pl.col(col_name).arr.to_list(), alias=col_name)
+            )
+        elif dtype in (pl.Datetime, pl.Date, pl.Time, pl.Duration) or isinstance(
+            dtype, (pl.Datetime, pl.Date, pl.Time, pl.Duration)
+        ):
+            # Temporal types: cast to string (ISO format)
+            exprs.append(pl.col(col_name).cast(pl.String))
+        else:
+            # Primitive types: keep as-is
+            exprs.append(pl.col(col_name))
+
+    return df.select(exprs)
+
+
+def _truncate_list_expr(list_expr: pl.Expr, alias: str, max_items: int = 2) -> pl.Expr:
+    """Truncate a list expression and convert to string.
+
+    Lists with more than `max_items` items show the first and last `max_items // 2` items with ".." between.
+
+    Args:
+        list_expr: A Polars expression that evaluates to a List type.
+        alias: The output column name.
+        max_items: Maximum items to show without truncation. Defaults to 2.
+
+    Returns:
+        A Polars expression that truncates and converts the list to string.
+    """
+    list_len = list_expr.list.len()
+    half = max_items // 2
+
+    # For short lists: just json_encode the whole thing
+    # For long lists: concat the first `half` + last `half` items and json_encode, then insert ".."
+    truncated = pl.concat_list(
+        list_expr.list.head(half),
+        list_expr.list.tail(half),
+    )
+
+    # Convert to JSON string via struct wrapper
+    def to_json(expr: pl.Expr) -> pl.Expr:
+        return (
+            pl.struct(expr.alias("_"))
+            .struct.json_encode()
+            .str.extract(r'\{"_":(.*)\}', 1)
+        )
+
+    short_result = to_json(list_expr)
+    # For truncated: insert ".." after the first `half` elements
+    # e.g., [1,10] -> [1,..,10]
+    # Match: opening bracket, then `half` comma-separated values
+    # The pattern matches values that may contain nested brackets
+    value_pattern = r"[^\[\],]+(?:\[[^\]]*\])?"  # matches value or value[...]
+    first_n_values = ",".join([value_pattern] * half)
+    long_result = to_json(truncated).str.replace(
+        r"^(\[" + first_n_values + r"),",
+        "$1,..,",
+    )
+
+    return (
+        pl.when(list_len <= max_items)
+        .then(short_result)
+        .otherwise(long_result)
+        .alias(alias)
+    )
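Hand-traced behavior of the truncation helper, with the private function imported purely for illustration (expected renderings traced from the code above, not verified output):

```python
import polars as pl

from metaxy.ext.dagster.table_metadata import _truncate_list_expr

df = pl.DataFrame({"xs": [[1, 2], [1, 2, 3, 4, 5]]})
out = df.select(_truncate_list_expr(pl.col("xs"), alias="xs"))
# With the default max_items=2 (so half=1), the expected renderings are:
#   [1, 2]          -> "[1,2]"     (length <= max_items: whole list JSON-encoded)
#   [1, 2, 3, 4, 5] -> "[1,..,5]"  (head(1) + tail(1), with ".." spliced in by regex)
```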