metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
from collections.abc import Iterable, Iterator
|
|
2
|
+
from typing import Any, NamedTuple
|
|
3
|
+
|
|
4
|
+
import dagster as dg
|
|
5
|
+
import narwhals as nw
|
|
6
|
+
|
|
7
|
+
import metaxy as mx
|
|
8
|
+
from metaxy.ext.dagster.constants import (
|
|
9
|
+
DAGSTER_METAXY_FEATURE_METADATA_KEY,
|
|
10
|
+
DAGSTER_METAXY_PARTITION_KEY,
|
|
11
|
+
METAXY_DAGSTER_METADATA_KEY,
|
|
12
|
+
)
|
|
13
|
+
from metaxy.ext.dagster.resources import MetaxyStoreFromConfigResource
|
|
14
|
+
from metaxy.metadata_store.exceptions import FeatureNotFoundError
|
|
15
|
+
from metaxy.models.constants import METAXY_CREATED_AT, METAXY_MATERIALIZATION_ID
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FeatureStats(NamedTuple):
    """Statistics about a feature's metadata for Dagster events."""

    # Number of rows in the feature's (possibly partition-filtered) metadata frame.
    row_count: int
    # Data version reported to Dagster; derived from the frame contents
    # (mean of metaxy_created_at — see compute_stats_from_lazy_frame) so that
    # both row additions and deletions change the version.
    data_version: dg.DataVersion
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_partition_filter(
    partition_col: str | None,
    partition_key: str | None,
) -> list[nw.Expr]:
    """Build partition filter expressions from a column name and partition key.

    Args:
        partition_col: The column to filter by (from `partition_by` metadata).
        partition_key: The partition key value to filter for.

    Returns:
        A single-element list holding the equality filter expression, or an
        empty list when either argument is None.
    """
    have_both = partition_col is not None and partition_key is not None
    if not have_both:
        return []
    return [nw.col(partition_col) == partition_key]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_partition_filter(
    context: dg.AssetExecutionContext,
    spec: dg.AssetSpec,
) -> list[nw.Expr]:
    """Get partition filter expressions for a partitioned asset.

    Args:
        context: The Dagster asset execution context.
        spec: The AssetSpec carrying `partition_by` metadata.

    Returns:
        List of filter expressions. Empty when the run is not partitioned or
        the spec has no string-valued `partition_by` metadata.
    """
    if not context.has_partition_key:
        return []

    column = spec.metadata.get(DAGSTER_METAXY_PARTITION_KEY)
    # Only a concrete column name can be turned into a filter expression.
    return (
        build_partition_filter(column, context.partition_key)
        if isinstance(column, str)
        else []
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def compute_row_count(lazy_df: nw.LazyFrame) -> int:  # pyright: ignore[reportMissingTypeArgument]
    """Compute the number of rows in a narwhals LazyFrame.

    Args:
        lazy_df: A narwhals LazyFrame.

    Returns:
        The row count.
    """
    counted = lazy_df.select(nw.len()).collect()
    return counted.item(0, 0)  # pyright: ignore[reportReturnType]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def compute_stats_from_lazy_frame(lazy_df: nw.LazyFrame) -> FeatureStats:  # pyright: ignore[reportMissingTypeArgument]
    """Compute statistics from a narwhals LazyFrame.

    Computes the row count and a data version in a single collect. The data
    version is based on mean(metaxy_created_at) so that both additions and
    deletions of rows change the version.

    Args:
        lazy_df: A narwhals LazyFrame with metaxy metadata.

    Returns:
        FeatureStats with row_count and data_version.
    """
    summary = lazy_df.select(
        nw.len().alias("__count"),
        nw.col(METAXY_CREATED_AT).mean().alias("__mean_ts"),
    ).collect()

    n_rows: int = summary.item(0, "__count")
    if not n_rows:
        # An empty frame has no timestamps; use a fixed sentinel version.
        return FeatureStats(row_count=0, data_version=dg.DataVersion("empty"))

    mean_created_at = summary.item(0, "__mean_ts")
    return FeatureStats(
        row_count=n_rows, data_version=dg.DataVersion(str(mean_created_at))
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def compute_feature_stats(
    store: mx.MetadataStore,
    feature: mx.CoercibleToFeatureKey,
) -> FeatureStats:
    """Compute statistics for a feature's metadata.

    Reads the feature's metadata from the store and computes the row count
    and data version. The data version is based on mean(metaxy_created_at)
    to detect both additions and deletions.

    Args:
        store: The Metaxy metadata store to read from.
        feature: The feature to compute stats for.

    Returns:
        FeatureStats with row_count and data_version.
    """
    # Open the store for the duration of the read + stats computation.
    with store:
        return compute_stats_from_lazy_frame(store.read_metadata(feature))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_asset_key_for_metaxy_feature_spec(
    feature_spec: mx.FeatureSpec,
) -> dg.AssetKey:
    """Get the Dagster asset key for a Metaxy feature spec.

    Args:
        feature_spec: The Metaxy feature spec.

    Returns:
        The Dagster asset key, determined as follows:

        1. If the feature spec has `dagster/attributes.asset_key` set, that
           value is used as-is.

        2. Otherwise, the feature key's parts are used.
    """
    dagster_attrs = feature_spec.metadata.get(METAXY_DAGSTER_METADATA_KEY)
    if isinstance(dagster_attrs, dict):
        custom_asset_key = dagster_attrs.get("asset_key")
        # Only a truthy explicit override wins; empty values fall through.
        if custom_asset_key:
            return dg.AssetKey(custom_asset_key)  # pyright: ignore[reportArgumentType]

    # Default: derive the asset key from the feature key.
    return dg.AssetKey(list(feature_spec.key.parts))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def generate_materialize_results(
    context: dg.AssetExecutionContext,
    store: mx.MetadataStore | MetaxyStoreFromConfigResource,
    specs: Iterable[dg.AssetSpec] | None = None,
) -> Iterator[dg.MaterializeResult[None]]:
    """Generate `dagster.MaterializeResult` events for assets in topological order.

    Yields a `MaterializeResult` for each asset spec that has the
    `"metaxy/feature"` metadata key set, sorted by their associated
    Metaxy features in topological order (dependencies before dependents).
    Each result includes the row count as `"dagster/row_count"` metadata.
    Features missing from the store are logged and skipped.

    Args:
        context: The Dagster asset execution context.
        store: The Metaxy metadata store to read from.
        specs: Optional, concrete Dagster asset specs.
            If missing, specs will be taken from the context.

    Yields:
        Materialization result for each asset in topological order.

    Example:
        ```python
        specs = [
            dg.AssetSpec("output_a", metadata={"metaxy/feature": "my/feature/a"}),
            dg.AssetSpec("output_b", metadata={"metaxy/feature": "my/feature/b"}),
        ]

        @metaxify
        @dg.multi_asset(specs=specs)
        def my_multi_asset(context: dg.AssetExecutionContext, store: mx.MetadataStore):
            # ... compute and write data ...
            yield from generate_materialize_results(context, store)
        ```
    """
    # Build mapping from feature key to asset spec; specs without the
    # "metaxy/feature" metadata key are ignored.
    spec_by_feature_key: dict[mx.FeatureKey, dg.AssetSpec] = {}
    specs = specs or context.assets_def.specs
    for spec in specs:
        if feature_key_raw := spec.metadata.get(DAGSTER_METAXY_FEATURE_METADATA_KEY):
            feature_key = mx.coerce_to_feature_key(feature_key_raw)
            spec_by_feature_key[feature_key] = spec

    # Sort by topological order of feature keys so dependencies are
    # reported before dependents.
    graph = mx.FeatureGraph.get_active()
    sorted_keys = graph.topological_sort_features(list(spec_by_feature_key.keys()))

    for key in sorted_keys:
        asset_spec = spec_by_feature_key[key]
        # Empty list when the run is not partitioned.
        partition_filters = get_partition_filter(context, asset_spec)

        # Re-enter the store per feature so it is open for all reads below.
        with store:
            try:
                lazy_df = store.read_metadata(key, filters=partition_filters)
            except FeatureNotFoundError:
                # Best-effort: a missing feature skips its result rather than
                # failing the whole run.
                context.log.exception(
                    f"Feature {key.to_string()} not found in store, skipping materialization result"
                )
                continue

            stats = compute_stats_from_lazy_frame(lazy_df)

            # Build runtime metadata using shared function, passing pre-computed row count
            metadata = build_runtime_feature_metadata(
                key, store, lazy_df, context, partition_row_count=stats.row_count
            )

            # Get materialized-in-run count if materialization_id is set
            if store.materialization_id is not None:
                mat_filters = partition_filters + [
                    nw.col(METAXY_MATERIALIZATION_ID) == store.materialization_id
                ]
                mat_df = store.read_metadata(key, filters=mat_filters)
                metadata["metaxy/materialized_in_run"] = (
                    mat_df.select(nw.len()).collect().item(0, 0)
                )

        # NOTE(review): yield placed outside the `with store:` block so the
        # store is not held open while the consumer processes the event —
        # confirm against the original file's indentation.
        yield dg.MaterializeResult(
            value=None,
            asset_key=asset_spec.key,
            metadata=metadata,
            data_version=stats.data_version,
        )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def build_feature_info_metadata(
    feature: mx.CoercibleToFeatureKey,
) -> dict[str, Any]:
    """Build feature info metadata dict for Dagster assets.

    Creates a dictionary with information about the Metaxy feature that can be
    used as Dagster asset metadata under the `"metaxy/feature_info"` key.

    Args:
        feature: The Metaxy feature (class, key, or string).

    Returns:
        A nested dictionary containing:

        - `feature`: Feature information
            - `project`: The project name
            - `spec`: The full feature spec as a dict (via `model_dump()`)
            - `version`: The feature version string
            - `type`: The feature class module path
        - `metaxy`: Metaxy library information
            - `version`: The metaxy library version
            - `plugins`: The plugins enabled in the active Metaxy config

    !!! tip
        This is automatically injected by [`@metaxify`][metaxy.ext.dagster.metaxify.metaxify]

    Example:
        ```python
        from metaxy.ext.dagster.utils import build_feature_info_metadata

        info = build_feature_info_metadata(MyFeature)
        # {
        #     "feature": {
        #         "project": "my_project",
        #         "spec": {...},  # Full FeatureSpec model_dump()
        #         "version": "my__feature@abc123",
        #         "type": "myproject.features",
        #     },
        #     "metaxy": {
        #         "version": "0.1.0",
        #         "plugins": [...],
        #     },
        # }
        ```
    """
    # Normalize the input (class, key, or string) to a FeatureKey, then
    # resolve the registered feature class for it.
    feature_key = mx.coerce_to_feature_key(feature)
    feature_cls = mx.get_feature_by_key(feature_key)

    return {
        "feature": {
            "project": feature_cls.project,
            # mode="json" keeps the dump serializable for Dagster metadata.
            "spec": feature_cls.spec().model_dump(mode="json"),
            "version": feature_cls.feature_version(),
            "type": feature_cls.__module__,
        },
        "metaxy": {
            "version": mx.__version__,
            "plugins": mx.MetaxyConfig.get().plugins,
        },
    }
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def build_runtime_feature_metadata(
    feature_key: mx.FeatureKey,
    store: mx.MetadataStore | MetaxyStoreFromConfigResource,
    lazy_df: nw.LazyFrame[Any],
    context: dg.AssetExecutionContext | dg.OutputContext,
    *,
    partition_row_count: int | None = None,
) -> dict[str, Any]:
    """Build runtime metadata for a Metaxy feature in Dagster.

    This function consolidates all runtime metadata construction for Dagster events.
    It is used by the IOManager, generate_materialize_results, and generate_observe_results.

    Args:
        feature_key: The Metaxy feature key.
        store: The metadata store (used for store-specific metadata like URI, table_name).
        lazy_df: The LazyFrame containing the feature data (for stats and preview).
            For partitioned assets, this should be filtered to the current partition.
        context: Dagster context for determining partition state and logging errors.
        partition_row_count: Optional pre-computed partition row count to avoid re-computing.

    Returns:
        A dictionary containing all runtime metadata:
        - `metaxy/feature`: Feature key as string
        - `metaxy/info`: Feature and metaxy library information (from `build_feature_info_metadata`)
        - `metaxy/store`: Store type and configuration
        - `dagster/row_count`: Total row count (across all partitions)
        - `dagster/partition_row_count`: Row count for current partition (only if partitioned)
        - `dagster/table_name`: Table name from store (if available)
        - `dagster/uri`: URI from store (if available)
        - `dagster/table`: Table preview

        Returns empty dict if an error occurs during metadata collection.

    Example:
        ```python
        with store:
            lazy_df = store.read_metadata(feature_key)
            metadata = build_runtime_feature_metadata(feature_key, store, lazy_df, context)
            context.add_output_metadata(metadata)
        ```
    """
    # Import here to avoid circular import
    from metaxy.ext.dagster.table_metadata import (
        build_column_schema,
        build_table_preview_metadata,
    )

    # Metadata collection is best-effort: any failure is logged and an
    # empty dict is returned so the caller's event can still be emitted.
    try:
        # Use pre-computed partition_row_count if provided, otherwise compute
        if partition_row_count is None:
            partition_row_count = compute_row_count(lazy_df)

        # Get store metadata (store-specific keys such as table_name / uri).
        store_metadata = store.get_store_metadata(feature_key)

        # Build metadata dict with metaxy info and store info
        store_cls = store.__class__
        metadata: dict[str, Any] = {
            "metaxy/feature": feature_key.to_string(),
            "metaxy/info": build_feature_info_metadata(feature_key),
            "metaxy/store": {
                "type": f"{store_cls.__module__}.{store_cls.__qualname__}",
                "display": store.display(),
                # NOTE(review): reaches into the store's private attribute —
                # consider a public accessor on MetadataStore.
                "versioning_engine": store._versioning_engine,
                **store_metadata,
            },
        }

        # For partitioned assets, compute total row count by re-reading without filters
        # NOTE(review): the `context is not None` guard is defensive — the
        # annotation does not allow None; confirm whether callers pass None.
        if context is not None and context.has_partition_key:
            # Read entire feature (no partition filter) for total count
            full_lazy_df = store.read_metadata(feature_key)
            metadata["dagster/row_count"] = compute_row_count(full_lazy_df)
            metadata["dagster/partition_row_count"] = partition_row_count
        else:
            # Unpartitioned: the partition count is the total count.
            metadata["dagster/row_count"] = partition_row_count

        # Map store metadata to dagster standard keys
        if "table_name" in store_metadata:
            metadata["dagster/table_name"] = store_metadata["table_name"]

        if "uri" in store_metadata:
            metadata["dagster/uri"] = dg.MetadataValue.path(store_metadata["uri"])

        # Build table preview
        feature_cls = mx.get_feature_by_key(feature_key)
        schema = build_column_schema(feature_cls)
        metadata["dagster/table"] = build_table_preview_metadata(lazy_df, schema)

        return metadata
    except Exception:
        # Deliberate broad catch: metadata is auxiliary and must never fail
        # the materialization/observation itself.
        context.log.exception(
            f"Failed to build runtime metadata for feature {feature_key.to_string()}"
        )
        return {}
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def generate_observe_results(
    context: dg.AssetExecutionContext,
    store: mx.MetadataStore | MetaxyStoreFromConfigResource,
    specs: Iterable[dg.AssetSpec] | None = None,
) -> Iterator[dg.ObserveResult]:
    """Generate `dagster.ObserveResult` events for assets in topological order.

    Yields an `ObserveResult` for each asset spec that has `"metaxy/feature"` metadata key set, sorted by their associated
    Metaxy features in topological order.
    Each result includes the row count as `"dagster/row_count"` metadata.
    Features missing from the store are logged and skipped.

    Args:
        context: The Dagster asset execution context.
        store: The Metaxy metadata store to read from.
        specs: Optional, concrete Dagster asset specs.
            If missing, this function will take the current specs from the context.

    Yields:
        Observation result for each asset in topological order.

    Example:
        ```python
        specs = [
            dg.AssetSpec("output_a", metadata={"metaxy/feature": "my/feature/a"}),
            dg.AssetSpec("output_b", metadata={"metaxy/feature": "my/feature/b"}),
        ]

        @metaxify
        @dg.multi_observable_source_asset(specs=specs)
        def my_observable_assets(context: dg.AssetExecutionContext, store: mx.MetadataStore):
            yield from generate_observe_results(context, store)
        ```
    """
    # Build mapping from feature key to asset spec; specs without the
    # "metaxy/feature" metadata key are ignored.
    spec_by_feature_key: dict[mx.FeatureKey, dg.AssetSpec] = {}
    specs = specs or context.assets_def.specs

    for spec in specs:
        if feature_key_raw := spec.metadata.get(DAGSTER_METAXY_FEATURE_METADATA_KEY):
            feature_key = mx.coerce_to_feature_key(feature_key_raw)
            spec_by_feature_key[feature_key] = spec

    # Sort by topological order of feature keys so dependencies are
    # reported before dependents.
    graph = mx.FeatureGraph.get_active()
    sorted_keys = graph.topological_sort_features(list(spec_by_feature_key.keys()))

    for key in sorted_keys:
        asset_spec = spec_by_feature_key[key]
        # Empty list when the run is not partitioned.
        partition_filters = get_partition_filter(context, asset_spec)

        # Re-enter the store per feature so it is open for the read below.
        with store:
            try:
                lazy_df = store.read_metadata(key, filters=partition_filters)
            except FeatureNotFoundError:
                # Best-effort: a missing feature skips its result rather than
                # failing the whole observation run.
                context.log.exception(
                    f"Feature {key.to_string()} not found in store, skipping observation result"
                )
                continue

            stats = compute_stats_from_lazy_frame(lazy_df)
            metadata = build_runtime_feature_metadata(
                key, store, lazy_df, context, partition_row_count=stats.row_count
            )

        # NOTE(review): yield placed outside the `with store:` block so the
        # store is not held open while the consumer processes the event —
        # confirm against the original file's indentation.
        yield dg.ObserveResult(
            asset_key=asset_spec.key,
            metadata=metadata,
            data_version=stats.data_version,
        )
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""SQLAlchemy integration for metaxy.
|
|
2
|
+
|
|
3
|
+
This module provides SQLAlchemy table definitions and helpers for metaxy.
|
|
4
|
+
These can be used with migration tools like Alembic.
|
|
5
|
+
|
|
6
|
+
The main functions return tuples of (sqlalchemy_url, metadata) for easy
|
|
7
|
+
integration with migration tools:
|
|
8
|
+
|
|
9
|
+
- `get_system_slqa_metadata`: Get URL and system table metadata for a store
|
|
10
|
+
- `filter_feature_sqla_metadata`: Get URL and feature table metadata for a store
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from metaxy.ext.sqlalchemy.config import SQLAlchemyConfig
|
|
14
|
+
from metaxy.ext.sqlalchemy.plugin import (
|
|
15
|
+
filter_feature_sqla_metadata,
|
|
16
|
+
get_system_slqa_metadata,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Public API re-exported from this package.
# NOTE(review): "slqa" in `get_system_slqa_metadata` looks like a typo of
# "sqla", but it matches the name exported by metaxy.ext.sqlalchemy.plugin,
# so it must stay as-is until the plugin renames it (with a deprecation alias).
__all__ = [
    "SQLAlchemyConfig",
    "get_system_slqa_metadata",
    "filter_feature_sqla_metadata",
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Configuration for SQLAlchemy integration."""
|
|
2
|
+
|
|
3
|
+
from pydantic import Field as PydanticField
|
|
4
|
+
from pydantic_settings import SettingsConfigDict
|
|
5
|
+
|
|
6
|
+
from metaxy.config import PluginConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SQLAlchemyConfig(PluginConfig):
    """Configuration for SQLAlchemy integration.

    This plugin provides helpers for working with SQLAlchemy metadata
    and table definitions.

    Settings may also be supplied via environment variables prefixed with
    ``METAXY_EXT__SQLALCHEMY_``.
    """

    # Read settings from METAXY_EXT__SQLALCHEMY_* env vars; reject unknown keys.
    model_config = SettingsConfigDict(
        env_prefix="METAXY_EXT__SQLALCHEMY_",
        extra="forbid",
    )

    # Opt-in: add a composite primary key (ID columns + metaxy_created_at +
    # metaxy_data_version) to user-defined feature tables.
    inject_primary_key: bool = PydanticField(
        default=False,
        description="Automatically inject composite primary key constraints on user-defined feature tables. The key is composed of ID columns, `metaxy_created_at`, and `metaxy_data_version`.",
    )

    # Opt-in: add a composite index over the same columns; independent of
    # `inject_primary_key`.
    inject_index: bool = PydanticField(
        default=False,
        description="Automatically inject composite index on user-defined feature tables. The index covers ID columns, `metaxy_created_at`, and `metaxy_data_version`.",
    )
|