metaxy 0.0.1.dev3__py3-none-any.whl
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/migrations/ops.py
ADDED
@@ -0,0 +1,447 @@
"""Migration operation types."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

import pydantic
from pydantic_settings import BaseSettings, SettingsConfigDict

if TYPE_CHECKING:
    from metaxy.metadata_store.base import MetadataStore


class BaseOperation(BaseSettings, ABC):  # pyright: ignore[reportUnsafeMultipleInheritance]
    """Base class for all migration operations with environment variable support.

    Operations are instantiated from YAML configs and execute on individual features.
    Subclasses implement execute_for_feature() to perform the actual migration logic.

    Environment variables are automatically read using pydantic_settings. Define config
    fields as regular Pydantic fields and they will be populated from env vars or the config dict.

    The 'type' field is automatically computed from the class's module and name.

    Example:
        class PostgreSQLBackfill(BaseOperation):
            postgresql_url: str  # Reads from POSTGRESQL_URL env var or config dict
            batch_size: int = 1000  # Optional with default

            def execute_for_feature(self, store, feature_key, *, snapshot_version, from_snapshot_version=None, dry_run=False):
                # Implementation here
                return 0
    """

    model_config = SettingsConfigDict(
        extra="ignore",  # Ignore extra fields like 'type' and 'features' from YAML
        frozen=True,
    )

    @pydantic.model_validator(mode="before")
    @classmethod
    def _substitute_env_vars(cls, data: dict[str, Any]) -> dict[str, Any]:
        """Substitute ${VAR} patterns with environment variables.

        Example:
            postgresql_url: "${POSTGRESQL_URL}" -> postgresql_url: "postgresql://..."
        """
        import os
        import re

        def substitute_value(value):
            if isinstance(value, str):
                # Replace ${VAR} with os.environ.get('VAR')
                def replacer(match):
                    var_name = match.group(1)
                    env_value = os.environ.get(var_name)
                    if env_value is None:
                        raise ValueError(f"Environment variable {var_name} is not set")
                    return env_value

                return re.sub(r"\$\{([^}]+)\}", replacer, value)
            return value

        # Create a new dict to avoid mutating the input
        result = {}
        for key, value in data.items():
            result[key] = substitute_value(value)
        return result

    @property
    def type(self) -> str:
        """Return the fully qualified class name for this operation."""
        return f"{self.__class__.__module__}.{self.__class__.__name__}"

    @abstractmethod
    def execute_for_feature(
        self,
        store: "MetadataStore",
        feature_key: str,
        *,
        snapshot_version: str,
        from_snapshot_version: str | None = None,
        dry_run: bool = False,
    ) -> int:
        """Execute operation for a single feature.

        Args:
            store: Metadata store to operate on
            feature_key: Feature key string (e.g., "video/scene")
            snapshot_version: Target snapshot version
            from_snapshot_version: Source snapshot version (optional, for cross-snapshot migrations)
            dry_run: If True, only validate and return the count without executing

        Returns:
            Number of rows affected

        Raises:
            Exception: If the operation fails
        """
        pass


class DataVersionReconciliation(BaseOperation):
    """Reconcile field provenance when a feature definition changes BUT the computation is unchanged.

    This operation applies to the affected features specified in the migration configuration.
    Feature keys are provided in the migration YAML operations list.

    This operation:
    1. For each affected feature, derives the old/new feature_versions from snapshots
    2. Finds rows with the old feature_version
    3. Recalculates field provenance based on the new feature definition
    4. Writes new rows with the updated feature_version and provenance_by_field
    5. Preserves all user metadata columns (immutable)

    Use ONLY when code changed but the computation results would be identical:
    - Dependency graph refactoring (more precise field dependencies)
    - Field structure changes (renaming, splitting, better schema)
    - Code organization improvements (imports, typing, refactoring)

    Do NOT use when the computation actually changed:
    - Different algorithm/model → re-run the pipeline instead
    - Bug fixes that affect output → re-run the pipeline instead
    - New model version → re-run the pipeline instead

    Feature versions are automatically derived from the migration's snapshot versions.

    Example YAML:
        operations:
          - type: metaxy.migrations.ops.DataVersionReconciliation
            features: ["video/scene", "video/frames"]
    """

    def execute_for_feature(
        self,
        store: "MetadataStore",
        feature_key: str,
        *,
        snapshot_version: str,
        from_snapshot_version: str | None = None,
        dry_run: bool = False,
    ) -> int:
        """Execute field provenance reconciliation for a single feature.

        Only works for features with upstream dependencies. For root features
        (no upstream), field provenance is user-defined and cannot be automatically
        reconciled - the user must re-run their computation pipeline.

        Process:
        1. Verify the feature has upstream dependencies
        2. Query the old and new feature_versions from snapshot metadata
        3. Load existing metadata with the old feature_version
        4. Use resolve_update() to calculate the expected field provenance based on current upstream
        5. Join the existing user metadata with the new field provenance
        6. Write with the new feature_version and snapshot_version

        Args:
            store: Metadata store
            feature_key: Feature key string (e.g., "examples/child")
            snapshot_version: Target snapshot version (new state)
            from_snapshot_version: Source snapshot version (old state, required for this operation)
            dry_run: If True, return the row count without executing

        Returns:
            Number of rows affected

        Raises:
            ValueError: If the feature has no upstream dependencies (root feature) or from_snapshot_version is not provided
        """
        if from_snapshot_version is None:
            raise ValueError(
                f"DataVersionReconciliation requires from_snapshot_version for feature {feature_key}"
            )

        to_snapshot_version = snapshot_version
        import narwhals as nw

        from metaxy.metadata_store.base import allow_feature_version_override
        from metaxy.metadata_store.exceptions import FeatureNotFoundError
        from metaxy.metadata_store.system import FEATURE_VERSIONS_KEY
        from metaxy.models.feature import FeatureGraph
        from metaxy.models.types import FeatureKey

        feature_key_obj = FeatureKey(feature_key.split("/"))
        feature_key_str = feature_key_obj.to_string()
        graph = FeatureGraph.get_active()
        feature_cls = graph.features_by_key[feature_key_obj]

        # 1. Verify the feature has upstream dependencies
        plan = graph.get_feature_plan(feature_key_obj)
        has_upstream = plan.deps is not None and len(plan.deps) > 0

        if not has_upstream:
            raise ValueError(
                f"DataVersionReconciliation cannot be used for root feature {feature_key_str}. "
                f"Root features have user-defined field_provenance that cannot be automatically reconciled. "
                f"User must re-run their computation pipeline to generate new data."
            )

        # 2. Query feature versions from snapshot metadata
        try:
            from_version_data = store.read_metadata(
                FEATURE_VERSIONS_KEY,
                current_only=False,
                allow_fallback=False,
                filters=[
                    (nw.col("metaxy_snapshot_version") == from_snapshot_version)
                    & (nw.col("feature_key") == feature_key_str)
                ],
            )
        except FeatureNotFoundError:
            from_version_data = None

        try:
            to_version_data = store.read_metadata(
                FEATURE_VERSIONS_KEY,
                current_only=False,
                allow_fallback=False,
                filters=[
                    (nw.col("metaxy_snapshot_version") == to_snapshot_version)
                    & (nw.col("feature_key") == feature_key_str)
                ],
            )
        except FeatureNotFoundError:
            to_version_data = None

        # Extract feature versions from the lazy frames.
        # Since we filter by snapshot_version and feature_key, there should be exactly one row.
        # We don't care about feature_spec_version changes, so just take the first row without sorting.
        from_feature_version: str | None = None
        to_feature_version: str | None = None

        if from_version_data is not None:
            # Use .head(1) to limit at the query level - no need to sort
            from_version_df = from_version_data.head(1).collect()
            if from_version_df.shape[0] > 0:
                from_feature_version = str(from_version_df["metaxy_feature_version"][0])
            else:
                from_version_data = None

        if to_version_data is not None:
            to_version_df = to_version_data.head(1).collect()
            if to_version_df.shape[0] > 0:
                to_feature_version = str(to_version_df["metaxy_feature_version"][0])
            else:
                to_version_data = None

        if from_version_data is None:
            raise ValueError(
                f"Feature {feature_key_str} not found in from_snapshot {from_snapshot_version}"
            )
        if to_version_data is None:
            raise ValueError(
                f"Feature {feature_key_str} not found in to_snapshot {to_snapshot_version}"
            )

        assert from_feature_version is not None
        assert to_feature_version is not None

        # 3. Load existing metadata with the old feature_version
        try:
            existing_metadata = store.read_metadata(
                feature_cls,
                current_only=False,
                filters=[nw.col("metaxy_feature_version") == from_feature_version],
                allow_fallback=False,
            )
        except FeatureNotFoundError:
            # Feature doesn't exist yet - nothing to migrate
            return 0

        # Collect to check existence and get the row count
        existing_metadata_df = existing_metadata.collect()
        if existing_metadata_df.shape[0] == 0:
            # Already migrated (idempotent)
            return 0

        if dry_run:
            return existing_metadata_df.shape[0]

        # 4. Get sample metadata (exclude system columns)
        user_columns = [
            c
            for c in existing_metadata_df.columns
            if c
            not in [
                "metaxy_provenance_by_field",
                "metaxy_feature_version",
                "metaxy_snapshot_version",
            ]
        ]
        sample_metadata = existing_metadata_df.select(user_columns)

        # 5. Use resolve_update to calculate field provenance based on current upstream.
        # Don't pass samples - let resolve_update auto-load upstream and calculate provenance_by_field.
        diff_result = store.resolve_update(feature_cls)

        # Convert to Polars for the join to avoid cross-backend issues
        sample_metadata_pl = nw.from_native(sample_metadata.to_native()).to_polars()

        # Use 'changed' for reconciliation (field provenance changed due to upstream)
        # and 'added' for new feature materialization.
        # Convert results to Polars for consistent joining.
        if len(diff_result.changed) > 0:
            changed_pl = nw.from_native(diff_result.changed.to_native()).to_polars()
            new_provenance = changed_pl.select(
                ["sample_uid", "metaxy_provenance_by_field"]
            )
            df_to_write = sample_metadata_pl.join(
                new_provenance, on="sample_uid", how="inner"
            )
        elif len(diff_result.added) > 0:
            df_to_write = nw.from_native(diff_result.added.to_native()).to_polars()
        else:
            return 0

        # 6. Write with the new feature_version and snapshot_version.
        # Wrap in Narwhals for write_metadata.
        df_to_write_nw = nw.from_native(df_to_write)
        df_to_write_nw = df_to_write_nw.with_columns(
            nw.lit(to_feature_version).alias("metaxy_feature_version"),
            nw.lit(to_snapshot_version).alias("metaxy_snapshot_version"),
        )

        with allow_feature_version_override():
            with store.allow_cross_project_writes():
                store.write_metadata(feature_cls, df_to_write_nw)

        return len(df_to_write)


class MetadataBackfill(BaseOperation, ABC):
    """Base class for metadata backfill operations.

    Users subclass this to implement custom backfill logic with complete
    control over the entire process: loading, transforming, joining, filtering,
    and writing metadata.

    The user implements execute_for_feature() and can:
    - Load metadata from any external source (S3, database, API, etc.)
    - Perform custom transformations and filtering
    - Join with Metaxy's calculated field provenance however they want
    - Write results to the store

    Example Subclass:
        class S3VideoBackfill(MetadataBackfill):
            s3_bucket: str
            s3_prefix: str
            min_size_mb: int = 10

            def execute_for_feature(
                self,
                store,
                feature_key,
                *,
                snapshot_version,
                from_snapshot_version=None,
                dry_run=False,
            ):
                import boto3
                import polars as pl
                from metaxy.models.feature import FeatureGraph
                from metaxy.models.types import FeatureKey

                # Load from S3
                s3 = boto3.client('s3')
                objects = s3.list_objects_v2(
                    Bucket=self.s3_bucket,
                    Prefix=self.s3_prefix
                )

                external_df = pl.DataFrame([
                    {
                        "sample_uid": obj['Key'],
                        "path": f"s3://{self.s3_bucket}/{obj['Key']}",
                        "size_bytes": obj['Size']
                    }
                    for obj in objects['Contents']
                ])

                # Filter
                external_df = external_df.filter(
                    pl.col("size_bytes") > self.min_size_mb * 1024 * 1024
                )

                if dry_run:
                    return len(external_df)

                # Get field provenance from Metaxy
                graph = FeatureGraph.get_active()
                feature_key_obj = FeatureKey(feature_key.split("/"))
                feature_cls = graph.features_by_key[feature_key_obj]

                diff = store.resolve_update(
                    feature_cls,
                    samples=external_df.select(["sample_uid"])
                )

                # Join external metadata with the calculated field provenance
                to_write = external_df.join(diff.added, on="sample_uid", how="inner")

                # Write
                store.write_metadata(feature_cls, to_write)
                return len(to_write)

    Example YAML:
        operations:
          - type: "myproject.migrations.S3VideoBackfill"
            features: ["video/files"]
            s3_bucket: "prod-videos"
            s3_prefix: "processed/"
            min_size_mb: 10
    """

    # No additional required fields - user subclasses add their own

    @abstractmethod
    def execute_for_feature(
        self,
        store: "MetadataStore",
        feature_key: str,
        *,
        snapshot_version: str,
        from_snapshot_version: str | None = None,
        dry_run: bool = False,
    ) -> int:
        """User implements backfill logic for a single feature.

        The user has complete control over:
        - Loading external metadata (S3, database, API, files, etc.)
        - Transforming and filtering data
        - Joining with Metaxy's field provenance
        - Writing to the store

        Args:
            store: Metadata store to write to
            feature_key: Feature key string (e.g., "video/files")
            snapshot_version: Target snapshot version
            from_snapshot_version: Source snapshot version (optional, for cross-snapshot backfills)
            dry_run: If True, validate and return the count without writing

        Returns:
            Number of rows written (or that would be written if dry_run)

        Raises:
            Exception: If the backfill fails (will be recorded in migration progress)
        """
        pass
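A minimal usage sketch (not part of the wheel) of how BaseOperation resolves ${VAR} placeholders during instantiation; the ExampleBackfill subclass and the MY_DB_URL variable are hypothetical, used only for illustration:

import os

from metaxy.migrations.ops import BaseOperation


class ExampleBackfill(BaseOperation):
    """Hypothetical operation, defined only to illustrate config loading."""

    db_url: str
    batch_size: int = 1000

    def execute_for_feature(
        self, store, feature_key, *, snapshot_version,
        from_snapshot_version=None, dry_run=False,
    ) -> int:
        return 0  # no-op; this sketch only exercises configuration


os.environ["MY_DB_URL"] = "postgresql://localhost/metaxy"

# The mode="before" validator replaces ${MY_DB_URL} before field validation,
# so the frozen model is constructed with the resolved value.
op = ExampleBackfill(db_url="${MY_DB_URL}")
assert op.db_url == "postgresql://localhost/metaxy"
assert op.batch_size == 1000

An unset variable raises ValueError at construction time, so misconfigured migrations fail before any feature is touched.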
metaxy/models/bases.py
ADDED
@@ -0,0 +1,12 @@
import pydantic


class FrozenBaseModel(pydantic.BaseModel):
    # The class-based Config is deprecated in Pydantic v2; use model_config instead
    model_config = pydantic.ConfigDict(frozen=True)


class VersioningEngineMismatchError(Exception):
    """Raised when versioning_engine='native' is requested but the data has the wrong implementation."""

    pass
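A minimal sketch (not part of the wheel) of the frozen semantics FrozenBaseModel provides; the Point model is hypothetical:

import pydantic

from metaxy.models.bases import FrozenBaseModel


class Point(FrozenBaseModel):
    """Hypothetical model, used only to demonstrate immutability."""

    x: int
    y: int


p = Point(x=1, y=2)
try:
    p.x = 3  # frozen=True rejects attribute assignment
except pydantic.ValidationError:
    print("FrozenBaseModel instances are immutable")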
metaxy/models/constants.py
ADDED
@@ -0,0 +1,139 @@
"""Shared constants for system-managed column names.

All system columns use the metaxy_ prefix to avoid conflicts with user columns.
"""

from __future__ import annotations

# Default code version for initial feature definitions
DEFAULT_CODE_VERSION = "__metaxy_initial__"

# System column prefix
SYSTEM_COLUMN_PREFIX = "metaxy_"

# --- System Column Names -----------------------------------------------------------
# All system columns that Metaxy manages internally. These columns are automatically
# added to metadata DataFrames and should not be defined by users.

METAXY_PROVENANCE_BY_FIELD = f"{SYSTEM_COLUMN_PREFIX}provenance_by_field"
"""Field-level provenance hashes (struct column mapping field names to hashes)."""

METAXY_PROVENANCE = f"{SYSTEM_COLUMN_PREFIX}provenance"
"""Hash of `metaxy_provenance_by_field` -- a single string value."""

METAXY_FEATURE_VERSION = f"{SYSTEM_COLUMN_PREFIX}feature_version"
"""Hash of the feature definition (dependencies + fields + code_versions)."""

METAXY_SNAPSHOT_VERSION = f"{SYSTEM_COLUMN_PREFIX}snapshot_version"
"""Hash of the entire feature graph snapshot (recorded during deployment)."""

METAXY_FEATURE_SPEC_VERSION = f"{SYSTEM_COLUMN_PREFIX}feature_spec_version"
"""Hash of the complete feature specification."""

METAXY_FULL_DEFINITION_VERSION = f"{SYSTEM_COLUMN_PREFIX}full_definition_version"
"""Hash of the complete feature definition including Pydantic schema, feature spec, and project.

This comprehensive hash captures ALL aspects of a feature definition:
- Pydantic model schema (field types, descriptions, validators, serializers, etc.)
- Feature specification (dependencies, fields, code_versions, metadata)
- Project name

Used in system tables to detect when ANY part of a feature changes."""

METAXY_DATA_VERSION_BY_FIELD = f"{SYSTEM_COLUMN_PREFIX}data_version_by_field"
"""Field-level data version hashes (struct column mapping field names to version hashes).

Similar to provenance_by_field, but can be user-overridden to implement custom versioning
(e.g., content hashes, timestamps, semantic versions)."""

METAXY_DATA_VERSION = f"{SYSTEM_COLUMN_PREFIX}data_version"
"""Hash of `metaxy_data_version_by_field` -- a single string value."""

METAXY_CREATED_AT = f"{SYSTEM_COLUMN_PREFIX}created_at"
"""Timestamp when the metadata row was created."""

METAXY_MATERIALIZATION_ID = f"{SYSTEM_COLUMN_PREFIX}materialization_id"
"""External orchestration run ID (e.g., Dagster Run ID, Airflow Run ID) for tracking pipeline executions."""

# --- System Column Sets ------------------------------------------------------------

ALL_SYSTEM_COLUMNS = frozenset(
    {
        METAXY_PROVENANCE_BY_FIELD,
        METAXY_PROVENANCE,
        METAXY_FEATURE_VERSION,
        METAXY_SNAPSHOT_VERSION,
        METAXY_DATA_VERSION_BY_FIELD,
        METAXY_DATA_VERSION,
        METAXY_CREATED_AT,
        METAXY_MATERIALIZATION_ID,
    }
)
"""All Metaxy-managed column names that are injected into feature tables."""

# Columns that should be dropped when joining upstream features (will be recalculated)
_DROPPABLE_COLUMNS = frozenset(
    {
        METAXY_FEATURE_VERSION,
        METAXY_SNAPSHOT_VERSION,
        METAXY_CREATED_AT,
        METAXY_DATA_VERSION_BY_FIELD,
        METAXY_DATA_VERSION,
        METAXY_MATERIALIZATION_ID,
    }
)


# --- Utility Functions -------------------------------------------------------------


def is_system_column(name: str) -> bool:
    """Check whether a column name is a system-managed column.

    Args:
        name: Column name to check

    Returns:
        True if the column is a system column, False otherwise

    Examples:
        >>> is_system_column("metaxy_feature_version")
        True
        >>> is_system_column("my_column")
        False
    """
    return name in ALL_SYSTEM_COLUMNS


def is_droppable_system_column(name: str) -> bool:
    """Check whether a column should be dropped when joining upstream features.

    Droppable columns (feature_version, snapshot_version, etc.) are recalculated for
    each feature, so keeping them from upstream would cause conflicts.

    Args:
        name: Column name to check

    Returns:
        True if the column should be dropped during joins, False otherwise

    Examples:
        >>> is_droppable_system_column("metaxy_feature_version")
        True
        >>> is_droppable_system_column("metaxy_provenance_by_field")
        False
    """
    return name in _DROPPABLE_COLUMNS


# System columns that have lineage from upstream features.
# These columns are computed from corresponding upstream columns (same column name).
# With 5 parents, each of these columns will have 5 dependencies.
SYSTEM_COLUMNS_WITH_LINEAGE: frozenset[str] = frozenset(
    {
        METAXY_PROVENANCE_BY_FIELD,
        METAXY_PROVENANCE,
        METAXY_DATA_VERSION_BY_FIELD,
        METAXY_DATA_VERSION,
    }
)
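A minimal sketch (not part of the wheel) of how the two predicates classify columns when preparing an upstream join; the column list is hypothetical:

from metaxy.models.constants import (
    is_droppable_system_column,
    is_system_column,
)

upstream_columns = [
    "sample_uid",                   # user column: kept as-is
    "metaxy_provenance_by_field",   # system column with lineage: kept for provenance calculation
    "metaxy_feature_version",       # system column recalculated per feature: dropped
]

# Drop the columns that each feature recomputes for itself
kept = [c for c in upstream_columns if not is_droppable_system_column(c)]
assert kept == ["sample_uid", "metaxy_provenance_by_field"]

assert is_system_column("metaxy_provenance_by_field")
assert not is_system_column("sample_uid")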