metaxy-0.0.1.dev3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/migrations/ops.py ADDED
@@ -0,0 +1,447 @@
+ """Migration operation types."""
+
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING, Any
+
+ import pydantic
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+ if TYPE_CHECKING:
+     from metaxy.metadata_store.base import MetadataStore
+
+
+ class BaseOperation(BaseSettings, ABC):  # pyright: ignore[reportUnsafeMultipleInheritance]
+     """Base class for all migration operations, with environment variable support.
+
+     Operations are instantiated from YAML configs and execute on individual features.
+     Subclasses implement execute_for_feature() to perform the actual migration logic.
+
+     Environment variables are read automatically via pydantic_settings. Define config
+     fields as regular Pydantic fields and they will be populated from env vars or from
+     the config dict.
+
+     The 'type' field is computed automatically from the class's module and name.
+
+     Example:
+         class PostgreSQLBackfill(BaseOperation):
+             postgresql_url: str  # Read from the POSTGRESQL_URL env var or the config dict
+             batch_size: int = 1000  # Optional, with a default
+
+             def execute_for_feature(self, store, feature_key, *, snapshot_version, from_snapshot_version=None, dry_run=False):
+                 # Implementation here
+                 return 0
+     """
+
+     model_config = SettingsConfigDict(
+         extra="ignore",  # Ignore extra fields like 'type' and 'features' from YAML
+         frozen=True,
+     )
+
+     @pydantic.model_validator(mode="before")
+     @classmethod
+     def _substitute_env_vars(cls, data: dict[str, Any]) -> dict[str, Any]:
+         """Substitute ${VAR} patterns with environment variables.
+
+         Example:
+             postgresql_url: "${POSTGRESQL_URL}" -> postgresql_url: "postgresql://..."
+         """
+         import os
+         import re
+
+         def substitute_value(value):
+             if isinstance(value, str):
+                 # Replace each ${VAR} with os.environ["VAR"], failing fast when unset
+                 def replacer(match):
+                     var_name = match.group(1)
+                     env_value = os.environ.get(var_name)
+                     if env_value is None:
+                         raise ValueError(f"Environment variable {var_name} is not set")
+                     return env_value
+
+                 return re.sub(r"\$\{([^}]+)\}", replacer, value)
+             return value
+
+         # Build a new dict to avoid mutating the input
+         result = {}
+         for key, value in data.items():
+             result[key] = substitute_value(value)
+         return result
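For illustration, a minimal sketch of how the ${VAR} substitution behaves at construction time; the OperationWithUrl subclass below is hypothetical and exists only for this example:

    import os
    import pydantic
    from metaxy.migrations.ops import BaseOperation

    class OperationWithUrl(BaseOperation):  # hypothetical example subclass
        postgresql_url: str

        def execute_for_feature(self, store, feature_key, *, snapshot_version,
                                from_snapshot_version=None, dry_run=False):
            return 0

    os.environ["POSTGRESQL_URL"] = "postgresql://localhost:5432/db"

    # "${POSTGRESQL_URL}" is replaced by the model validator before field validation
    op = OperationWithUrl(postgresql_url="${POSTGRESQL_URL}")
    assert op.postgresql_url == "postgresql://localhost:5432/db"

    # An unset variable fails fast: the ValueError raised inside the validator
    # surfaces as a pydantic.ValidationError instead of keeping a literal "${...}"
    try:
        OperationWithUrl(postgresql_url="${MISSING_VAR}")
    except pydantic.ValidationError:
        pass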
+
+     @property
+     def type(self) -> str:
+         """Return the fully qualified class name for this operation."""
+         return f"{self.__class__.__module__}.{self.__class__.__name__}"
+
+     @abstractmethod
+     def execute_for_feature(
+         self,
+         store: "MetadataStore",
+         feature_key: str,
+         *,
+         snapshot_version: str,
+         from_snapshot_version: str | None = None,
+         dry_run: bool = False,
+     ) -> int:
+         """Execute the operation for a single feature.
+
+         Args:
+             store: Metadata store to operate on
+             feature_key: Feature key string (e.g., "video/scene")
+             snapshot_version: Target snapshot version
+             from_snapshot_version: Source snapshot version (optional, for cross-snapshot migrations)
+             dry_run: If True, only validate and return the row count without executing
+
+         Returns:
+             Number of rows affected
+
+         Raises:
+             Exception: If the operation fails
+         """
+         pass
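Because model_config ignores extra keys and 'type' is a dotted import path, a loader can resolve an operation class and construct it straight from the YAML mapping. A sketch of one plausible approach (the actual loader in metaxy/migrations/loader.py may differ):

    import importlib

    def resolve_operation(op_config: dict) -> BaseOperation:
        # "pkg.module.ClassName" -> import pkg.module, then grab ClassName
        module_path, _, class_name = op_config["type"].rpartition(".")
        op_cls = getattr(importlib.import_module(module_path), class_name)
        # extra="ignore" drops 'type' and 'features' automatically, so the
        # raw YAML mapping can be passed through as-is
        return op_cls(**op_config)

    op = resolve_operation(
        {
            "type": "metaxy.migrations.ops.DataVersionReconciliation",
            "features": ["video/scene", "video/frames"],
        }
    )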
+
+
+ class DataVersionReconciliation(BaseOperation):
+     """Reconcile field provenance when a feature definition changes BUT the computation is unchanged.
+
+     This operation applies to the affected features specified in the migration
+     configuration. Feature keys are provided in the migration YAML operations list.
+
+     This operation:
+     1. For each affected feature, derives old/new feature_versions from snapshots
+     2. Finds rows with the old feature_version
+     3. Recalculates field_provenance based on the new feature definition
+     4. Writes new rows with updated feature_version and provenance_by_field
+     5. Preserves all user metadata columns (immutable)
+
+     Use ONLY when code changed but computation results would be identical:
+     - Dependency graph refactoring (more precise field dependencies)
+     - Field structure changes (renaming, splitting, better schema)
+     - Code organization improvements (imports, typing, refactoring)
+
+     Do NOT use when the computation actually changed:
+     - Different algorithm/model → re-run the pipeline instead
+     - Bug fixes that affect output → re-run the pipeline instead
+     - New model version → re-run the pipeline instead
+
+     Feature versions are derived automatically from the migration's snapshot versions.
+
+     Example YAML:
+         operations:
+           - type: metaxy.migrations.ops.DataVersionReconciliation
+             features: ["video/scene", "video/frames"]
+     """
+
+     def execute_for_feature(
+         self,
+         store: "MetadataStore",
+         feature_key: str,
+         *,
+         snapshot_version: str,
+         from_snapshot_version: str | None = None,
+         dry_run: bool = False,
+     ) -> int:
+         """Execute field provenance reconciliation for a single feature.
+
+         Only works for features with upstream dependencies. For root features
+         (no upstream), field_provenance values are user-defined and cannot be
+         reconciled automatically; the user must re-run their computation pipeline.
+
+         Process:
+         1. Verify the feature has upstream dependencies
+         2. Query old and new feature_versions from snapshot metadata
+         3. Load existing metadata with the old feature_version
+         4. Use resolve_update() to calculate expected field_provenance based on current upstream
+         5. Join existing user metadata with the new field_provenance
+         6. Write with the new feature_version and snapshot_version
+
+         Args:
+             store: Metadata store
+             feature_key: Feature key string (e.g., "examples/child")
+             snapshot_version: Target snapshot version (new state)
+             from_snapshot_version: Source snapshot version (old state, required for this operation)
+             dry_run: If True, return the row count without executing
+
+         Returns:
+             Number of rows affected
+
+         Raises:
+             ValueError: If the feature has no upstream dependencies (root feature)
+                 or from_snapshot_version is not provided
+         """
+         if from_snapshot_version is None:
+             raise ValueError(
+                 f"DataVersionReconciliation requires from_snapshot_version for feature {feature_key}"
+             )
+
+         to_snapshot_version = snapshot_version
+         import narwhals as nw
+
+         from metaxy.metadata_store.base import allow_feature_version_override
+         from metaxy.metadata_store.exceptions import FeatureNotFoundError
+         from metaxy.metadata_store.system import FEATURE_VERSIONS_KEY
+         from metaxy.models.feature import FeatureGraph
+         from metaxy.models.types import FeatureKey
+
+         feature_key_obj = FeatureKey(feature_key.split("/"))
+         feature_key_str = feature_key_obj.to_string()
+         graph = FeatureGraph.get_active()
+         feature_cls = graph.features_by_key[feature_key_obj]
+
+         # 1. Verify the feature has upstream dependencies
+         plan = graph.get_feature_plan(feature_key_obj)
+         has_upstream = plan.deps is not None and len(plan.deps) > 0
+
+         if not has_upstream:
+             raise ValueError(
+                 f"DataVersionReconciliation cannot be used for root feature {feature_key_str}. "
+                 f"Root features have user-defined field_provenance that cannot be automatically reconciled. "
+                 f"User must re-run their computation pipeline to generate new data."
+             )
+
+         # 2. Query feature versions from snapshot metadata
+         try:
+             from_version_data = store.read_metadata(
+                 FEATURE_VERSIONS_KEY,
+                 current_only=False,
+                 allow_fallback=False,
+                 filters=[
+                     (nw.col("metaxy_snapshot_version") == from_snapshot_version)
+                     & (nw.col("feature_key") == feature_key_str)
+                 ],
+             )
+         except FeatureNotFoundError:
+             from_version_data = None
+
+         try:
+             to_version_data = store.read_metadata(
+                 FEATURE_VERSIONS_KEY,
+                 current_only=False,
+                 allow_fallback=False,
+                 filters=[
+                     (nw.col("metaxy_snapshot_version") == to_snapshot_version)
+                     & (nw.col("feature_key") == feature_key_str)
+                 ],
+             )
+         except FeatureNotFoundError:
+             to_version_data = None
+
+         # Extract feature versions from the lazy frames. Since we filter by
+         # snapshot_version and feature_key, there should be exactly one row;
+         # feature_spec_version changes don't matter here, so take the first
+         # row without sorting.
+         from_feature_version: str | None = None
+         to_feature_version: str | None = None
+
+         if from_version_data is not None:
+             # .head(1) limits at the query level - no sort needed
+             from_version_df = from_version_data.head(1).collect()
+             if from_version_df.shape[0] > 0:
+                 from_feature_version = str(from_version_df["metaxy_feature_version"][0])
+             else:
+                 from_version_data = None
+
+         if to_version_data is not None:
+             # .head(1) limits at the query level - no sort needed
+             to_version_df = to_version_data.head(1).collect()
+             if to_version_df.shape[0] > 0:
+                 to_feature_version = str(to_version_df["metaxy_feature_version"][0])
+             else:
+                 to_version_data = None
+
+         if from_version_data is None:
+             raise ValueError(
+                 f"Feature {feature_key_str} not found in from_snapshot {from_snapshot_version}"
+             )
+         if to_version_data is None:
+             raise ValueError(
+                 f"Feature {feature_key_str} not found in to_snapshot {to_snapshot_version}"
+             )
+
+         assert from_feature_version is not None
+         assert to_feature_version is not None
+
+         # 3. Load existing metadata with the old feature_version
+         try:
+             existing_metadata = store.read_metadata(
+                 feature_cls,
+                 current_only=False,
+                 filters=[nw.col("metaxy_feature_version") == from_feature_version],
+                 allow_fallback=False,
+             )
+         except FeatureNotFoundError:
+             # Feature doesn't exist yet - nothing to migrate
+             return 0
+
+         # Collect to check existence and get the row count
+         existing_metadata_df = existing_metadata.collect()
+         if existing_metadata_df.shape[0] == 0:
+             # Already migrated (idempotent)
+             return 0
+
+         if dry_run:
+             return existing_metadata_df.shape[0]
+
+         # 4. Get sample metadata (exclude system columns)
+         user_columns = [
+             c
+             for c in existing_metadata_df.columns
+             if c
+             not in [
+                 "metaxy_provenance_by_field",
+                 "metaxy_feature_version",
+                 "metaxy_snapshot_version",
+             ]
+         ]
+         sample_metadata = existing_metadata_df.select(user_columns)
+
+         # 5. Use resolve_update to calculate field_provenance based on current upstream.
+         # Don't pass samples - let resolve_update auto-load upstream and calculate provenance_by_field
+         diff_result = store.resolve_update(feature_cls)
+
+         # Convert to Polars for the join to avoid cross-backend issues
+         sample_metadata_pl = nw.from_native(sample_metadata.to_native()).to_polars()
+
+         # Use 'changed' for reconciliation (field_provenance changed due to upstream)
+         # and 'added' for new feature materialization; convert results to Polars
+         # for consistent joining
+         if len(diff_result.changed) > 0:
+             changed_pl = nw.from_native(diff_result.changed.to_native()).to_polars()
+             new_provenance = changed_pl.select(
+                 ["sample_uid", "metaxy_provenance_by_field"]
+             )
+             df_to_write = sample_metadata_pl.join(
+                 new_provenance, on="sample_uid", how="inner"
+             )
+         elif len(diff_result.added) > 0:
+             df_to_write = nw.from_native(diff_result.added.to_native()).to_polars()
+         else:
+             return 0
+
+         # 6. Write with the new feature_version and snapshot_version,
+         # wrapped in Narwhals for write_metadata
+         df_to_write_nw = nw.from_native(df_to_write)
+         df_to_write_nw = df_to_write_nw.with_columns(
+             nw.lit(to_feature_version).alias("metaxy_feature_version"),
+             nw.lit(to_snapshot_version).alias("metaxy_snapshot_version"),
+         )
+
+         with allow_feature_version_override():
+             with store.allow_cross_project_writes():
+                 store.write_metadata(feature_cls, df_to_write_nw)
+
+         return len(df_to_write)
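A sketch of invoking the operation directly; store is assumed to be an already-configured MetadataStore, and the snapshot hashes are hypothetical placeholders:

    op = DataVersionReconciliation()

    # dry_run reports how many rows still carry the old feature_version
    pending = op.execute_for_feature(
        store,
        "video/scene",
        snapshot_version="a1b2c3",       # hypothetical target snapshot
        from_snapshot_version="d4e5f6",  # hypothetical source snapshot
        dry_run=True,
    )

    # A real run after a successful one returns 0 (idempotent)
    if pending:
        op.execute_for_feature(
            store,
            "video/scene",
            snapshot_version="a1b2c3",
            from_snapshot_version="d4e5f6",
        )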
+
+
+ class MetadataBackfill(BaseOperation, ABC):
+     """Base class for metadata backfill operations.
+
+     Users subclass this to implement custom backfill logic with complete
+     control over the entire process: loading, transforming, joining, filtering,
+     and writing metadata.
+
+     The user implements execute_for_feature() and can:
+     - Load metadata from any external source (S3, database, API, etc.)
+     - Perform custom transformations and filtering
+     - Join with Metaxy's calculated field_provenance however they want
+     - Write the results to the store
+
+     Example subclass:
+         class S3VideoBackfill(MetadataBackfill):
+             s3_bucket: str
+             s3_prefix: str
+             min_size_mb: int = 10
+
+             def execute_for_feature(
+                 self,
+                 store,
+                 feature_key,
+                 *,
+                 snapshot_version,
+                 from_snapshot_version=None,
+                 dry_run=False,
+             ):
+                 import boto3
+                 import polars as pl
+
+                 from metaxy.models.feature import FeatureGraph
+                 from metaxy.models.types import FeatureKey
+
+                 # Load from S3
+                 s3 = boto3.client("s3")
+                 objects = s3.list_objects_v2(
+                     Bucket=self.s3_bucket,
+                     Prefix=self.s3_prefix,
+                 )
+
+                 external_df = pl.DataFrame([
+                     {
+                         "sample_uid": obj["Key"],
+                         "path": f"s3://{self.s3_bucket}/{obj['Key']}",
+                         "size_bytes": obj["Size"],
+                     }
+                     for obj in objects["Contents"]
+                 ])
+
+                 # Filter
+                 external_df = external_df.filter(
+                     pl.col("size_bytes") > self.min_size_mb * 1024 * 1024
+                 )
+
+                 if dry_run:
+                     return len(external_df)
+
+                 # Get field provenance from Metaxy
+                 graph = FeatureGraph.get_active()
+                 feature_key_obj = FeatureKey(feature_key.split("/"))
+                 feature_cls = graph.features_by_key[feature_key_obj]
+
+                 diff = store.resolve_update(
+                     feature_cls,
+                     samples=external_df.select(["sample_uid"]),
+                 )
+
+                 # Join external metadata with the calculated field_provenance
+                 to_write = external_df.join(diff.added, on="sample_uid", how="inner")
+
+                 # Write
+                 store.write_metadata(feature_cls, to_write)
+                 return len(to_write)
+
+     Example YAML:
+         operations:
+           - type: "myproject.migrations.S3VideoBackfill"
+             features: ["video/files"]
+             s3_bucket: "prod-videos"
+             s3_prefix: "processed/"
+             min_size_mb: 10
+     """
+
+     # No additional required fields - user subclasses add their own
+
+     @abstractmethod
+     def execute_for_feature(
+         self,
+         store: "MetadataStore",
+         feature_key: str,
+         *,
+         snapshot_version: str,
+         from_snapshot_version: str | None = None,
+         dry_run: bool = False,
+     ) -> int:
+         """User implements backfill logic for a single feature.
+
+         The user has complete control over:
+         - Loading external metadata (S3, database, API, files, etc.)
+         - Transforming and filtering data
+         - Joining with Metaxy's field_provenance
+         - Writing to the store
+
+         Args:
+             store: Metadata store to write to
+             feature_key: Feature key string (e.g., "video/files")
+             snapshot_version: Target snapshot version
+             from_snapshot_version: Source snapshot version (optional, for cross-snapshot backfills)
+             dry_run: If True, validate and return the count without writing
+
+         Returns:
+             Number of rows written (or that would be written if dry_run)
+
+         Raises:
+             Exception: If the backfill fails (will be recorded in migration progress)
+         """
+         pass
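Since every operation exposes the same per-feature entry point, a runner can preview an entire operation before committing to it. A sketch under that assumption (the actual executor in metaxy/migrations/executor.py may behave differently):

    def preview_operation(
        op: BaseOperation,
        features: list[str],
        store: "MetadataStore",
        *,
        snapshot_version: str,
        from_snapshot_version: str | None = None,
    ) -> int:
        """Sum dry-run row counts across all affected features before executing."""
        total = 0
        for feature_key in features:
            total += op.execute_for_feature(
                store,
                feature_key,
                snapshot_version=snapshot_version,
                from_snapshot_version=from_snapshot_version,
                dry_run=True,
            )
        return total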
metaxy/models/bases.py ADDED
@@ -0,0 +1,12 @@
+ import pydantic
+
+
+ class FrozenBaseModel(pydantic.BaseModel):
+     # The class-based Config is deprecated in Pydantic v2; use model_config instead
+     model_config = pydantic.ConfigDict(frozen=True)
+
+
+ class VersioningEngineMismatchError(Exception):
+     """Raised when versioning_engine='native' is requested but the data has the wrong implementation."""
+
+     pass
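With frozen=True, Pydantic v2 rejects attribute assignment after construction and makes instances hashable. A small sketch (the Point model is hypothetical, introduced only for illustration):

    import pydantic
    from metaxy.models.bases import FrozenBaseModel

    class Point(FrozenBaseModel):  # hypothetical example model
        x: int
        y: int

    p = Point(x=1, y=2)

    try:
        p.x = 10  # frozen instances reject mutation
    except pydantic.ValidationError:
        pass

    # Frozen models are hashable, so they work as dict keys and set members
    assert hash(Point(x=1, y=2)) == hash(p)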
metaxy/models/constants.py ADDED
@@ -0,0 +1,139 @@
+ """Shared constants for system-managed column names.
+
+ All system columns use the metaxy_ prefix to avoid conflicts with user columns.
+ """
+
+ from __future__ import annotations
+
+ # Default code version for initial feature definitions
+ DEFAULT_CODE_VERSION = "__metaxy_initial__"
+
+ # System column prefix
+ SYSTEM_COLUMN_PREFIX = "metaxy_"
+
+ # --- System Column Names -----------------------------------------------------------
+ # All system columns that Metaxy manages internally. These columns are automatically
+ # added to metadata DataFrames and should not be defined by users.
+
+ METAXY_PROVENANCE_BY_FIELD = f"{SYSTEM_COLUMN_PREFIX}provenance_by_field"
+ """Field-level provenance hashes (struct column mapping field names to hashes)."""
+
+ METAXY_PROVENANCE = f"{SYSTEM_COLUMN_PREFIX}provenance"
+ """Hash of `metaxy_provenance_by_field` -- a single string value."""
+
+ METAXY_FEATURE_VERSION = f"{SYSTEM_COLUMN_PREFIX}feature_version"
+ """Hash of the feature definition (dependencies + fields + code_versions)."""
+
+ METAXY_SNAPSHOT_VERSION = f"{SYSTEM_COLUMN_PREFIX}snapshot_version"
+ """Hash of the entire feature graph snapshot (recorded during deployment)."""
+
+ METAXY_FEATURE_SPEC_VERSION = f"{SYSTEM_COLUMN_PREFIX}feature_spec_version"
+ """Hash of the complete feature specification."""
+
+ METAXY_FULL_DEFINITION_VERSION = f"{SYSTEM_COLUMN_PREFIX}full_definition_version"
+ """Hash of the complete feature definition, including the Pydantic schema, feature spec, and project.
+
+ This comprehensive hash captures ALL aspects of a feature definition:
+ - Pydantic model schema (field types, descriptions, validators, serializers, etc.)
+ - Feature specification (dependencies, fields, code_versions, metadata)
+ - Project name
+
+ Used in system tables to detect when ANY part of a feature changes."""
+
+ METAXY_DATA_VERSION_BY_FIELD = f"{SYSTEM_COLUMN_PREFIX}data_version_by_field"
+ """Field-level data version hashes (struct column mapping field names to version hashes).
+
+ Similar to provenance_by_field, but can be overridden by users to implement custom
+ versioning (e.g., content hashes, timestamps, semantic versions)."""
+
+ METAXY_DATA_VERSION = f"{SYSTEM_COLUMN_PREFIX}data_version"
+ """Hash of `metaxy_data_version_by_field` -- a single string value."""
+
+ METAXY_CREATED_AT = f"{SYSTEM_COLUMN_PREFIX}created_at"
+ """Timestamp when the metadata row was created."""
+
+ METAXY_MATERIALIZATION_ID = f"{SYSTEM_COLUMN_PREFIX}materialization_id"
+ """External orchestration run ID (e.g., a Dagster or Airflow run ID) for tracking pipeline executions."""
+
+ # --- System Column Sets ------------------------------------------------------------
+
+ ALL_SYSTEM_COLUMNS = frozenset(
+     {
+         METAXY_PROVENANCE_BY_FIELD,
+         METAXY_PROVENANCE,
+         METAXY_FEATURE_VERSION,
+         METAXY_SNAPSHOT_VERSION,
+         METAXY_DATA_VERSION_BY_FIELD,
+         METAXY_DATA_VERSION,
+         METAXY_CREATED_AT,
+         METAXY_MATERIALIZATION_ID,
+     }
+ )
+ """All Metaxy-managed column names that are injected into feature tables."""
+
+ # Columns that should be dropped when joining upstream features (they are recalculated)
+ _DROPPABLE_COLUMNS = frozenset(
+     {
+         METAXY_FEATURE_VERSION,
+         METAXY_SNAPSHOT_VERSION,
+         METAXY_CREATED_AT,
+         METAXY_DATA_VERSION_BY_FIELD,
+         METAXY_DATA_VERSION,
+         METAXY_MATERIALIZATION_ID,
+     }
+ )
+
+
+ # --- Utility Functions -------------------------------------------------------------
+
+
+ def is_system_column(name: str) -> bool:
+     """Check whether a column name is a system-managed column.
+
+     Args:
+         name: Column name to check
+
+     Returns:
+         True if the column is a system column, False otherwise
+
+     Examples:
+         >>> is_system_column("metaxy_feature_version")
+         True
+         >>> is_system_column("my_column")
+         False
+     """
+     return name in ALL_SYSTEM_COLUMNS
+
+
+ def is_droppable_system_column(name: str) -> bool:
+     """Check whether a column should be dropped when joining upstream features.
+
+     Droppable columns (e.g., feature_version, snapshot_version, created_at) are
+     recalculated for each feature, so keeping them from upstream would cause conflicts.
+
+     Args:
+         name: Column name to check
+
+     Returns:
+         True if the column should be dropped during joins, False otherwise
+
+     Examples:
+         >>> is_droppable_system_column("metaxy_feature_version")
+         True
+         >>> is_droppable_system_column("metaxy_provenance_by_field")
+         False
+     """
+     return name in _DROPPABLE_COLUMNS
+
+
+ # System columns that carry lineage from upstream features. These columns are
+ # computed from the corresponding upstream columns (same column name); with
+ # 5 parents, each of these columns has 5 dependencies.
+ SYSTEM_COLUMNS_WITH_LINEAGE: frozenset[str] = frozenset(
+     {
+         METAXY_PROVENANCE_BY_FIELD,
+         METAXY_PROVENANCE,
+         METAXY_DATA_VERSION_BY_FIELD,
+         METAXY_DATA_VERSION,
+     }
+ )
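A sketch of how these helpers might be used to separate user columns from system columns before joining upstream metadata; the column list is hypothetical:

    from metaxy.models.constants import is_droppable_system_column, is_system_column

    columns = [
        "sample_uid",
        "path",
        "metaxy_feature_version",
        "metaxy_provenance_by_field",
    ]

    # User-defined columns only
    user_columns = [c for c in columns if not is_system_column(c)]
    assert user_columns == ["sample_uid", "path"]

    # Columns safe to carry through an upstream join (droppable ones get recalculated)
    join_columns = [c for c in columns if not is_droppable_system_column(c)]
    assert join_columns == ["sample_uid", "path", "metaxy_provenance_by_field"]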