metaxy 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metaxy might be problematic. Click here for more details.

Files changed (75) hide show
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,105 @@
1
+ from collections.abc import Mapping
2
+ from functools import cached_property
3
+
4
+ import pydantic
5
+
6
+ from metaxy.models.constants import ALL_SYSTEM_COLUMNS
7
+ from metaxy.models.field import FieldSpec, SpecialFieldDep
8
+ from metaxy.models.types import FeatureKey, FieldKey
9
+
10
+
11
class FeatureDep(pydantic.BaseModel):
    """Declares a dependency on an upstream feature, with optional column
    selection and renaming applied to that feature's metadata.

    Attributes:
        key: The feature key to depend on.
        columns: Optional tuple of column names to select from the upstream
            feature.
            - None (default): keep all columns from upstream
            - Empty tuple (): keep only system columns (sample_uid,
              data_version, etc.)
            - Tuple of names: keep only the named columns (plus system columns)
        rename: Optional mapping of old column names to new names,
            applied after column selection.

    Examples:
        >>> # Keep all columns (default behavior)
        >>> FeatureDep(key=FeatureKey(["upstream"]))

        >>> # Keep only specific columns
        >>> FeatureDep(
        ...     key=FeatureKey(["upstream"]),
        ...     columns=("col1", "col2")
        ... )

        >>> # Rename columns to avoid conflicts
        >>> FeatureDep(
        ...     key=FeatureKey(["upstream"]),
        ...     rename={"old_name": "new_name"}
        ... )

        >>> # Select and rename
        >>> FeatureDep(
        ...     key=FeatureKey(["upstream"]),
        ...     columns=("col1", "col2"),
        ...     rename={"col1": "upstream_col1"}
        ... )
    """

    key: FeatureKey
    # None = all columns, () = only system columns
    columns: tuple[str, ...] | None = None
    # Column renaming mapping (old name -> new name)
    rename: dict[str, str] | None = None

    @pydantic.model_validator(mode="after")
    def validate_column_operations(self) -> "FeatureDep":
        """Validate column selection and renaming operations."""
        # Renaming onto a reserved system column would corrupt bookkeeping,
        # so reject any such target name up front.
        for new_name in (self.rename or {}).values():
            if new_name in ALL_SYSTEM_COLUMNS:
                raise ValueError(
                    f"Cannot rename column to system column name: {new_name}. "
                    f"System columns: {ALL_SYSTEM_COLUMNS}"
                )
        return self

    def table_name(self) -> str:
        """Get SQL-like table name for this feature spec."""
        return self.key.table_name
68
+
69
+
70
class FeatureSpec(pydantic.BaseModel):
    """Static specification of a feature: its key, upstream dependencies,
    versioned fields, and overall code version."""

    key: FeatureKey
    deps: list[FeatureDep] | None
    # Default: a single "default" field depending on everything upstream.
    fields: list[FieldSpec] = pydantic.Field(
        default_factory=lambda: [
            FieldSpec(
                key=FieldKey(["default"]),
                code_version=1,
                deps=SpecialFieldDep.ALL,
            )
        ]
    )
    code_version: int = 1

    @cached_property
    def fields_by_key(self) -> Mapping[FieldKey, FieldSpec]:
        """Field specs indexed by their field key."""
        return {spec.key: spec for spec in self.fields}

    def table_name(self) -> str:
        """Get SQL-like table name for this feature spec."""
        return self.key.table_name

    @pydantic.model_validator(mode="after")
    def validate_unique_field_keys(self) -> "FeatureSpec":
        """Validate that all fields have unique keys."""
        seen: set[tuple[str, ...]] = set()
        for spec in self.fields:
            # Tuples are hashable even when the key is a plain list.
            key_as_tuple = tuple(spec.key)
            if key_as_tuple in seen:
                raise ValueError(
                    f"Duplicate field key found: {spec.key}. "
                    f"All fields must have unique keys."
                )
            seen.add(key_as_tuple)
        return self
metaxy/models/field.py ADDED
@@ -0,0 +1,25 @@
1
+ from enum import Enum
2
+
3
+ from pydantic import Field as PydanticField
4
+
5
+ from metaxy.models.bases import FrozenBaseModel
6
+ from metaxy.models.types import FeatureKey, FieldKey
7
+
8
+
9
class SpecialFieldDep(Enum):
    """Sentinel dependency values with special meaning."""

    # Depend on every upstream feature and all of its fields.
    ALL = "__METAXY_ALL_DEP__"
11
+
12
+
13
class FieldDep(FrozenBaseModel):
    """Dependency on particular fields (or all fields) of one upstream feature."""

    feature_key: FeatureKey
    # Either an explicit list of upstream field keys, or ALL of them.
    fields: list[FieldKey] | SpecialFieldDep = SpecialFieldDep.ALL
16
+
17
+
18
class FieldSpec(FrozenBaseModel):
    """Versioned logical component ("field") of a feature."""

    key: FieldKey = PydanticField(default_factory=lambda: FieldKey(["default"]))
    code_version: int = 1

    # field-level dependencies can be one of the following:
    # - the default SpecialFieldDep.ALL to depend on all upstream features and all their fields
    # - a list of FieldDep to depend on particular fields of specific features
    deps: SpecialFieldDep | list[FieldDep] = SpecialFieldDep.ALL
metaxy/models/plan.py ADDED
@@ -0,0 +1,155 @@
1
+ from collections.abc import Mapping
2
+ from functools import cached_property
3
+
4
+ from metaxy.models.bases import FrozenBaseModel
5
+ from metaxy.models.feature_spec import FeatureKey, FeatureSpec
6
+ from metaxy.models.field import (
7
+ FieldDep,
8
+ FieldKey,
9
+ FieldSpec,
10
+ SpecialFieldDep,
11
+ )
12
+
13
+
14
class FQFieldKey(FrozenBaseModel):
    """Fully qualified field key: a field key scoped by its owning feature."""

    field: FieldKey
    feature: FeatureKey

    def to_string(self) -> str:
        """Render as ``<feature>.<field>``."""
        return f"{self.feature.to_string()}.{self.field.to_string()}"

    def __repr__(self) -> str:
        return self.to_string()

    # Ordering is lexicographic on the string form, so collections of
    # FQFieldKey objects sort deterministically.

    def __lt__(self, other: "FQFieldKey") -> bool:
        """Enable sorting of FQFieldKey objects."""
        return self.to_string() < other.to_string()

    def __le__(self, other: "FQFieldKey") -> bool:
        """Enable sorting of FQFieldKey objects."""
        return self.to_string() <= other.to_string()

    def __gt__(self, other: "FQFieldKey") -> bool:
        """Enable sorting of FQFieldKey objects."""
        return self.to_string() > other.to_string()

    def __ge__(self, other: "FQFieldKey") -> bool:
        """Enable sorting of FQFieldKey objects."""
        return self.to_string() >= other.to_string()
39
+
40
+
41
class FeaturePlan(FrozenBaseModel):
    """Slice of the feature graph that includes a given feature and its parents"""

    feature: FeatureSpec
    deps: list[FeatureSpec] | None

    @cached_property
    def parent_features_by_key(self) -> Mapping[FeatureKey, FeatureSpec]:
        """Upstream feature specs indexed by their feature key."""
        return {feature.key: feature for feature in self.deps or []}

    @cached_property
    def all_parent_fields_by_key(self) -> Mapping[FQFieldKey, FieldSpec]:
        """Every field of every upstream feature, indexed by fully-qualified key."""
        res: dict[FQFieldKey, FieldSpec] = {}

        for feature in self.deps or []:
            for field in feature.fields:
                res[FQFieldKey(field=field.key, feature=feature.key)] = field

        return res

    @cached_property
    def parent_fields_by_key(self) -> Mapping[FQFieldKey, FieldSpec]:
        """Upstream fields that this feature's fields actually depend on."""
        res: dict[FQFieldKey, FieldSpec] = {}

        for field in self.feature.fields:
            res.update(self.get_parent_fields_for_field(field.key))

        return res

    def get_parent_fields_for_field(
        self, key: FieldKey
    ) -> Mapping[FQFieldKey, FieldSpec]:
        """Resolve the upstream fields that the field ``key`` depends on.

        Args:
            key: Key of a field declared on this plan's feature.

        Returns:
            Mapping of fully-qualified upstream field keys to their specs.

        Raises:
            ValueError: If a FieldDep carries an unsupported ``fields`` value.
            TypeError: If the field's ``deps`` is neither SpecialFieldDep.ALL
                nor a list of FieldDep.
        """
        res: dict[FQFieldKey, FieldSpec] = {}

        field = self.feature.fields_by_key[key]

        if field.deps == SpecialFieldDep.ALL:
            # We depend on all upstream features and their fields.
            # Use a distinct loop variable so the outer `field` is not shadowed.
            for feature in self.deps or []:
                for parent_field in feature.fields:
                    res[FQFieldKey(field=parent_field.key, feature=feature.key)] = (
                        parent_field
                    )
        elif isinstance(field.deps, list):
            for field_dep in field.deps:
                if field_dep.fields == SpecialFieldDep.ALL:
                    # We depend on all fields of the corresponding upstream feature.
                    for parent_field in self.parent_features_by_key[
                        field_dep.feature_key
                    ].fields:
                        res[
                            FQFieldKey(
                                field=parent_field.key,
                                feature=field_dep.feature_key,
                            )
                        ] = parent_field

                elif isinstance(field_dep.fields, list):
                    # We depend on an explicit list of upstream field keys.
                    # (Checking `field_dep.fields` mirrors `field_dependencies`
                    # below; the previous `isinstance(field_dep, FieldDep)` was
                    # always true for items of a list[FieldDep].)
                    for field_key in field_dep.fields:
                        fq_key = FQFieldKey(
                            field=field_key,
                            feature=field_dep.feature_key,
                        )
                        res[fq_key] = self.all_parent_fields_by_key[fq_key]
                else:
                    raise ValueError(
                        f"Unsupported dependency type: {type(field_dep.fields)}"
                    )
        else:
            raise TypeError(f"Unsupported dependencies type: {type(field.deps)}")

        return res

    @cached_property
    def field_dependencies(
        self,
    ) -> Mapping[FieldKey, Mapping[FeatureKey, list[FieldKey]]]:
        """Get dependencies for each field in this feature.

        Returns a mapping from field key to its upstream dependencies.
        Each dependency maps an upstream feature key to a list of field keys
        that this field depends on.

        This is the format needed by DataVersionResolver.

        Returns:
            Mapping of field keys to their dependency specifications.
            Format: {field_key: {upstream_feature_key: [upstream_field_keys]}}
        """
        result: dict[FieldKey, dict[FeatureKey, list[FieldKey]]] = {}

        for field in self.feature.fields:
            field_deps: dict[FeatureKey, list[FieldKey]] = {}

            if field.deps == SpecialFieldDep.ALL:
                # Depend on all upstream features and all their fields
                for upstream_feature in self.deps or []:
                    field_deps[upstream_feature.key] = [
                        c.key for c in upstream_feature.fields
                    ]
            elif isinstance(field.deps, list):
                # Specific dependencies defined
                for field_dep in field.deps:
                    feature_key = field_dep.feature_key

                    if field_dep.fields == SpecialFieldDep.ALL:
                        # All fields from this upstream feature
                        upstream_feature_spec = self.parent_features_by_key[feature_key]
                        field_deps[feature_key] = [
                            c.key for c in upstream_feature_spec.fields
                        ]
                    elif isinstance(field_dep.fields, list):
                        # Specific fields
                        field_deps[feature_key] = field_dep.fields

            result[field.key] = field_deps

        return result
metaxy/models/types.py ADDED
@@ -0,0 +1,157 @@
1
+ from typing import Any, TypeAlias
2
+
3
+ from pydantic import GetCoreSchemaHandler
4
+ from pydantic_core import core_schema
5
+
6
# Separators used by to_string() on the key types below.
FEATURE_KEY_SEPARATOR = "/"
FIELD_KEY_SEPARATOR = "/"


class FeatureKey(list):  # pyright: ignore[reportMissingTypeArgument]
    """
    Feature key as a list of strings.

    Hashable for use as dict keys in registries.

    Parts cannot contain forward slashes (/) or double underscores (__).
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Validate that no part contains "/" or "__"
        for part in self:
            if not isinstance(part, str):
                raise ValueError(
                    f"FeatureKey parts must be strings, got {type(part).__name__}"
                )
            if "/" in part:
                raise ValueError(
                    f"FeatureKey part '{part}' cannot contain forward slashes (/). "
                    f"Forward slashes are reserved as the separator in to_string(). "
                    f"Use underscores or hyphens instead."
                )
            if "__" in part:
                raise ValueError(
                    f"FeatureKey part '{part}' cannot contain double underscores (__). "
                    f"Use single underscores or hyphens instead."
                )

    def to_string(self) -> str:
        """Join parts with the feature key separator."""
        return FEATURE_KEY_SEPARATOR.join(self)

    def __repr__(self) -> str:
        return self.to_string()

    def __hash__(self):  # pyright: ignore[reportIncompatibleVariableOverride]
        # Lists are unhashable; hash the immutable tuple of parts instead.
        return hash(tuple(self))

    def __eq__(self, other):
        # Plain list equality covers both FeatureKey and other sequences;
        # the previous isinstance branch was identical to the fallback.
        return list.__eq__(self, other)

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: Any, handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """Pydantic schema that preserves FeatureKey type."""
        list_of_str_schema = core_schema.list_schema(core_schema.str_schema())

        return core_schema.no_info_wrap_validator_function(
            cls._validate,
            list_of_str_schema,
            # Serialize as a plain list of strings.
            serialization=core_schema.plain_serializer_function_ser_schema(
                lambda x: list(x)
            ),
        )

    @classmethod
    def _validate(cls, value, handler):
        """Validate and wrap in FeatureKey."""
        if isinstance(value, cls):
            return value
        # Let the list schema validate first
        validated = handler(value)
        # Wrap in FeatureKey (re-runs __init__ part validation)
        return cls(validated)

    @property
    def table_name(self) -> str:
        """Get SQL-like table name for this feature key."""
        return "__".join(self)
84
+
85
+
86
class FieldKey(list):  # pyright: ignore[reportMissingTypeArgument]
    """
    Field key as a list of strings.

    Hashable for use as dict keys in registries.

    Parts cannot contain forward slashes (/) or double underscores (__).
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Annotations added to match FeatureKey.__init__ for consistency.
        super().__init__(*args, **kwargs)
        # Validate that no part contains "/" or "__"
        for part in self:
            if not isinstance(part, str):
                raise ValueError(
                    f"FieldKey parts must be strings, got {type(part).__name__}"
                )
            if "/" in part:
                raise ValueError(
                    f"FieldKey part '{part}' cannot contain forward slashes (/). "
                    f"Forward slashes are reserved as the separator in to_string(). "
                    f"Use underscores or hyphens instead."
                )
            if "__" in part:
                raise ValueError(
                    f"FieldKey part '{part}' cannot contain double underscores (__). "
                    f"Use single underscores or hyphens instead."
                )

    def to_string(self) -> str:
        """Join parts with the field key separator."""
        return FIELD_KEY_SEPARATOR.join(self)

    def __repr__(self) -> str:
        return self.to_string()

    def __hash__(self):  # pyright: ignore[reportIncompatibleVariableOverride]
        # Lists are unhashable; hash the immutable tuple of parts instead.
        return hash(tuple(self))

    def __eq__(self, other):
        # Plain list equality covers both FieldKey and other sequences;
        # the previous isinstance branch was identical to the fallback.
        return list.__eq__(self, other)

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: Any, handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """Pydantic schema that preserves FieldKey type."""
        list_of_str_schema = core_schema.list_schema(core_schema.str_schema())

        return core_schema.no_info_wrap_validator_function(
            cls._validate,
            list_of_str_schema,
            # Serialize as a plain list of strings.
            serialization=core_schema.plain_serializer_function_ser_schema(
                lambda x: list(x)
            ),
        )

    @classmethod
    def _validate(cls, value, handler):
        """Validate and wrap in FieldKey."""
        if isinstance(value, cls):
            return value
        # Let the list schema validate first
        validated = handler(value)
        # Wrap in FieldKey (re-runs __init__ part validation)
        return cls(validated)


# Free-form metadata attached to a feature dependency.
FeatureDepMetadata: TypeAlias = dict[str, Any]
metaxy/py.typed ADDED
File without changes
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.3
2
+ Name: metaxy
3
+ Version: 0.0.0
4
+ Summary: Add your description here
5
+ Author: Daniel Gafni
6
+ Author-email: Daniel Gafni <danielgafni16@gmail.com>
7
+ Requires-Dist: cyclopts==4.0.0b1
8
+ Requires-Dist: narwhals>=2.9.0
9
+ Requires-Dist: polars>=1.33.1
10
+ Requires-Dist: polars-hash>=0.5.1
11
+ Requires-Dist: pydantic>=2.11.9
12
+ Requires-Dist: pydantic-settings>=2.11.0
13
+ Requires-Dist: pyyaml>=6.0.0
14
+ Requires-Dist: tomli>=2.3.0
15
+ Requires-Dist: rich>=13.0.0
16
+ Requires-Dist: pygraphviz>=1.14 ; extra == 'graphviz'
17
+ Requires-Dist: pyarrow>=18.0.0 ; extra == 'ibis'
18
+ Requires-Dist: ibis-framework>=11.0.0 ; extra == 'ibis'
19
+ Requires-Dist: mermaid-py>=0.8.0 ; extra == 'mermaid'
20
+ Requires-Dist: sqlmodel>=0.0.27 ; extra == 'sqlmodel'
21
+ Requires-Python: >=3.10
22
+ Provides-Extra: graphviz
23
+ Provides-Extra: ibis
24
+ Provides-Extra: mermaid
25
+ Provides-Extra: sqlmodel
26
+ Description-Content-Type: text/markdown
27
+
28
+ # Metaxy
29
+
30
+ ## Overview
31
+
32
+ **Metaxy** is a declarative metadata management system for multi-modal data and machine learning pipelines. Metaxy allows statically defining graphs of features with versioned **fields** -- logical components like `audio`, `frames` for `.mp4` files and **columns** for feature metadata stored in Metaxy's metadata store. With this in place, Metaxy provides:
33
+
34
+ - **Sample-level data versioning**: Track field and column lineage, compute versions as hashes of upstream versions for each sample
35
+ - **Incremental computation**: Automatically detect which samples need recomputation when upstream fields change
36
+ - **Migration system**: When feature code changes without changing outputs (refactoring, graph restructuring), Metaxy can reconcile metadata versions without recomputing expensive features
37
+ - **Storage flexibility**: Pluggable backends (DuckDB, ClickHouse, PostgreSQL, SQLite, in-memory) with native SQL optimization where possible
38
+ - **Big Metadata**: Metaxy is designed with large-scale distributed systems in mind and can handle large amounts of metadata efficiently.
39
+
40
+ Metaxy is designed for production data and ML systems where data and features evolve over time, and you need to track what changed, why, and whether expensive recomputation is actually necessary.
41
+
42
+ ## Data Versioning
43
+
44
+ To demonstrate how Metaxy handles data versioning, let's consider a video processing pipeline:
45
+
46
+ ```py
47
+ from metaxy import (
48
+ Feature,
49
+ FeatureDep,
50
+ FeatureKey,
51
+ FeatureSpec,
52
+ FieldDep,
53
+ FieldKey,
54
+ FieldSpec,
55
+ )
56
+
57
+
58
+ class Video(
59
+ Feature,
60
+ spec=FeatureSpec(
61
+ key=FeatureKey(["example", "video"]),
62
+ deps=None, # Root feature
63
+ fields=[
64
+ FieldSpec(
65
+ key=FieldKey(["audio"]),
66
+ code_version=1,
67
+ ),
68
+ FieldSpec(
69
+ key=FieldKey(["frames"]),
70
+ code_version=1,
71
+ ),
72
+ ],
73
+ ),
74
+ ):
75
+ """Video metadata feature (root)."""
76
+
77
+ frames: int
78
+ duration: float
79
+ size: int
80
+
81
+
82
+ class Crop(
83
+ Feature,
84
+ spec=FeatureSpec(
85
+ key=FeatureKey(["example", "crop"]),
86
+ deps=[FeatureDep(key=Video.spec.key)],
87
+ fields=[
88
+ FieldSpec(
89
+ key=FieldKey(["audio"]),
90
+ code_version=1,
91
+ deps=[
92
+ FieldDep(
93
+ feature_key=Video.spec.key,
94
+ fields=[FieldKey(["audio"])],
95
+ )
96
+ ],
97
+ ),
98
+ FieldSpec(
99
+ key=FieldKey(["frames"]),
100
+ code_version=1,
101
+ deps=[
102
+ FieldDep(
103
+ feature_key=Video.spec.key,
104
+ fields=[FieldKey(["frames"])],
105
+ )
106
+ ],
107
+ ),
108
+ ],
109
+ ),
110
+ ):
111
+ pass # omit columns for the sake of simplicity
112
+
113
+
114
+ class FaceDetection(
115
+ Feature,
116
+ spec=FeatureSpec(
117
+ key=FeatureKey(["example", "face_detection"]),
118
+ deps=[
119
+ FeatureDep(
120
+ key=Crop.spec.key,
121
+ )
122
+ ],
123
+ fields=[
124
+ FieldSpec(
125
+ key=FieldKey(["faces"]),
126
+ code_version=1,
127
+ deps=[
128
+ FieldDep(
129
+ feature_key=Crop.spec.key,
130
+ fields=[FieldKey(["frames"])],
131
+ )
132
+ ],
133
+ ),
134
+ ],
135
+ ),
136
+ ):
137
+ pass
138
+
139
+
140
+ class SpeechToText(
141
+ Feature,
142
+ spec=FeatureSpec(
143
+ key=FeatureKey(["example", "stt"]),
144
+ deps=[
145
+ FeatureDep(
146
+ key=Video.spec.key,
147
+ )
148
+ ],
149
+ fields=[
150
+ FieldSpec(
151
+ key=FieldKey(["transcription"]),
152
+ code_version=1,
153
+ deps=[
154
+ FieldDep(
155
+ feature_key=Video.spec.key,
156
+ fields=[FieldKey(["audio"])],
157
+ )
158
+ ],
159
+ ),
160
+ ],
161
+ ),
162
+ ):
163
+ pass
164
+ ```
165
+
166
+ When provided with this Python module, `metaxy graph render --format mermaid` (that's handy, right?) produces the following graph:
167
+
168
+ ```mermaid
169
+ ---
170
+ title: Feature Graph
171
+ ---
172
+ flowchart TB
173
+ %% Snapshot version: 8468950d
174
+ %%{init: {'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'themeVariables': {'fontSize': '14px'}}}%%
175
+ example_video["<div style="text-align:left"><b>example/video</b><br/><small>(v: bc9ca835)</small><br/><font
176
+ color="#999">---</font><br/>• audio <small>(v: 22742381)</small><br/>• frames <small>(v: 794116a9)</small></div>"]
177
+ example_crop["<div style="text-align:left"><b>example/crop</b><br/><small>(v: 3ac04df8)</small><br/><font
178
+ color="#999">---</font><br/>• audio <small>(v: 76c8bdc9)</small><br/>• frames <small>(v: abc79017)</small></div>"]
179
+ example_face_detection["<div style="text-align:left"><b>example/face_detection</b><br/><small>(v: 1ac83b07)</small><br/><font
180
+ color="#999">---</font><br/>• faces <small>(v: 2d75f0bd)</small></div>"]
181
+ example_stt["<div style="text-align:left"><b>example/stt</b><br/><small>(v: c83a754a)</small><br/><font
182
+ color="#999">---</font><br/>• transcription <small>(v: ac412b3c)</small></div>"]
183
+ example_video --> example_crop
184
+ example_crop --> example_face_detection
185
+ example_video --> example_stt
186
+ ```
187
+
188
+ Now imagine the `audio` logical field (don't mix up with metadata columns!) of the very first `Video` feature has been changed. Perhaps it has been cleaned or denoised.
189
+
190
+ ```diff
191
+ key=FeatureKey(["example", "video"]),
192
+ deps=None, # Root feature
193
+ fields=[
194
+ FieldSpec(
195
+ key=FieldKey(["audio"]),
196
+ - code_version=1,
197
+ + code_version=2,
198
+ ),
199
+ ```
200
+
201
+ In this case we'd typically want to recompute the downstream `Crop` and `SpeechToText` features, but not the `FaceDetection` feature, since it only depends on `frames` and not on `audio`.
202
+
203
+ `metaxy graph diff` reveals exactly that:
204
+
205
+ ```mermaid
206
+ ---
207
+ title: Merged Graph Diff
208
+ ---
209
+ flowchart TB
210
+ %%{init: {'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'themeVariables': {'fontSize': '14px'}}}%%
211
+
212
+ example_video["<div style="text-align:left"><b>example/video</b><br/><font color="#CC0000">bc9ca8</font> → <font
213
+ color="#00AA00">6db302</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">audio</font> (<font
214
+ color="#CC0000">227423</font> → <font color="#00AA00">09c839</font>)<br/>- frames (794116)</div>"]
215
+ style example_video stroke:#FFA500,stroke-width:3px
216
+ example_crop["<div style="text-align:left"><b>example/crop</b><br/><font color="#CC0000">3ac04d</font> → <font
217
+ color="#00AA00">54dc7f</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">audio</font> (<font
218
+ color="#CC0000">76c8bd</font> → <font color="#00AA00">f3130c</font>)<br/>- frames (abc790)</div>"]
219
+ style example_crop stroke:#FFA500,stroke-width:3px
220
+ example_face_detection["<div style="text-align:left"><b>example/face_detection</b><br/>1ac83b<br/><font
221
+ color="#999">---</font><br/>- faces (2d75f0)</div>"]
222
+ example_stt["<div style="text-align:left"><b>example/stt</b><br/><font color="#CC0000">c83a75</font> → <font
223
+ color="#00AA00">066d34</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">transcription</font> (<font
224
+ color="#CC0000">ac412b</font> → <font color="#00AA00">058410</font>)</div>"]
225
+ style example_stt stroke:#FFA500,stroke-width:3px
226
+
227
+ example_video --> example_crop
228
+ example_crop --> example_face_detection
229
+ example_video --> example_stt
230
+ ```
231
+
232
+ The versions of the `frames` fields throughout the graph, as well as the whole `FaceDetection` feature, stayed the same!
233
+
234
+ We can use Metaxy's static graph analysis to identify which features need to be recomputed when a new version of a feature is introduced. In addition to feature- and field-level versions, Metaxy can also compute a sample-level version (which may differ for each sample in a one-million-sample dataset) ahead of computations through the whole graph. This enables exciting features such as processing cost prediction and automatic migrations for metadata.
235
+
236
+ ## Development
237
+
238
+ Setting up the environment:
239
+
240
+ ```shell
241
+ uv sync --all-extras
242
+ uv run prek install
243
+ ```
244
+
245
+ ## Examples
246
+
247
+ See [examples](examples/README.md).