metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
from functools import cached_property
|
|
7
|
+
from typing import TYPE_CHECKING, Annotated, Any, TypeAlias, overload
|
|
8
|
+
|
|
9
|
+
import narwhals as nw
|
|
10
|
+
import pydantic
|
|
11
|
+
from pydantic import BeforeValidator
|
|
12
|
+
from pydantic.types import JsonValue
|
|
13
|
+
from typing_extensions import Self
|
|
14
|
+
|
|
15
|
+
from metaxy.models.bases import FrozenBaseModel
|
|
16
|
+
from metaxy.models.field import CoersibleToFieldSpecsTypeAdapter, FieldSpec
|
|
17
|
+
from metaxy.models.fields_mapping import FieldsMapping
|
|
18
|
+
from metaxy.models.filter_expression import parse_filter_string
|
|
19
|
+
from metaxy.models.lineage import LineageRelationship
|
|
20
|
+
from metaxy.models.types import (
|
|
21
|
+
CoercibleToFeatureKey,
|
|
22
|
+
FeatureKey,
|
|
23
|
+
FeatureKeyAdapter,
|
|
24
|
+
FieldKey,
|
|
25
|
+
ValidatedFeatureKey,
|
|
26
|
+
)
|
|
27
|
+
from metaxy.utils.hashing import truncate_hash
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
# yes, these are circular imports, the TYPE_CHECKING block hides them at runtime.
|
|
31
|
+
# neither pyright nor basedpyright allows ignoring `reportImportCycles` because they think it's a bad practice
|
|
32
|
+
# and it would be very smart to force the user to restructure their project instead
|
|
33
|
+
# context: https://github.com/microsoft/pyright/issues/1825
|
|
34
|
+
# however, considering the recursive nature of graphs, and the syntactic sugar that we want to support,
|
|
35
|
+
# I decided to just put these errors into `.basedpyright/baseline.json` (after ensuring this is the only error produced by basedpyright)
|
|
36
|
+
from metaxy.models.feature import BaseFeature
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class FeatureDep(pydantic.BaseModel):
    """Feature dependency specification with optional column selection and renaming.

    Attributes:
        feature: The feature key to depend on. Accepts string ("a/b/c"), list (["a", "b", "c"]),
            FeatureKey instance, or BaseFeature class.
        columns: Optional tuple of column names to select from upstream feature.
            - None (default): Keep all columns from upstream
            - Empty tuple (): Keep only system columns (sample_uid, provenance_by_field, etc.)
            - Tuple of names: Keep only specified columns (plus system columns)
        rename: Optional mapping of old column names to new names.
            Applied after column selection.
        fields_mapping: Optional field mapping configuration for automatic field dependency resolution.
            When provided, fields without explicit deps will automatically map to matching upstream fields.
            Defaults to using `[FieldsMapping.default()][metaxy.models.fields_mapping.DefaultFieldsMapping]`.
        filters: Optional SQL-like filter strings applied to this dependency. Automatically parsed into
            Narwhals expressions (accessible via the `filters` property). Filters are automatically
            applied by FeatureDepTransformer after renames during all FeatureDep operations (including
            resolve_update and version computation).

    Examples:
        ```py
        # Keep all columns with default field mapping
        FeatureDep(feature="upstream")

        # Keep all columns with suffix matching
        FeatureDep(feature="upstream", fields_mapping=FieldsMapping.default(match_suffix=True))

        # Keep all columns with all fields mapping
        FeatureDep(feature="upstream", fields_mapping=FieldsMapping.all())

        # Keep only specific columns
        FeatureDep(
            feature="upstream/feature",
            columns=("col1", "col2")
        )

        # Rename columns to avoid conflicts
        FeatureDep(
            feature="upstream/feature",
            rename={"old_name": "new_name"}
        )

        # Select and rename
        FeatureDep(
            feature="upstream/feature",
            columns=("col1", "col2"),
            rename={"col1": "upstream_col1"}
        )

        # SQL filters
        FeatureDep(
            feature="upstream",
            filters=["age >= 25", "status = 'active'"]
        )
        ```
    """

    feature: ValidatedFeatureKey
    columns: tuple[str, ...] | None = (
        None  # None = all columns, () = only system columns
    )
    rename: dict[str, str] | None = None  # Column renaming mapping
    fields_mapping: FieldsMapping = pydantic.Field(
        default_factory=FieldsMapping.default
    )
    # Stored as `sql_filters` so the `filters` name stays free for the parsed
    # Narwhals-expression property below; users may still pass `filters=` at
    # construction time, and serialization emits `filters` again.
    sql_filters: tuple[str, ...] | None = pydantic.Field(
        default=None,
        description="SQL-like filter strings applied to this dependency.",
        validation_alias=pydantic.AliasChoices("filters", "sql_filters"),
        serialization_alias="filters",
    )

    if TYPE_CHECKING:
        # Type-checker-only signature: documents the coercible input types the
        # BeforeValidators accept at runtime.
        def __init__(  # pyright: ignore[reportMissingSuperCall]
            self,
            *,
            feature: str | Sequence[str] | FeatureKey | type[BaseFeature],
            columns: tuple[str, ...] | None = None,
            rename: dict[str, str] | None = None,
            fields_mapping: FieldsMapping | None = None,
            filters: Sequence[str] | None = None,
        ) -> None: ...  # pyright: ignore[reportMissingSuperCall]

    @cached_property
    def filters(self) -> tuple[nw.Expr, ...]:
        """Parse sql_filters into Narwhals expressions."""
        if self.sql_filters is None:
            return ()
        return tuple(parse_filter_string(filter_str) for filter_str in self.sql_filters)

    def table_name(self) -> str:
        """Get SQL-like table name for this feature spec."""
        return self.feature.table_name
|
|
136
|
+
# Alias for a sequence of ID column names supplied by callers.
IDColumns: TypeAlias = Sequence[
    str
]  # non-bound, should be used for feature specs with arbitrary id columns

# Anything `_validate_deps` can turn into a FeatureDep instance.
CoercibleToFeatureDep: TypeAlias = (
    FeatureDep | type["BaseFeature"] | str | Sequence[str] | FeatureKey
)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _validate_id_columns(value: Any) -> tuple[str, ...]:
|
|
146
|
+
"""Coerce id_columns to tuple."""
|
|
147
|
+
if isinstance(value, tuple):
|
|
148
|
+
return value
|
|
149
|
+
return tuple(value)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _validate_deps(value: Any) -> list[FeatureDep]:
    """Coerce a deps input into a list of ``FeatureDep`` instances.

    Accepts a list (or any iterable, or a single item) holding any mix of
    FeatureDep instances, plain dicts (from deserialization), Feature
    classes, or other values coercible to a feature key.
    """
    # Imported locally to avoid a circular import at module level.
    from metaxy.models.feature import BaseFeature  # noqa: F401

    if not isinstance(value, list):
        value = list(value) if hasattr(value, "__iter__") else [value]

    def _coerce(item: Any) -> FeatureDep:
        """Turn one deps entry into a FeatureDep."""
        if isinstance(item, FeatureDep):
            # Already the target type; keep as-is.
            return item
        if isinstance(item, dict):
            # Deserialized payload: let Pydantic construct the model.
            return FeatureDep.model_validate(item)
        # Feature classes and any other coercible value (FeatureSpec,
        # string, key sequence, ...) go through the `feature` field,
        # whose validator handles all of them uniformly.
        return FeatureDep(feature=item)

    return [_coerce(item) for item in value]
|
+
|
|
177
|
+
|
|
178
|
+
class FeatureSpec(FrozenBaseModel):
    """Declarative specification of a feature.

    Bundles the feature key, the ID columns that identify a sample, upstream
    dependencies, field specifications, the lineage relationship, and
    free-form metadata. Instances are frozen (see FrozenBaseModel), which is
    what makes the cached_property hashes below safe.
    """

    # Coercible from str / sequence / FeatureKey via the adapter.
    key: Annotated[FeatureKey, BeforeValidator(FeatureKeyAdapter.validate_python)]
    id_columns: Annotated[tuple[str, ...], BeforeValidator(_validate_id_columns)] = (
        pydantic.Field(
            ...,
            description="Columns that uniquely identify a sample in this feature.",
        )
    )
    deps: Annotated[list[FeatureDep], BeforeValidator(_validate_deps)] = pydantic.Field(
        default_factory=list
    )
    # Defaults to a single field named "default" when none are provided.
    fields: Annotated[
        list[FieldSpec],
        BeforeValidator(CoersibleToFieldSpecsTypeAdapter.validate_python),
    ] = pydantic.Field(
        default_factory=lambda: [
            FieldSpec(
                key=FieldKey(["default"]),
            )
        ],
    )
    lineage: LineageRelationship = pydantic.Field(
        default_factory=LineageRelationship.identity,
        description="Lineage relationship of this feature.",
    )
    metadata: dict[str, JsonValue] = pydantic.Field(
        default_factory=dict,
        description="Metadata attached to this feature.",
    )

    if TYPE_CHECKING:
        # Overload for common case: list of FeatureDep instances
        @overload
        def __init__(
            self,
            *,
            key: CoercibleToFeatureKey,
            id_columns: IDColumns,
            deps: list[FeatureDep] | None = None,
            fields: Sequence[str | FieldSpec] | None = None,
            lineage: LineageRelationship | None = None,
            metadata: Mapping[str, JsonValue] | None = None,
            **kwargs: Any,
        ) -> None: ...

        # Overload for flexible case: list of coercible types
        @overload
        def __init__(
            self,
            *,
            key: CoercibleToFeatureKey,
            id_columns: IDColumns,
            deps: list[CoercibleToFeatureDep] | None = None,
            fields: Sequence[str | FieldSpec] | None = None,
            lineage: LineageRelationship | None = None,
            metadata: Mapping[str, JsonValue] | None = None,
            **kwargs: Any,
        ) -> None: ...

        # Implementation signature
        def __init__(  # pyright: ignore[reportMissingSuperCall]
            self,
            *,
            key: CoercibleToFeatureKey,
            id_columns: IDColumns,
            deps: list[FeatureDep] | list[CoercibleToFeatureDep] | None = None,
            fields: Sequence[str | FieldSpec] | None = None,
            lineage: LineageRelationship | None = None,
            metadata: Mapping[str, JsonValue] | None = None,
            **kwargs: Any,
        ) -> None: ...  # pyright: ignore[reportMissingSuperCall]

    @cached_property
    def fields_by_key(self) -> Mapping[FieldKey, FieldSpec]:
        """Index this spec's fields by their key."""
        return {c.key: c for c in self.fields}

    @cached_property
    def code_version(self) -> str:
        """Hash of this feature's field code_versions only (no dependencies)."""
        hasher = hashlib.sha256()

        # Sort fields by key for deterministic ordering
        sorted_fields = sorted(self.fields, key=lambda field: field.key.to_string())

        for field in sorted_fields:
            # Feed both the key and its code_version so either changing
            # a version or renaming a field changes the hash.
            hasher.update(field.key.to_string().encode("utf-8"))
            hasher.update(str(field.code_version).encode("utf-8"))

        return truncate_hash(hasher.hexdigest())

    def table_name(self) -> str:
        """Get SQL-like table name for this feature spec."""
        return self.key.table_name

    @pydantic.model_validator(mode="after")
    def validate_unique_field_keys(self) -> Self:
        """Validate that all fields have unique keys."""
        seen_keys: set[tuple[str, ...]] = set()
        for field in self.fields:
            # Convert to tuple for hashability in case it's a plain list
            key_tuple = tuple(field.key)
            if key_tuple in seen_keys:
                raise ValueError(
                    f"Duplicate field key found: {field.key}. "
                    f"All fields must have unique keys."
                )
            seen_keys.add(key_tuple)
        return self

    @pydantic.model_validator(mode="after")
    def validate_id_columns(self) -> Self:
        """Validate that id_columns is non-empty if specified."""
        # NOTE(review): id_columns is declared as a required tuple, so the
        # None branch looks unreachable here — confirm before removing it.
        if self.id_columns is not None and len(self.id_columns) == 0:
            raise ValueError(
                "id_columns must be non-empty if specified. Use None for default."
            )
        return self

    @property
    def feature_spec_version(self) -> str:
        """Compute SHA256 hash of the complete feature specification.

        This property provides a deterministic hash of ALL specification properties,
        including key, deps, fields, and any metadata/tags.
        Used for audit trail and tracking specification changes.

        Unlike feature_version which only hashes computational properties
        (for migration triggering), feature_spec_version captures the entire specification
        for complete reproducibility and audit purposes.

        Returns:
            SHA256 hex digest of the specification

        Example:
            ```py
            spec = FeatureSpec(
                key=FeatureKey(["my", "feature"]),
                fields=[FieldSpec(key=FieldKey(["default"]))],
            )
            spec.feature_spec_version
            # 'abc123...' # 64-character hex string
            ```
        """

        # Use model_dump with mode="json" for deterministic serialization
        # This ensures all types (like FeatureKey) are properly serialized
        spec_dict = self.model_dump(mode="json")

        # Sort keys to ensure deterministic ordering
        spec_json = json.dumps(spec_dict, sort_keys=True)

        # Compute SHA256 hash
        hasher = hashlib.sha256()
        hasher.update(spec_json.encode("utf-8"))

        return hasher.hexdigest()
335
|
+
|
|
336
|
+
# NOTE(review): plain alias of FeatureSpec — presumably kept for backward
# compatibility with callers that imported the old name; confirm before removal.
FeatureSpecWithIDColumns: TypeAlias = FeatureSpec

# Inputs accepted wherever a FieldSpec is expected (strings become the key).
CoercibleToFieldSpec: TypeAlias = str | FieldSpec
|
metaxy/models/field.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import TYPE_CHECKING, Annotated, Any, Literal, overload
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, BeforeValidator, TypeAdapter
|
|
6
|
+
from pydantic import Field as PydanticField
|
|
7
|
+
|
|
8
|
+
from metaxy.models.constants import DEFAULT_CODE_VERSION
|
|
9
|
+
from metaxy.models.types import (
|
|
10
|
+
CoercibleToFieldKey,
|
|
11
|
+
FeatureKey,
|
|
12
|
+
FeatureKeyAdapter,
|
|
13
|
+
FieldKey,
|
|
14
|
+
FieldKeyAdapter,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
# yes, these are circular imports, the TYPE_CHECKING block hides them at runtime.
|
|
19
|
+
# neither pyright nor basedpyright allows ignoring `reportImportCycles` because they think it's a bad practice
|
|
20
|
+
# and it would be very smart to force the user to restructure their project instead
|
|
21
|
+
# context: https://github.com/microsoft/pyright/issues/1825
|
|
22
|
+
# however, considering the recursive nature of graphs, and the syntactic sugar that we want to support,
|
|
23
|
+
# I decided to just put these errors into `.basedpyright/baseline.json` (after ensuring this is the only error produced by basedpyright)
|
|
24
|
+
from metaxy.models.feature import BaseFeature
|
|
25
|
+
from metaxy.models.feature_spec import FeatureSpec
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SpecialFieldDep(Enum):
    """Sentinel values usable in place of an explicit field-dependency list."""

    # Depend on all upstream features and all of their fields.
    ALL = "__METAXY_ALL_DEP__"
|
|
31
|
+
|
|
32
|
+
def _validate_field_dep_feature(value: Any) -> FeatureKey:
    """Normalize any supported feature reference into a ``FeatureKey``.

    Accepts a FeatureKey (returned as-is), a FeatureSpec (its key), a
    BaseFeature subclass (its spec's key), or anything the key adapter
    understands (str, sequence of parts, ...).
    """
    # Imported locally to avoid a circular dependency at module level.
    from metaxy.models.feature import BaseFeature
    from metaxy.models.feature_spec import FeatureSpec

    if isinstance(value, FeatureKey):
        return value
    if isinstance(value, FeatureSpec):
        return value.key
    if isinstance(value, type) and issubclass(value, BaseFeature):
        return value.spec().key
    # Fallback: str, Sequence[str], and other adapter-coercible inputs.
    return FeatureKeyAdapter.validate_python(value)
|
48
|
+
|
|
49
|
+
def _validate_field_dep_fields(
    value: Any,
) -> list[FieldKey] | Literal[SpecialFieldDep.ALL]:
    """Coerce a field-dependency value to FieldKeys or the ALL sentinel.

    The sentinel is accepted either as the enum member itself or as its
    serialized string value; any other string is rejected. Everything else
    must validate as a list of field keys.
    """
    if value is SpecialFieldDep.ALL:
        return SpecialFieldDep.ALL
    if isinstance(value, str):
        if value != SpecialFieldDep.ALL.value:
            # Invalid string value - will be caught by Pydantic validation
            raise ValueError(
                f"String value must be {SpecialFieldDep.ALL.value}, got {value}"
            )
        return SpecialFieldDep.ALL
    # Validate as list of FieldKeys
    return TypeAdapter(list[FieldKey]).validate_python(value)
|
65
|
+
|
|
66
|
+
class FieldDep(BaseModel):
    """Field-level dependency on specific fields of an upstream feature.

    Attributes:
        feature: Upstream feature key; coerced from str, sequence of parts,
            FeatureKey, FeatureSpec, or a BaseFeature class.
        fields: Either a list of field keys on that feature, or
            ``SpecialFieldDep.ALL`` (the default) to depend on all of its fields.
    """

    feature: Annotated[FeatureKey, BeforeValidator(_validate_field_dep_feature)]
    fields: Annotated[
        list[FieldKey] | Literal[SpecialFieldDep.ALL],
        BeforeValidator(_validate_field_dep_fields),
    ] = SpecialFieldDep.ALL

    if TYPE_CHECKING:
        # Each stub below previously carried a doubled `@overload` decorator;
        # exactly one `@overload` per stub is correct.

        @overload
        def __init__(
            self,
            feature: str,
            **kwargs: Any,
        ) -> None:
            """Initialize from string feature key."""
            ...

        @overload
        def __init__(
            self,
            feature: Sequence[str],
            **kwargs: Any,
        ) -> None:
            """Initialize from sequence of parts."""
            ...

        @overload
        def __init__(
            self,
            feature: FeatureKey,
            **kwargs: Any,
        ) -> None:
            """Initialize from FeatureKey instance."""
            ...

        @overload
        def __init__(
            self,
            feature: "FeatureSpec",
            **kwargs: Any,
        ) -> None:
            """Initialize from FeatureSpec instance."""
            ...

        @overload
        def __init__(
            self,
            feature: type["BaseFeature"],
            **kwargs: Any,
        ) -> None:
            """Initialize from BaseFeature class."""
            ...

        # Final signature combining all overloads
        def __init__(  # pyright: ignore[reportMissingSuperCall]
            self,
            feature: str
            | Sequence[str]
            | FeatureKey
            | "FeatureSpec"
            | type["BaseFeature"],
            fields: list[CoercibleToFieldKey]
            | Literal[SpecialFieldDep.ALL] = SpecialFieldDep.ALL,
            **kwargs: Any,
        ) -> None: ...  # pyright: ignore[reportMissingSuperCall]
|
|
138
|
+
|
|
139
|
+
def _validate_field_spec_from_string(value: Any) -> Any:
|
|
140
|
+
"""Validator function to convert string to FieldSpec dict.
|
|
141
|
+
|
|
142
|
+
This allows FieldSpec to be constructed from just a string key:
|
|
143
|
+
- "my_field" -> FieldSpec(key="my_field", code_version="1")
|
|
144
|
+
|
|
145
|
+
Args:
|
|
146
|
+
value: The value to validate (can be str, dict, or FieldSpec)
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
Either the original value or a dict that Pydantic will use to construct FieldSpec
|
|
150
|
+
"""
|
|
151
|
+
# If it's a string, convert to dict with key field
|
|
152
|
+
if isinstance(value, str):
|
|
153
|
+
return {"key": value}
|
|
154
|
+
|
|
155
|
+
# Otherwise return as-is for normal Pydantic processing
|
|
156
|
+
return value
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _validate_field_spec_key(value: Any) -> FieldKey:
    """Coerce any supported key representation into a ``FieldKey``.

    FieldKey instances pass through untouched; everything else is routed
    through the shared FieldKeyAdapter.
    """
    return (
        value
        if isinstance(value, FieldKey)
        else FieldKeyAdapter.validate_python(value)
    )
|
|
165
|
+
|
|
166
|
+
class FieldSpec(BaseModel):
    """Specification of a single field within a feature.

    Carries the field key, its code version, and optional field-level
    dependencies. Strings are accepted anywhere a FieldSpec is expected
    (see __get_pydantic_core_schema__).
    """

    # Field key; strings and sequences are coerced via FieldKeyAdapter.
    key: Annotated[FieldKey, BeforeValidator(_validate_field_spec_key)] = PydanticField(
        default_factory=lambda: FieldKey(["default"])
    )
    code_version: str = DEFAULT_CODE_VERSION

    # Field-level explicit dependencies
    # - SpecialFieldDep.ALL: explicitly depend on all upstream features and all their fields
    # - list[FieldDep]: depend on particular fields of specific features
    deps: SpecialFieldDep | list[FieldDep] = PydanticField(default_factory=list)

    @classmethod
    def __get_pydantic_core_schema__(cls, source_type, handler):
        """Add custom validator to coerce strings to FieldSpec."""
        from pydantic_core import core_schema

        # Get the default schema
        python_schema = handler(source_type)

        # Wrap it with a before validator that converts strings
        return core_schema.no_info_before_validator_function(
            _validate_field_spec_from_string,
            python_schema,
        )

    if TYPE_CHECKING:

        @overload
        def __init__(self, key: CoercibleToFieldKey, **kwargs) -> None:
            """Initialize from key and no other arguments."""
            ...

        @overload
        def __init__(
            self,
            key: str,
            code_version: str,
            deps: SpecialFieldDep | list[FieldDep] | None = None,
        ) -> None:
            """Initialize from string key."""
            ...

        @overload
        def __init__(
            self,
            key: Sequence[str],
            code_version: str,
            deps: SpecialFieldDep | list[FieldDep] | None = None,
        ) -> None:
            """Initialize from sequence of parts."""
            ...

        @overload
        def __init__(
            self,
            key: FieldKey,
            code_version: str,
            deps: SpecialFieldDep | list[FieldDep] | None = None,
        ) -> None:
            """Initialize from FieldKey instance."""
            ...

        # Final signature combining all overloads
        def __init__(  # pyright: ignore[reportMissingSuperCall]
            self,
            key: CoercibleToFieldKey,
            code_version: str = DEFAULT_CODE_VERSION,
            deps: SpecialFieldDep | list[FieldDep] | None = None,
            **kwargs: Any,
        ) -> None: ...

    # Runtime __init__ to handle positional arguments
    def __init__(
        self,
        key: CoercibleToFieldKey,
        code_version: str = DEFAULT_CODE_VERSION,
        deps: SpecialFieldDep | list[FieldDep] | None = None,
        *args,
        **kwargs: Any,
    ) -> None:
        # Coerce the key eagerly so positional strings/sequences work.
        validated_key = FieldKeyAdapter.validate_python(key)

        # Handle None deps - use empty list as default
        if deps is None:
            deps = []

        # NOTE(review): *args is forwarded positionally into
        # BaseModel.__init__, which takes keyword data only — presumably
        # *args is always empty here; confirm before relying on it.
        super().__init__(
            key=validated_key,
            code_version=code_version,
            deps=deps,
            *args,
            **kwargs,
        )
|
|
260
|
+
|
|
261
|
+
# Type adapter for validating FieldSpec with string coercion support
# (note: the "Coersible" spelling is part of the public name — keep as-is).
CoersibleToFieldSpecsTypeAdapter = TypeAdapter(list[FieldSpec])
|