metaxy-0.0.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0

metaxy/data_versioning/diff/base.py
@@ -0,0 +1,150 @@
"""Abstract base class for metadata diff resolvers."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, NamedTuple

import narwhals as nw

if TYPE_CHECKING:
    pass


class LazyDiffResult(NamedTuple):
    """Result of diffing with lazy Narwhals LazyFrames (opt-in via lazy=True).

    Contains lazy Narwhals LazyFrames - users decide when/how to materialize.

    Users can:
    - Keep lazy for further operations: result.added.filter(...)
    - Materialize to Polars: result.added.collect().to_native()
    - Materialize to Pandas: result.added.collect().to_pandas()
    - Materialize to PyArrow: result.added.collect().to_arrow()
    - Convert to DiffResult: result.collect()

    Backend execution:
    - SQL stores: All operations stay in SQL until .collect()
    - Polars stores: Operations stay lazy until .collect()

    Attributes:
        added: New samples (lazy, never None - empty LazyFrame instead)
            Columns: [sample_uid, data_version, ...user columns...]
        changed: Changed samples (lazy, never None)
            Columns: [sample_uid, data_version, ...user columns...]
        removed: Removed samples (lazy, never None)
            Columns: [sample_uid, data_version, ...user columns...]

    Note:
        May contain additional user columns beyond sample_uid and data_version,
        depending on what was passed to resolve_update() via align_upstream_metadata.
    """

    added: nw.LazyFrame[Any]
    changed: nw.LazyFrame[Any]
    removed: nw.LazyFrame[Any]

    def collect(self) -> "DiffResult":
        """Materialize all lazy frames to create a DiffResult.

        Returns:
            DiffResult with all frames materialized to eager DataFrames.
        """
        return DiffResult(
            added=self.added.collect(),
            changed=self.changed.collect(),
            removed=self.removed.collect(),
        )


class DiffResult(NamedTuple):
    """Result of diffing with eager Narwhals DataFrames (default).

    Contains materialized Narwhals DataFrames - ready to use immediately.

    Users can convert to their preferred format:
    - Polars: result.added.to_native()
    - Pandas: result.added.to_pandas()
    - PyArrow: result.added.to_arrow()

    Attributes:
        added: New samples (eager, never None - empty DataFrame instead)
            Columns: [sample_uid, data_version, ...user columns...]
        changed: Changed samples (eager, never None)
            Columns: [sample_uid, data_version, ...user columns...]
        removed: Removed samples (eager, never None)
            Columns: [sample_uid, data_version, ...user columns...]

    Note:
        May contain additional user columns beyond sample_uid and data_version,
        depending on what was passed to resolve_update() via align_upstream_metadata.
    """

    added: nw.DataFrame[Any]
    changed: nw.DataFrame[Any]
    removed: nw.DataFrame[Any]


class MetadataDiffResolver(ABC):
    """Identifies rows with changed data_versions by comparing target with current.

    The diff resolver compares newly calculated data_versions (target) with
    existing metadata (current) to identify what needs to be written.

    This is Step 3 in the data versioning process:
    1. Join upstream features → unified upstream view
    2. Calculate data_version from upstream → target versions
    3. Diff with current metadata → identify changes ← THIS STEP

    All component boundaries use Narwhals LazyFrames for backend-agnostic processing.

    Examples:
        - NarwhalsDiffResolver: Backend-agnostic using Narwhals expressions
        - IbisDiffResolver: Converts to Ibis internally for SQL processing

    Important Design:
        Takes lazy Narwhals refs as input, returns LazyDiffResult as output.
        This minimizes query execution:
        - SQL backends: One query with CTEs computes all three categories
        - Polars: Uses lazy operations, splits into three LazyFrames

    Users can override Feature.resolve_data_version_diff to customize:
        - Ignore certain field changes
        - Apply custom change detection rules
        - Filter out specific samples
    """

    @abstractmethod
    def find_changes(
        self,
        target_versions: nw.LazyFrame[Any],
        current_metadata: nw.LazyFrame[Any] | None,
    ) -> LazyDiffResult:
        """Find all changes between target and current metadata.

        Compares target data_versions (newly calculated) with current metadata
        and categorizes all differences.

        Args:
            target_versions: Narwhals LazyFrame with newly calculated data_versions
                Shape: [sample_uid, data_version (calculated), upstream columns...]
            current_metadata: Narwhals LazyFrame with current metadata, or None
                Shape: [sample_uid, data_version (existing), feature_version, custom columns...]
                Should be pre-filtered by feature_version at the caller level if needed.

        Returns:
            LazyDiffResult with three lazy Narwhals LazyFrames.
            Caller materializes to DiffResult if needed (for lazy=False).

        Implementation Note:
            Should build lazy operations without materializing:
            - SQL backends: Build one lazy query with CTEs for all three categories
            - Polars: Use lazy operations, no collect() calls

        Note:
            For immutable append-only storage, typically only 'added' and 'changed'
            are written. 'removed' is useful for validation/reporting.

            Feature version filtering should happen at the read_metadata() level,
            not in the diff resolver. The diff resolver just compares whatever
            metadata is passed to it.
        """
        pass
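
A minimal consumption sketch for the two result types above (not part of the wheel; the frame values and the direct LazyDiffResult construction are invented for illustration, since real instances come from a diff resolver):

import narwhals as nw
import polars as pl

from metaxy.data_versioning.diff.base import LazyDiffResult

# Illustrative lazy frames with the documented [sample_uid, data_version] columns.
added = nw.from_native(pl.LazyFrame({"sample_uid": [3], "data_version": ["c"]}))
empty = nw.from_native(pl.LazyFrame({"sample_uid": [], "data_version": []}))

lazy_result = LazyDiffResult(added=added, changed=empty, removed=empty)

# Stay lazy and materialize only what is needed...
added_polars = lazy_result.added.collect().to_native()  # polars.DataFrame

# ...or materialize all three categories into an eager DiffResult.
result = lazy_result.collect()
print(result.added.to_arrow())  # pyarrow.Table, per the docstring conversions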

metaxy/data_versioning/diff/narwhals.py
@@ -0,0 +1,108 @@
"""Narwhals implementation of metadata diff resolver.

Unified diff resolver that works with any backend (Polars, Ibis/SQL) through Narwhals.
"""

from typing import TYPE_CHECKING, Any

import narwhals as nw

from metaxy.data_versioning.diff.base import (
    LazyDiffResult,
    MetadataDiffResolver,
)

if TYPE_CHECKING:
    pass


class NarwhalsDiffResolver(MetadataDiffResolver):
    """Identifies changed rows using Narwhals operations.

    Uses Narwhals LazyFrames (works with Polars, Ibis, Pandas, PyArrow)

    Strategy:
    - Categorizes changes into added, changed, and removed
    - Uses LEFT/RIGHT JOINs to identify each category
    - Materializes once and splits into three DataFrames (efficient)
    - Backend-agnostic: same code works for in-memory and SQL backends

    The underlying backend (Polars vs Ibis) determines execution:
    - Polars backend → operations happen in-memory
    - Ibis backend → operations happen in SQL database
    """

    def find_changes(
        self,
        target_versions: nw.LazyFrame[Any],
        current_metadata: nw.LazyFrame[Any] | None,
    ) -> LazyDiffResult:
        """Find all changes between target and current.

        Args:
            target_versions: Narwhals LazyFrame with calculated data_versions
            current_metadata: Narwhals LazyFrame with current metadata, or None.
                Should be pre-filtered by feature_version at caller level if needed.

        Returns:
            LazyDiffResult with three lazy Narwhals frames (caller materializes if needed)
        """
        # Select only sample_uid and data_version from target_versions
        # (it may have intermediate joined columns from upstream)
        target_versions = target_versions.select(["sample_uid", "data_version"])

        if current_metadata is None:
            # No existing metadata - all target rows are new
            # Create empty LazyFrame with proper schema
            import polars as pl

            empty_lazy = nw.from_native(
                pl.LazyFrame({"sample_uid": [], "data_version": []})
            )

            return LazyDiffResult(
                added=target_versions,
                changed=empty_lazy,
                removed=empty_lazy,
            )

        # Keep only sample_uid and data_version from current for comparison
        current_comparison = current_metadata.select(
            "sample_uid", nw.col("data_version").alias("__current_data_version")
        )

        # LEFT JOIN target with current
        compared = target_versions.join(
            current_comparison,
            on="sample_uid",
            how="left",
        )

        # Build lazy queries for each category
        added_lazy = (
            compared.filter(nw.col("__current_data_version").is_null())
            .drop("__current_data_version")
            .select("sample_uid", "data_version")
        )

        changed_lazy = (
            compared.filter(
                ~nw.col("__current_data_version").is_null()
                & (nw.col("data_version") != nw.col("__current_data_version"))
            )
            .drop("__current_data_version")
            .select("sample_uid", "data_version")
        )

        removed_lazy = current_metadata.join(
            target_versions.select("sample_uid"),
            on="sample_uid",
            how="anti",
        ).select("sample_uid", "data_version")

        # Return lazy frames - caller will materialize if needed
        return LazyDiffResult(
            added=added_lazy,
            changed=changed_lazy,
            removed=removed_lazy,
        )
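
A rough usage sketch for this resolver (not part of the wheel; the frame values are invented). With Polars-backed frames the three categories come out as follows:

import narwhals as nw
import polars as pl

from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver

target = nw.from_native(
    pl.LazyFrame({"sample_uid": [1, 2, 3], "data_version": ["a", "B", "c"]})
)
current = nw.from_native(
    pl.LazyFrame({"sample_uid": [1, 2, 4], "data_version": ["a", "b", "d"]})
)

diff = NarwhalsDiffResolver().find_changes(target, current).collect()

print(diff.added.to_native())    # sample_uid 3: missing from current metadata
print(diff.changed.to_native())  # sample_uid 2: data_version "b" vs "B"
print(diff.removed.to_native())  # sample_uid 4: present only in current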

metaxy/data_versioning/hash_algorithms.py
@@ -0,0 +1,19 @@
"""Hash algorithms supported for data versioning."""

from enum import Enum


class HashAlgorithm(Enum):
    """Supported hash algorithms for data versioning.

    These algorithms are chosen for:
    - Speed (non-cryptographic hashes preferred)
    - Cross-database availability
    - Good collision resistance for data versioning
    """

    XXHASH64 = "xxhash64"  # Fast, available in DuckDB, ClickHouse, Polars
    XXHASH32 = "xxhash32"  # Faster for small data, less collision resistant
    WYHASH = "wyhash"  # Very fast, Polars-specific
    SHA256 = "sha256"  # Cryptographic, slower, universally available
    MD5 = "md5"  # Legacy, widely available, not recommended for new code
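
The enum above only names algorithms; the actual hashing lives in the calculator modules listed earlier (duckdb.py, ibis.py, polars.py), which are not reproduced here. A hedged sketch of what a Python-side mapping could look like (the xxhash package and this mapping are assumptions, not metaxy APIs):

import hashlib

import xxhash  # third-party package; an assumption, not a stated metaxy dependency

from metaxy.data_versioning.hash_algorithms import HashAlgorithm

# Hypothetical mapping from enum member to a hex-digest function.
# WYHASH is omitted: it has no obvious stdlib or common third-party one-liner.
_HASHERS = {
    HashAlgorithm.XXHASH64: lambda data: xxhash.xxh64(data).hexdigest(),
    HashAlgorithm.XXHASH32: lambda data: xxhash.xxh32(data).hexdigest(),
    HashAlgorithm.SHA256: lambda data: hashlib.sha256(data).hexdigest(),
    HashAlgorithm.MD5: lambda data: hashlib.md5(data).hexdigest(),
}


def data_version_for(upstream_versions: list[str], algo: HashAlgorithm) -> str:
    """Hash concatenated upstream data_versions into one digest (illustrative only)."""
    payload = "|".join(upstream_versions).encode("utf-8")
    return _HASHERS[algo](payload)


print(data_version_for(["a1", "b2"], HashAlgorithm.SHA256))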

metaxy/data_versioning/joiners/base.py
@@ -0,0 +1,70 @@
"""Abstract base class for upstream joiners."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

import narwhals as nw

if TYPE_CHECKING:
    from metaxy.models.feature_spec import FeatureSpec
    from metaxy.models.plan import FeaturePlan


class UpstreamJoiner(ABC):
    """Joins upstream feature metadata together.

    The joiner takes upstream feature metadata (which already has data_version columns)
    and joins them together to create a unified view of all dependencies.

    This is Step 1 in the data versioning process:
    1. Join upstream features → unified upstream view
    2. Calculate data_version from upstream → target versions
    3. Diff with current metadata → identify changes

    All component boundaries use Narwhals LazyFrames for backend-agnostic processing.

    Examples:
        - NarwhalsJoiner: Backend-agnostic using Narwhals expressions
        - IbisJoiner: Converts to Ibis internally for SQL processing
    """

    @abstractmethod
    def join_upstream(
        self,
        upstream_refs: dict[str, nw.LazyFrame[Any]],
        feature_spec: "FeatureSpec",
        feature_plan: "FeaturePlan",
        upstream_columns: dict[str, tuple[str, ...] | None] | None = None,
        upstream_renames: dict[str, dict[str, str] | None] | None = None,
    ) -> tuple[nw.LazyFrame[Any], dict[str, str]]:
        """Join all upstream features together with optional column selection/renaming.

        Joins upstream feature metadata on sample_uid to create a unified reference
        containing all upstream data_version columns needed for hash calculation,
        plus any additional user-specified columns.

        Args:
            upstream_refs: Upstream feature metadata Narwhals LazyFrames
                Keys are upstream feature keys (using to_string() format)
                Values are Narwhals LazyFrames with upstream metadata
            feature_spec: Specification of the feature being computed
            feature_plan: Resolved feature plan with dependencies
            upstream_columns: Optional dict mapping upstream feature keys to tuple of columns to keep.
                None or missing key = keep all columns. Empty tuple = only system columns.
            upstream_renames: Optional dict mapping upstream feature keys to rename dicts.
                Applied after column selection.

        Returns:
            Tuple of (joined_ref, upstream_column_mapping):
            - joined_ref: Narwhals LazyFrame with all upstream data joined
                Contains: sample_uid, data_version columns, and any user columns
            - upstream_column_mapping: Maps upstream feature key -> data_version column name
                Example: {"video": "__upstream_video__data_version"}

        Note:
            - Uses INNER join by default - only sample_uids present in ALL upstream features
              are included. This ensures we can compute valid data_versions.
            - System columns (sample_uid, data_version) are always preserved
            - User columns are preserved based on columns parameter (default: all)
        """
        pass
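
To make the upstream_columns and upstream_renames parameters concrete, here is the shape of the arguments a caller might pass (feature keys and column names invented for illustration; the NarwhalsJoiner below is the reference implementation):

# Hypothetical upstream feature keys "video" and "audio".
upstream_columns = {
    "video": ("duration", "fps"),  # keep these plus the system columns
    "audio": (),                   # keep only the system columns
    # an omitted key (or None) keeps all columns for that upstream feature
}

upstream_renames = {
    "video": {"duration": "video_duration"},  # applied after column selection
}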

metaxy/data_versioning/joiners/narwhals.py
@@ -0,0 +1,235 @@
"""Narwhals implementation of upstream joiner.

Unified joiner that works with any backend (Polars, Ibis/SQL) through Narwhals.
"""

from typing import TYPE_CHECKING, Any

import narwhals as nw

from metaxy.data_versioning.joiners.base import UpstreamJoiner
from metaxy.models.constants import DROPPABLE_SYSTEM_COLUMNS, ESSENTIAL_SYSTEM_COLUMNS

if TYPE_CHECKING:
    from metaxy.models.feature_spec import FeatureSpec
    from metaxy.models.plan import FeaturePlan


class NarwhalsJoiner(UpstreamJoiner):
    """Joins upstream features using Narwhals LazyFrames.

    Type Parameters:
        TRef = nw.LazyFrame (works with Polars, Ibis, Pandas, PyArrow)

    Strategy:
    - Starts with first upstream feature
    - Sequentially inner joins remaining upstream features on sample_uid
    - Renames data_version columns to avoid conflicts
    - All operations are lazy (no materialization until collect)
    - Backend-agnostic: same code works for in-memory and SQL backends

    The underlying backend (Polars vs Ibis) is determined by what's wrapped:
    - nw.from_native(pl.LazyFrame) → stays in Polars
    - nw.from_native(ibis.Table) → stays in SQL until collect()
    """

    def join_upstream(
        self,
        upstream_refs: dict[str, nw.LazyFrame[Any]],
        feature_spec: "FeatureSpec",
        feature_plan: "FeaturePlan",
        upstream_columns: dict[str, tuple[str, ...] | None] | None = None,
        upstream_renames: dict[str, dict[str, str] | None] | None = None,
    ) -> tuple[nw.LazyFrame[Any], dict[str, str]]:
        """Join upstream Narwhals LazyFrames together with column selection/renaming.

        Args:
            upstream_refs: Dict of upstream feature key -> Narwhals LazyFrame
            feature_spec: Feature specification
            feature_plan: Feature plan
            upstream_columns: Optional column selection per upstream feature
            upstream_renames: Optional column renaming per upstream feature

        Returns:
            (joined Narwhals LazyFrame, column mapping)
        """
        if not upstream_refs:
            # No upstream dependencies - source feature
            # Return empty LazyFrame with just sample_uid column (with proper type)
            import polars as pl

            # Create empty frame with explicit Int64 type for sample_uid
            # This ensures it's not NULL-typed which would fail with Ibis backends
            empty_df = pl.LazyFrame(
                {"sample_uid": pl.Series("sample_uid", [], dtype=pl.Int64)}
            )
            return nw.from_native(empty_df), {}

        # Initialize parameters if not provided
        upstream_columns = upstream_columns or {}
        upstream_renames = upstream_renames or {}

        # Use imported constants for system columns
        system_cols = ESSENTIAL_SYSTEM_COLUMNS
        system_cols_to_drop = DROPPABLE_SYSTEM_COLUMNS

        # Track all column names to detect conflicts
        all_columns: dict[str, str] = {}  # column_name -> source_feature

        # Process and join upstream features
        upstream_keys = sorted(upstream_refs.keys())
        first_key = upstream_keys[0]
        upstream_mapping = {}

        # Process first upstream feature
        first_ref = upstream_refs[first_key]
        first_columns_spec = upstream_columns.get(first_key)
        first_renames_spec = upstream_renames.get(first_key) or {}

        # Get column names from first upstream
        # We need to collect schema to know available columns
        # Use lazy evaluation where possible
        first_schema = first_ref.collect_schema()
        available_cols = set(first_schema.names())

        # Determine columns to select
        if first_columns_spec is None:
            # Keep all columns (new default behavior) except problematic system columns
            cols_to_select = [c for c in available_cols if c not in system_cols_to_drop]
        elif first_columns_spec == ():
            # Keep only essential system columns
            cols_to_select = [c for c in available_cols if c in system_cols]
        else:
            # Keep specified columns plus essential system columns
            requested = set(first_columns_spec)
            # Filter out problematic system columns even if requested
            requested = requested - system_cols_to_drop
            cols_to_select = list(requested | (available_cols & system_cols))

            # Warn about missing columns
            missing = requested - available_cols
            if missing:
                import warnings

                warnings.warn(
                    f"Columns {missing} requested but not found in upstream feature {first_key}",
                    UserWarning,
                )

        # Build select expressions with renaming for first upstream
        select_exprs = []
        for col in cols_to_select:
            if col == "data_version":
                # Always rename data_version to avoid conflicts
                new_name = f"__upstream_{first_key}__data_version"
                select_exprs.append(nw.col(col).alias(new_name))
                upstream_mapping[first_key] = new_name
            elif col in first_renames_spec:
                # Apply user-specified rename
                new_name = first_renames_spec[col]
                if new_name in all_columns:
                    raise ValueError(
                        f"Column name conflict: '{new_name}' from {first_key} "
                        f"conflicts with column from {all_columns[new_name]}. "
                        f"Use the 'rename' parameter to resolve the conflict."
                    )
                select_exprs.append(nw.col(col).alias(new_name))
                all_columns[new_name] = first_key
            else:
                # Keep original name
                if col != "sample_uid" and col in all_columns:
                    raise ValueError(
                        f"Column name conflict: '{col}' appears in both "
                        f"{first_key} and {all_columns[col]}. "
                        f"Use the 'rename' parameter to resolve the conflict."
                    )
                select_exprs.append(nw.col(col))
                if col != "sample_uid":
                    all_columns[col] = first_key

        joined = first_ref.select(select_exprs)

        # Join remaining upstream features
        for upstream_key in upstream_keys[1:]:
            upstream_ref = upstream_refs[upstream_key]
            columns_spec = upstream_columns.get(upstream_key)
            renames_spec = upstream_renames.get(upstream_key) or {}

            # Get available columns
            schema = upstream_ref.collect_schema()
            available_cols = set(schema.names())

            # Determine columns to select
            if columns_spec is None:
                # Keep all columns except problematic system columns
                cols_to_select = [
                    c for c in available_cols if c not in system_cols_to_drop
                ]
            elif columns_spec == ():
                # Keep only essential system columns
                cols_to_select = [c for c in available_cols if c in system_cols]
            else:
                # Keep specified columns plus essential system columns
                requested = set(columns_spec)
                # Filter out problematic system columns even if requested
                requested = requested - system_cols_to_drop
                cols_to_select = list(requested | (available_cols & system_cols))

                # Warn about missing columns
                missing = requested - available_cols
                if missing:
                    import warnings

                    warnings.warn(
                        f"Columns {missing} requested but not found in upstream feature {upstream_key}",
                        UserWarning,
                    )

            # Build select expressions with renaming
            select_exprs = []
            join_cols = []  # Columns to include in join (exclude sample_uid)

            for col in cols_to_select:
                if col == "sample_uid":
                    # Always include sample_uid for joining, but don't duplicate it
                    select_exprs.append(nw.col(col))
                elif col == "data_version":
                    # Always rename data_version to avoid conflicts
                    new_name = f"__upstream_{upstream_key}__data_version"
                    select_exprs.append(nw.col(col).alias(new_name))
                    join_cols.append(new_name)
                    upstream_mapping[upstream_key] = new_name
                elif col in renames_spec:
                    # Apply user-specified rename
                    new_name = renames_spec[col]
                    if new_name in all_columns:
                        raise ValueError(
                            f"Column name conflict: '{new_name}' from {upstream_key} "
                            f"conflicts with column from {all_columns[new_name]}. "
                            f"Use the 'rename' parameter to resolve the conflict."
                        )
                    select_exprs.append(nw.col(col).alias(new_name))
                    join_cols.append(new_name)
                    all_columns[new_name] = upstream_key
                else:
                    # Keep original name
                    if col in all_columns:
                        raise ValueError(
                            f"Column name conflict: '{col}' appears in both "
                            f"{upstream_key} and {all_columns[col]}. "
                            f"Use the 'rename' parameter to resolve the conflict."
                        )
                    select_exprs.append(nw.col(col))
                    join_cols.append(col)
                    all_columns[col] = upstream_key

            upstream_renamed = upstream_ref.select(select_exprs)

            # Join with existing data
            joined = joined.join(
                upstream_renamed,
                on="sample_uid",
                how="inner",  # Only sample_uids present in ALL upstream
            )

        return joined, upstream_mapping
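
A rough end-to-end sketch of this joiner (values invented; because feature_spec and feature_plan are never consulted in the method body above, the sketch passes None for both, which real callers would not do):

import narwhals as nw
import polars as pl

from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner

video = nw.from_native(
    pl.LazyFrame({"sample_uid": [1, 2], "data_version": ["v1", "v2"], "fps": [24, 30]})
)
audio = nw.from_native(
    pl.LazyFrame({"sample_uid": [1, 2], "data_version": ["a1", "a2"]})
)

joined, mapping = NarwhalsJoiner().join_upstream(
    upstream_refs={"video": video, "audio": audio},
    feature_spec=None,   # type: ignore[arg-type]  # unused by this implementation
    feature_plan=None,   # type: ignore[arg-type]
    upstream_columns={"audio": ()},  # only system columns from "audio"
)

print(mapping)
# Expected shape: {"audio": "__upstream_audio__data_version",
#                  "video": "__upstream_video__data_version"}
print(joined.collect().to_native())  # sample_uid, both renamed data_versions, fps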