metaxy 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metaxy might be problematic. Click here for more details.

Files changed (75)
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,150 @@
1
+ """Abstract base class for metadata diff resolvers."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, Any, NamedTuple
5
+
6
+ import narwhals as nw
7
+
8
+ if TYPE_CHECKING:
9
+ pass
10
+
11
+
12
class LazyDiffResult(NamedTuple):
    """Diff result holding lazy Narwhals LazyFrames (opt-in via ``lazy=True``).

    Nothing is materialized until the caller decides to collect, so the
    backend keeps control of execution:
        - SQL stores: every operation stays in SQL until ``.collect()``
        - Polars stores: operations stay lazy until ``.collect()``

    Typical usage:
        - Keep lazy for further operations: ``result.added.filter(...)``
        - Materialize to Polars: ``result.added.collect().to_native()``
        - Materialize to Pandas: ``result.added.collect().to_pandas()``
        - Materialize to PyArrow: ``result.added.collect().to_arrow()``
        - Materialize everything at once: ``result.collect()``

    Each field is a lazy frame and never None — an empty LazyFrame stands in
    for "no rows". All frames carry at least ``sample_uid`` and
    ``data_version``; extra user columns may be present depending on what was
    passed to ``resolve_update()`` via ``align_upstream_metadata``.
    """

    # New samples not present in current metadata.
    added: nw.LazyFrame[Any]
    # Samples whose data_version differs from current metadata.
    changed: nw.LazyFrame[Any]
    # Samples present in current metadata but absent from the target.
    removed: nw.LazyFrame[Any]

    def collect(self) -> "DiffResult":
        """Materialize every lazy frame into an eager :class:`DiffResult`."""
        eager_frames = (frame.collect() for frame in self)
        # Field order of both NamedTuples matches: added, changed, removed.
        return DiffResult(*eager_frames)
56
+
57
+
58
class DiffResult(NamedTuple):
    """Diff result holding eager Narwhals DataFrames (the default).

    All frames are already materialized and ready for immediate use. Convert
    to a preferred native format with:
        - Polars: ``result.added.to_native()``
        - Pandas: ``result.added.to_pandas()``
        - PyArrow: ``result.added.to_arrow()``

    Each field is eager and never None — an empty DataFrame stands in for
    "no rows". All frames carry at least ``sample_uid`` and ``data_version``;
    extra user columns may be present depending on what was passed to
    ``resolve_update()`` via ``align_upstream_metadata``.
    """

    # New samples not present in current metadata.
    added: nw.DataFrame[Any]
    # Samples whose data_version differs from current metadata.
    changed: nw.DataFrame[Any]
    # Samples present in current metadata but absent from the target.
    removed: nw.DataFrame[Any]
84
+
85
+
86
class MetadataDiffResolver(ABC):
    """Finds rows whose data_version changed by comparing target with current.

    A diff resolver compares freshly calculated data_versions (the *target*)
    against existing metadata (the *current*) to decide what must be written.

    This is Step 3 of the data versioning pipeline:
        1. Join upstream features        -> unified upstream view
        2. Calculate data_version        -> target versions
        3. Diff with current metadata    -> identify changes  <- THIS STEP

    Every component boundary uses Narwhals LazyFrames, keeping the process
    backend-agnostic.

    Known implementations:
        - NarwhalsDiffResolver: backend-agnostic, pure Narwhals expressions
        - IbisDiffResolver: converts to Ibis internally for SQL processing

    Design note:
        Inputs are lazy Narwhals refs and the output is a LazyDiffResult.
        Keeping everything lazy minimizes query execution: SQL backends can
        compute all three categories in one CTE-based query, and Polars
        splits into three LazyFrames without collecting.

    Customization:
        Users may override ``Feature.resolve_data_version_diff`` to ignore
        particular field changes, apply bespoke change-detection rules, or
        filter out specific samples.
    """

    @abstractmethod
    def find_changes(
        self,
        target_versions: nw.LazyFrame[Any],
        current_metadata: nw.LazyFrame[Any] | None,
    ) -> LazyDiffResult:
        """Categorize every difference between target and current metadata.

        Args:
            target_versions: LazyFrame of newly calculated data_versions.
                Shape: [sample_uid, data_version (calculated), upstream columns...]
            current_metadata: LazyFrame of current metadata, or None when no
                metadata exists yet.
                Shape: [sample_uid, data_version (existing), feature_version, custom columns...]
                Should be pre-filtered by feature_version at the caller level
                if needed.

        Returns:
            LazyDiffResult with three lazy Narwhals LazyFrames. The caller
            materializes to a DiffResult when needed (``lazy=False``).

        Implementation note:
            Build lazy operations without materializing: SQL backends should
            express all three categories in a single lazy query (CTEs);
            Polars implementations must avoid ``collect()`` calls.

        Note:
            For immutable append-only storage, typically only 'added' and
            'changed' are written; 'removed' is useful for validation and
            reporting. Feature-version filtering belongs at the
            ``read_metadata()`` level — the resolver just compares whatever
            metadata it is handed.
        """
        ...
@@ -0,0 +1,108 @@
1
+ """Narwhals implementation of metadata diff resolver.
2
+
3
+ Unified diff resolver that works with any backend (Polars, Ibis/SQL) through Narwhals.
4
+ """
5
+
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ import narwhals as nw
9
+
10
+ from metaxy.data_versioning.diff.base import (
11
+ LazyDiffResult,
12
+ MetadataDiffResolver,
13
+ )
14
+
15
+ if TYPE_CHECKING:
16
+ pass
17
+
18
+
19
class NarwhalsDiffResolver(MetadataDiffResolver):
    """Identifies changed rows using Narwhals operations.

    Works with any backend Narwhals wraps (Polars, Ibis, Pandas, PyArrow);
    the wrapped native object determines where execution happens:
        - Polars backend -> operations happen in-memory
        - Ibis backend -> operations happen in the SQL database

    Strategy:
        - LEFT JOIN target onto current to classify added vs changed rows
        - ANTI JOIN current against target to find removed rows
        - Everything stays lazy; the caller materializes if needed
    """

    def find_changes(
        self,
        target_versions: nw.LazyFrame[Any],
        current_metadata: nw.LazyFrame[Any] | None,
    ) -> LazyDiffResult:
        """Find all changes between target and current.

        Args:
            target_versions: Narwhals LazyFrame with calculated data_versions.
            current_metadata: Narwhals LazyFrame with current metadata, or
                None when no metadata has been written yet. Should be
                pre-filtered by feature_version at caller level if needed.

        Returns:
            LazyDiffResult with three lazy Narwhals frames (caller
            materializes if needed).
        """
        # Narrow to the comparison columns only; target may still carry
        # intermediate joined columns from upstream.
        target_versions = target_versions.select(["sample_uid", "data_version"])

        if current_metadata is None:
            # No existing metadata - every target row is new. Derive the
            # empty frames by filtering target to zero rows: this preserves
            # the target's schema AND backend. (The previous implementation
            # built a Polars LazyFrame with Null-typed columns, which broke
            # schema-sensitive backends and mixed backends in one result.)
            empty_lazy = target_versions.filter(nw.lit(False))
            return LazyDiffResult(
                added=target_versions,
                changed=empty_lazy,
                removed=empty_lazy,
            )

        # Alias current's data_version so it can sit beside the target's
        # in a single joined frame.
        current_comparison = current_metadata.select(
            "sample_uid", nw.col("data_version").alias("__current_data_version")
        )

        # LEFT JOIN target with current: unmatched rows surface as NULLs.
        compared = target_versions.join(
            current_comparison,
            on="sample_uid",
            how="left",
        )

        # Added: in target, absent from current (NULL after the left join).
        added_lazy = compared.filter(
            nw.col("__current_data_version").is_null()
        ).select("sample_uid", "data_version")

        # Changed: in both, but the data_version differs.
        changed_lazy = compared.filter(
            ~nw.col("__current_data_version").is_null()
            & (nw.col("data_version") != nw.col("__current_data_version"))
        ).select("sample_uid", "data_version")

        # Removed: in current, absent from target.
        removed_lazy = current_metadata.join(
            target_versions.select("sample_uid"),
            on="sample_uid",
            how="anti",
        ).select("sample_uid", "data_version")

        # Return lazy frames - caller will materialize if needed.
        return LazyDiffResult(
            added=added_lazy,
            changed=changed_lazy,
            removed=removed_lazy,
        )
@@ -0,0 +1,19 @@
1
+ """Hash algorithms supported for data versioning."""
2
+
3
+ from enum import Enum
4
+
5
+
6
class HashAlgorithm(Enum):
    """Hash algorithms supported for computing data versions.

    Selection criteria for this set:
        - speed (non-cryptographic hashes preferred)
        - availability across database backends
        - adequate collision resistance for data versioning
    """

    XXHASH64 = "xxhash64"  # fast; available in DuckDB, ClickHouse, Polars
    XXHASH32 = "xxhash32"  # faster on small inputs, weaker collision resistance
    WYHASH = "wyhash"  # very fast; Polars-specific
    SHA256 = "sha256"  # cryptographic; slower but universally available
    MD5 = "md5"  # legacy; widely available, not recommended for new code
@@ -0,0 +1,9 @@
1
+ """Upstream joiners for merging upstream feature metadata."""
2
+
3
+ from metaxy.data_versioning.joiners.base import UpstreamJoiner
4
+ from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
5
+
6
+ __all__ = [
7
+ "UpstreamJoiner",
8
+ "NarwhalsJoiner",
9
+ ]
@@ -0,0 +1,70 @@
1
+ """Abstract base class for upstream joiners."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ import narwhals as nw
7
+
8
+ if TYPE_CHECKING:
9
+ from metaxy.models.feature_spec import FeatureSpec
10
+ from metaxy.models.plan import FeaturePlan
11
+
12
+
13
class UpstreamJoiner(ABC):
    """Joins upstream feature metadata into a single unified view.

    Upstream metadata already carries data_version columns; the joiner's job
    is to merge those frames so a downstream hash calculator sees all
    dependencies at once.

    This is Step 1 of the data versioning pipeline:
        1. Join upstream features        -> unified upstream view  <- THIS STEP
        2. Calculate data_version        -> target versions
        3. Diff with current metadata    -> identify changes

    Every component boundary uses Narwhals LazyFrames, keeping the process
    backend-agnostic.

    Known implementations:
        - NarwhalsJoiner: backend-agnostic, pure Narwhals expressions
        - IbisJoiner: converts to Ibis internally for SQL processing
    """

    @abstractmethod
    def join_upstream(
        self,
        upstream_refs: dict[str, nw.LazyFrame[Any]],
        feature_spec: "FeatureSpec",
        feature_plan: "FeaturePlan",
        upstream_columns: dict[str, tuple[str, ...] | None] | None = None,
        upstream_renames: dict[str, dict[str, str] | None] | None = None,
    ) -> tuple[nw.LazyFrame[Any], dict[str, str]]:
        """Join all upstream features, with optional column selection/renaming.

        Joins upstream feature metadata on ``sample_uid`` into a unified
        reference containing every upstream data_version column needed for
        hash calculation, plus any additional user-requested columns.

        Args:
            upstream_refs: Upstream feature metadata as Narwhals LazyFrames.
                Keys are upstream feature keys (``to_string()`` format);
                values are the corresponding metadata frames.
            feature_spec: Specification of the feature being computed.
            feature_plan: Resolved feature plan with dependencies.
            upstream_columns: Optional mapping of upstream feature key to the
                tuple of columns to keep. None or a missing key keeps all
                columns; an empty tuple keeps only system columns.
            upstream_renames: Optional mapping of upstream feature key to a
                rename dict, applied after column selection.

        Returns:
            Tuple of ``(joined_ref, upstream_column_mapping)``:
                - joined_ref: LazyFrame with all upstream data joined;
                  contains sample_uid, data_version columns, and user columns.
                - upstream_column_mapping: upstream feature key ->
                  data_version column name, e.g.
                  ``{"video": "__upstream_video__data_version"}``.

        Note:
            - INNER join semantics: only sample_uids present in ALL upstream
              features are kept, ensuring valid data_versions can be computed.
            - System columns (sample_uid, data_version) are always preserved.
            - User columns are preserved per the columns parameter
              (default: all).
        """
        ...
@@ -0,0 +1,235 @@
1
+ """Narwhals implementation of upstream joiner.
2
+
3
+ Unified joiner that works with any backend (Polars, Ibis/SQL) through Narwhals.
4
+ """
5
+
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ import narwhals as nw
9
+
10
+ from metaxy.data_versioning.joiners.base import UpstreamJoiner
11
+ from metaxy.models.constants import DROPPABLE_SYSTEM_COLUMNS, ESSENTIAL_SYSTEM_COLUMNS
12
+
13
+ if TYPE_CHECKING:
14
+ from metaxy.models.feature_spec import FeatureSpec
15
+ from metaxy.models.plan import FeaturePlan
16
+
17
+
18
class NarwhalsJoiner(UpstreamJoiner):
    """Joins upstream features using Narwhals LazyFrames.

    Type Parameters:
        TRef = nw.LazyFrame (works with Polars, Ibis, Pandas, PyArrow)

    Strategy:
        - Processes upstream features in sorted key order for determinism
        - Sequentially inner joins them on sample_uid
        - Renames data_version columns to per-upstream names to avoid clashes
        - All operations are lazy (no materialization until collect)
        - Backend-agnostic: the wrapped native object decides execution:
          nw.from_native(pl.LazyFrame) stays in Polars,
          nw.from_native(ibis.Table) stays in SQL until collect()
    """

    @staticmethod
    def _columns_to_keep(
        upstream_key: str,
        available_cols: set[str],
        columns_spec: tuple[str, ...] | None,
    ) -> list[str]:
        """Resolve one upstream feature's column-selection spec.

        Args:
            upstream_key: Key of the upstream feature (for warnings).
            available_cols: Columns actually present in the upstream frame.
            columns_spec: None = keep all columns; empty tuple = only
                essential system columns; otherwise the requested columns.

        Returns:
            Columns to select (always includes the available essential
            system columns; droppable system columns are always excluded).
        """
        if columns_spec is None:
            # Default: keep everything except problematic system columns.
            return [c for c in available_cols if c not in DROPPABLE_SYSTEM_COLUMNS]
        if columns_spec == ():
            # Empty tuple: keep only essential system columns.
            return [c for c in available_cols if c in ESSENTIAL_SYSTEM_COLUMNS]

        # Specific columns requested; droppable system columns are filtered
        # out even when explicitly asked for.
        requested = set(columns_spec) - DROPPABLE_SYSTEM_COLUMNS
        missing = requested - available_cols
        if missing:
            import warnings

            warnings.warn(
                f"Columns {missing} requested but not found in upstream feature {upstream_key}",
                UserWarning,
            )
        # Fix: missing columns used to remain in the selection after the
        # warning, so the subsequent select() failed anyway. They are now
        # dropped, making the warning meaningful.
        return list((requested & available_cols) | (available_cols & ESSENTIAL_SYSTEM_COLUMNS))

    @staticmethod
    def _select_exprs(
        upstream_key: str,
        cols_to_select: list[str],
        renames: dict[str, str],
        all_columns: dict[str, str],
        upstream_mapping: dict[str, str],
    ) -> list[Any]:
        """Build select expressions for one upstream feature.

        Applies the per-upstream renames and detects cross-feature column
        name conflicts. Mutates ``all_columns`` (column name -> owning
        feature) and ``upstream_mapping`` (feature key -> data_version
        column name) in place.

        Raises:
            ValueError: If a (possibly renamed) column collides with a
                column contributed by another upstream feature.
        """
        exprs: list[Any] = []
        for col in cols_to_select:
            if col == "sample_uid":
                # Join key: always kept under its original name and never
                # renamed (renaming it would break the sample_uid join).
                exprs.append(nw.col(col))
            elif col == "data_version":
                # Always renamed to a per-upstream name to avoid conflicts.
                new_name = f"__upstream_{upstream_key}__data_version"
                exprs.append(nw.col(col).alias(new_name))
                upstream_mapping[upstream_key] = new_name
            elif col in renames:
                # Apply user-specified rename, guarding against collisions.
                new_name = renames[col]
                if new_name in all_columns:
                    raise ValueError(
                        f"Column name conflict: '{new_name}' from {upstream_key} "
                        f"conflicts with column from {all_columns[new_name]}. "
                        f"Use the 'rename' parameter to resolve the conflict."
                    )
                exprs.append(nw.col(col).alias(new_name))
                all_columns[new_name] = upstream_key
            else:
                # Keep original name, guarding against collisions.
                if col in all_columns:
                    raise ValueError(
                        f"Column name conflict: '{col}' appears in both "
                        f"{upstream_key} and {all_columns[col]}. "
                        f"Use the 'rename' parameter to resolve the conflict."
                    )
                exprs.append(nw.col(col))
                all_columns[col] = upstream_key
        return exprs

    def join_upstream(
        self,
        upstream_refs: dict[str, nw.LazyFrame[Any]],
        feature_spec: "FeatureSpec",
        feature_plan: "FeaturePlan",
        upstream_columns: dict[str, tuple[str, ...] | None] | None = None,
        upstream_renames: dict[str, dict[str, str] | None] | None = None,
    ) -> tuple[nw.LazyFrame[Any], dict[str, str]]:
        """Join upstream Narwhals LazyFrames with column selection/renaming.

        Args:
            upstream_refs: Dict of upstream feature key -> Narwhals LazyFrame.
            feature_spec: Feature specification.
            feature_plan: Feature plan.
            upstream_columns: Optional column selection per upstream feature.
            upstream_renames: Optional column renaming per upstream feature.

        Returns:
            (joined Narwhals LazyFrame, upstream key -> data_version column
            name mapping)
        """
        if not upstream_refs:
            # No upstream dependencies - source feature. Return an empty
            # LazyFrame with just sample_uid, explicitly Int64-typed so it
            # is not NULL-typed (which would fail with Ibis backends).
            import polars as pl

            empty_df = pl.LazyFrame(
                {"sample_uid": pl.Series("sample_uid", [], dtype=pl.Int64)}
            )
            return nw.from_native(empty_df), {}

        upstream_columns = upstream_columns or {}
        upstream_renames = upstream_renames or {}

        # Track column names already claimed, to detect conflicts.
        all_columns: dict[str, str] = {}  # column_name -> source feature key
        upstream_mapping: dict[str, str] = {}

        joined: nw.LazyFrame[Any] | None = None
        # Sorted order makes join order (and thus column order) deterministic.
        for upstream_key in sorted(upstream_refs):
            upstream_ref = upstream_refs[upstream_key]
            # collect_schema() is needed to learn the available columns; the
            # data itself stays lazy.
            available_cols = set(upstream_ref.collect_schema().names())
            cols_to_select = self._columns_to_keep(
                upstream_key, available_cols, upstream_columns.get(upstream_key)
            )
            select_exprs = self._select_exprs(
                upstream_key,
                cols_to_select,
                upstream_renames.get(upstream_key) or {},
                all_columns,
                upstream_mapping,
            )
            upstream_renamed = upstream_ref.select(select_exprs)

            if joined is None:
                joined = upstream_renamed
            else:
                # INNER join: keep only sample_uids present in ALL upstream.
                joined = joined.join(
                    upstream_renamed,
                    on="sample_uid",
                    how="inner",
                )

        return joined, upstream_mapping