metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,371 @@
1
+ """Delta Lake metadata store implemented with delta-rs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterator, Sequence
6
+ from contextlib import contextmanager
7
+ from functools import cached_property
8
+ from pathlib import Path
9
+ from typing import Any, Literal
10
+
11
+ import deltalake
12
+ import narwhals as nw
13
+ import polars as pl
14
+ from narwhals.typing import Frame
15
+ from pydantic import Field
16
+ from typing_extensions import Self
17
+
18
+ from metaxy._utils import switch_implementation_to_polars
19
+ from metaxy.metadata_store.base import MetadataStore, MetadataStoreConfig
20
+ from metaxy.metadata_store.types import AccessMode
21
+ from metaxy.metadata_store.utils import is_local_path
22
+ from metaxy.models.plan import FeaturePlan
23
+ from metaxy.models.types import CoercibleToFeatureKey, FeatureKey
24
+ from metaxy.versioning.polars import PolarsVersioningEngine
25
+ from metaxy.versioning.types import HashAlgorithm
26
+
27
+
28
class DeltaMetadataStoreConfig(MetadataStoreConfig):
    """Configuration for DeltaMetadataStore.

    Example:
        ```python
        config = DeltaMetadataStoreConfig(
            root_path="s3://my-bucket/metaxy",
            storage_options={"AWS_REGION": "us-west-2"},
            layout="nested",
        )

        store = DeltaMetadataStore.from_config(config)
        ```
    """

    # Each field mirrors the DeltaMetadataStore.__init__ argument of the same
    # name and carries the same default.

    # Required: where the per-feature Delta tables live (local path or URI).
    root_path: str | Path = Field(
        description="Base directory or URI where feature tables are stored.",
    )

    # Optional credentials / endpoint settings handed through to delta-rs.
    storage_options: dict[str, Any] | None = Field(
        default=None,
        description="Storage backend options passed to delta-rs.",
    )

    # How a feature key is turned into a table path below ``root_path``.
    layout: Literal["flat", "nested"] = Field(
        default="nested",
        description="Directory layout for feature tables ('nested' or 'flat').",
    )

    # Extra keyword options for the underlying Delta write call.
    delta_write_options: dict[str, Any] | None = Field(
        default=None,
        description="Options passed to deltalake.write_deltalake().",
    )
58
+
59
+
60
class DeltaMetadataStore(MetadataStore):
    """
    Delta Lake metadata store backed by [delta-rs](https://github.com/delta-io/delta-rs).

    It stores feature metadata in Delta Lake tables located under ``root_path``
    (one table per feature) and uses the Polars versioning engine for
    provenance calculations.

    Example:

        ```py
        from metaxy.metadata_store.delta import DeltaMetadataStore

        store = DeltaMetadataStore(
            root_path="s3://my-bucket/metaxy",
            storage_options={"AWS_REGION": "us-west-2"},
        )
        ```
    """

    # NOTE(review): presumably read by the MetadataStore base class to decide
    # whether to warn about auto-created tables -- confirm against base.py.
    _should_warn_auto_create_tables = False

    def __init__(
        self,
        root_path: str | Path,
        *,
        storage_options: dict[str, Any] | None = None,
        fallback_stores: list[MetadataStore] | None = None,
        layout: Literal["flat", "nested"] = "nested",
        delta_write_options: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initialize Delta Lake metadata store.

        Args:
            root_path: Base directory or URI where feature tables are stored.
                Supports local paths (`/path/to/dir`), `s3://` URLs, and other object store URIs.
            storage_options: Storage backend options passed to delta-rs.
                Example: `{"AWS_REGION": "us-west-2", "AWS_ACCESS_KEY_ID": "...", ...}`
                See https://delta-io.github.io/delta-rs/ for details on supported options.
            fallback_stores: Ordered list of read-only fallback stores.
            layout: Directory layout for feature tables. Options:

                - `"nested"`: Feature tables stored in nested directories `{part1}/{part2}.delta`

                - `"flat"`: Feature tables stored as `{part1}__{part2}.delta`

            delta_write_options: Additional options passed to deltalake.write_deltalake() - see https://delta-io.github.io/delta-rs/upgrade-guides/guide-1.0.0/#write_deltalake-api.
                Overrides default {"schema_mode": "merge"}. Example: {"max_workers": 4}
            **kwargs: Forwarded to [metaxy.metadata_store.base.MetadataStore][metaxy.metadata_store.base.MetadataStore].

        Raises:
            ValueError: If ``layout`` is neither ``"flat"`` nor ``"nested"``.
        """
        self.storage_options = storage_options or {}
        if layout not in ("flat", "nested"):
            raise ValueError(f"Invalid layout: {layout}. Must be 'flat' or 'nested'.")
        self.layout = layout
        self.delta_write_options = delta_write_options or {}

        root_str = str(root_path)
        self._is_remote = not is_local_path(root_str)

        if self._is_remote:
            # Remote path (S3, Azure, GCS, etc.): keep the URI as given, minus
            # trailing slashes so _feature_uri() can join with a single "/".
            self._root_uri = root_str.rstrip("/")
        else:
            # Local path (including file:// and local:// URLs): drop the scheme.
            # Only the first matching scheme is removed.
            for scheme in ("file://", "local://"):
                if root_str.startswith(scheme):
                    root_str = root_str.removeprefix(scheme)
                    break
            local_path = Path(root_str).expanduser().resolve()
            self._root_uri = str(local_path)

        super().__init__(
            fallback_stores=fallback_stores,
            versioning_engine_cls=PolarsVersioningEngine,
            versioning_engine="polars",
            **kwargs,
        )

    # ===== MetadataStore abstract methods =====

    def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
        """Check if feature exists in Delta store.

        Args:
            feature: Feature to check

        Returns:
            True if a Delta table can be opened at the feature's URI, False otherwise
        """
        feature_key = self._resolve_feature_key(feature)
        return self._table_exists(self._feature_uri(feature_key))

    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Use XXHASH64 by default to match other non-SQL stores."""
        return HashAlgorithm.XXHASH64

    @contextmanager
    def _create_versioning_engine(
        self, plan: FeaturePlan
    ) -> Iterator[PolarsVersioningEngine]:
        """Create Polars versioning engine for Delta store.

        Thin wrapper that delegates to ``_create_polars_versioning_engine``
        (defined outside this class body) and yields its engine.
        """
        with self._create_polars_versioning_engine(plan) as engine:
            yield engine

    @contextmanager
    def open(self, mode: AccessMode = "read") -> Iterator[Self]:  # noqa: ARG002
        """Open the Delta Lake store.

        Delta-rs opens connections lazily per operation, so no connection state management needed.
        The context may be entered recursively; only the outermost entry/exit
        flips the open flag.

        Args:
            mode: Access mode for this connection session (accepted for consistency but not used).

        Yields:
            Self: The store instance with connection open
        """
        # Increment context depth to support nested contexts
        self._context_depth += 1

        try:
            # Only perform actual open on first entry
            if self._context_depth == 1:
                # Mark store as open and validate
                # Note: Delta auto-creates tables on first write, no need to pre-create them
                self._is_open = True
                self._validate_after_open()

            yield self
        finally:
            # Decrement context depth
            self._context_depth -= 1

            # Only perform actual close on last exit
            if self._context_depth == 0:
                self._is_open = False

    @cached_property
    def default_delta_write_options(self) -> dict[str, Any]:
        """Default write options for Delta Lake operations.

        Merges base defaults with user-provided delta_write_options.
        Base defaults: mode="append", schema_mode="merge", storage_options.

        The result is computed once per instance (``cached_property``), so
        callers that need to modify it should work on a copy.
        """
        write_kwargs: dict[str, Any] = {
            "mode": "append",
            "schema_mode": "merge",  # Allow schema evolution
            "storage_options": self.storage_options or None,
        }
        # Override with custom options from constructor
        write_kwargs.update(self.delta_write_options)
        return write_kwargs

    # ===== Internal helpers =====

    def _feature_uri(self, feature_key: FeatureKey) -> str:
        """Return the URI/path used by deltalake for this feature."""
        if self.layout == "nested":
            # Nested layout: store in directories like "part1/part2/part3".
            # Empty parts are skipped so the joined path never contains an
            # empty segment (a leading "/" or a "//").
            table_path = "/".join(part for part in feature_key.parts if part)
        else:
            # Flat layout: store in directories like "part1__part2__part3"
            # table_name already handles this correctly via __join
            table_path = feature_key.table_name
        return f"{self._root_uri}/{table_path}.delta"

    def _table_exists(self, table_uri: str) -> bool:
        """Check whether the provided URI already contains a Delta table.

        Works for both local and remote (object store) paths.

        Returns:
            False if constructing the table raises delta-rs' TableNotFoundError,
            True otherwise. Any other exception propagates to the caller.
        """
        # for weird reasons deltalake.DeltaTable.is_deltatable() sometimes hangs in multi-threading settings
        # but a deltalake.DeltaTable can be constructed just fine
        # so we are relying on DeltaTableNotFoundError to check for existence
        from deltalake.exceptions import TableNotFoundError as DeltaTableNotFoundError

        try:
            # Existence probe only: the file listing is not needed, and an empty
            # options dict is normalised to None like every other call here.
            _ = deltalake.DeltaTable(
                table_uri,
                storage_options=self.storage_options or None,
                without_files=True,
            )
        except DeltaTableNotFoundError:
            return False
        return True

    # ===== Storage operations =====

    def write_metadata_to_store(
        self,
        feature_key: FeatureKey,
        df: Frame,
        **kwargs: Any,
    ) -> None:
        """Append metadata to the Delta table for a feature.

        Args:
            feature_key: Feature key to write to
            df: DataFrame with metadata (already validated)
            **kwargs: Backend-specific parameters (currently unused)
        """
        table_uri = self._feature_uri(feature_key)

        # Delta Lake auto-creates tables on first write, no need to check existence
        # Convert to Polars and collect lazy frames
        df_polars = switch_implementation_to_polars(df)

        # Collect lazy frames, keep eager frames as-is
        if isinstance(df_polars, nw.LazyFrame):
            df_native = df_polars.collect().to_native()
        else:
            df_native = df_polars.to_native()

        assert isinstance(df_native, pl.DataFrame)

        # Cast Enum columns to String to avoid delta-rs Utf8View incompatibility
        # (delta-rs parquet writer cannot handle Utf8View dictionary values)
        df_native = df_native.with_columns(pl.selectors.by_dtype(pl.Enum).cast(pl.Utf8))

        # Prepare write parameters for Polars write_delta.
        # Work on a copy: the cached defaults must not be mutated by pop().
        write_opts = self.default_delta_write_options.copy()
        # mode and storage_options are top-level write_delta parameters; the
        # remaining entries are forwarded as delta_write_options.
        mode = write_opts.pop("mode", "append")
        storage_options = write_opts.pop("storage_options", None)

        # Write using Polars DataFrame.write_delta
        df_native.write_delta(
            table_uri,
            mode=mode,
            storage_options=storage_options,
            delta_write_options=write_opts or None,
        )

    def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
        """Drop all metadata rows of the specified feature.

        Calls ``DeltaTable.delete()`` without a predicate, which deletes every
        record of the table through the Delta transaction log. The table itself
        is not removed. Does nothing if no table exists for the feature.
        """
        table_uri = self._feature_uri(feature_key)

        # Check if table exists first
        if not self._table_exists(table_uri):
            return

        # Load the Delta table with file tracking enabled: DELETE operates on
        # the table's active file set, and delta-rs documents `without_files`
        # as intended for append-only use.
        delta_table = deltalake.DeltaTable(
            table_uri,
            storage_options=self.storage_options or None,
        )

        # Use Delta's delete operation with no predicate: all rows are deleted
        delta_table.delete()

    def read_metadata_in_store(
        self,
        feature: CoercibleToFeatureKey,
        *,
        filters: Sequence[nw.Expr] | None = None,
        columns: Sequence[str] | None = None,
        **kwargs: Any,
    ) -> nw.LazyFrame[Any] | None:
        """Read metadata stored in Delta for a single feature using lazy evaluation.

        Args:
            feature: Feature to read metadata for
            filters: List of Narwhals filter expressions
            columns: Subset of columns to return
            **kwargs: Backend-specific parameters (currently unused)

        Returns:
            A Narwhals lazy frame over the feature's Delta table with filters
            and column selection applied (in that order), or None if no table
            exists for the feature.
        """
        self._check_open()

        feature_key = self._resolve_feature_key(feature)
        table_uri = self._feature_uri(feature_key)
        if not self._table_exists(table_uri):
            return None

        # Use scan_delta for lazy evaluation
        lf = pl.scan_delta(
            table_uri,
            storage_options=self.storage_options or None,
        )

        # Convert to Narwhals
        nw_lazy = nw.from_native(lf)

        # Apply filters (unpack list, skip if empty)
        if filters:
            nw_lazy = nw_lazy.filter(*filters)

        # Apply column selection
        if columns is not None:
            nw_lazy = nw_lazy.select(columns)

        return nw_lazy

    def display(self) -> str:
        """Return human-readable representation of the store."""
        details = [f"path={self._root_uri}"]
        details.append(f"layout={self.layout}")
        return f"DeltaMetadataStore({', '.join(details)})"

    def get_store_metadata(self, feature_key: CoercibleToFeatureKey) -> dict[str, Any]:
        """Return store-specific metadata for a feature: its Delta table URI under ``"uri"``."""
        return {"uri": self._feature_uri(self._resolve_feature_key(feature_key))}

    @classmethod
    def config_model(cls) -> type[DeltaMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Return the configuration model class associated with this store."""
        return DeltaMetadataStoreConfig