metaxy 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of metaxy might be problematic. Click here for more details.

Files changed (74) hide show
  1. metaxy-0.0.0/PKG-INFO +247 -0
  2. metaxy-0.0.0/README.md +220 -0
  3. metaxy-0.0.0/pyproject.toml +112 -0
  4. metaxy-0.0.0/src/metaxy/__init__.py +61 -0
  5. metaxy-0.0.0/src/metaxy/_testing.py +542 -0
  6. metaxy-0.0.0/src/metaxy/_utils.py +16 -0
  7. metaxy-0.0.0/src/metaxy/_version.py +1 -0
  8. metaxy-0.0.0/src/metaxy/cli/app.py +76 -0
  9. metaxy-0.0.0/src/metaxy/cli/context.py +71 -0
  10. metaxy-0.0.0/src/metaxy/cli/graph.py +576 -0
  11. metaxy-0.0.0/src/metaxy/cli/graph_diff.py +290 -0
  12. metaxy-0.0.0/src/metaxy/cli/list.py +42 -0
  13. metaxy-0.0.0/src/metaxy/cli/metadata.py +271 -0
  14. metaxy-0.0.0/src/metaxy/cli/migrations.py +862 -0
  15. metaxy-0.0.0/src/metaxy/cli/push.py +55 -0
  16. metaxy-0.0.0/src/metaxy/config.py +450 -0
  17. metaxy-0.0.0/src/metaxy/data_versioning/__init__.py +24 -0
  18. metaxy-0.0.0/src/metaxy/data_versioning/calculators/__init__.py +13 -0
  19. metaxy-0.0.0/src/metaxy/data_versioning/calculators/base.py +97 -0
  20. metaxy-0.0.0/src/metaxy/data_versioning/calculators/duckdb.py +186 -0
  21. metaxy-0.0.0/src/metaxy/data_versioning/calculators/ibis.py +225 -0
  22. metaxy-0.0.0/src/metaxy/data_versioning/calculators/polars.py +135 -0
  23. metaxy-0.0.0/src/metaxy/data_versioning/diff/__init__.py +15 -0
  24. metaxy-0.0.0/src/metaxy/data_versioning/diff/base.py +150 -0
  25. metaxy-0.0.0/src/metaxy/data_versioning/diff/narwhals.py +108 -0
  26. metaxy-0.0.0/src/metaxy/data_versioning/hash_algorithms.py +19 -0
  27. metaxy-0.0.0/src/metaxy/data_versioning/joiners/__init__.py +9 -0
  28. metaxy-0.0.0/src/metaxy/data_versioning/joiners/base.py +70 -0
  29. metaxy-0.0.0/src/metaxy/data_versioning/joiners/narwhals.py +235 -0
  30. metaxy-0.0.0/src/metaxy/entrypoints.py +309 -0
  31. metaxy-0.0.0/src/metaxy/ext/__init__.py +1 -0
  32. metaxy-0.0.0/src/metaxy/ext/alembic.py +326 -0
  33. metaxy-0.0.0/src/metaxy/ext/sqlmodel.py +172 -0
  34. metaxy-0.0.0/src/metaxy/ext/sqlmodel_system_tables.py +139 -0
  35. metaxy-0.0.0/src/metaxy/graph/__init__.py +21 -0
  36. metaxy-0.0.0/src/metaxy/graph/diff/__init__.py +21 -0
  37. metaxy-0.0.0/src/metaxy/graph/diff/diff_models.py +399 -0
  38. metaxy-0.0.0/src/metaxy/graph/diff/differ.py +740 -0
  39. metaxy-0.0.0/src/metaxy/graph/diff/models.py +418 -0
  40. metaxy-0.0.0/src/metaxy/graph/diff/rendering/__init__.py +18 -0
  41. metaxy-0.0.0/src/metaxy/graph/diff/rendering/base.py +274 -0
  42. metaxy-0.0.0/src/metaxy/graph/diff/rendering/cards.py +188 -0
  43. metaxy-0.0.0/src/metaxy/graph/diff/rendering/formatter.py +805 -0
  44. metaxy-0.0.0/src/metaxy/graph/diff/rendering/graphviz.py +246 -0
  45. metaxy-0.0.0/src/metaxy/graph/diff/rendering/mermaid.py +320 -0
  46. metaxy-0.0.0/src/metaxy/graph/diff/rendering/rich.py +165 -0
  47. metaxy-0.0.0/src/metaxy/graph/diff/rendering/theme.py +48 -0
  48. metaxy-0.0.0/src/metaxy/graph/diff/traversal.py +247 -0
  49. metaxy-0.0.0/src/metaxy/graph/utils.py +58 -0
  50. metaxy-0.0.0/src/metaxy/metadata_store/__init__.py +31 -0
  51. metaxy-0.0.0/src/metaxy/metadata_store/_protocols.py +38 -0
  52. metaxy-0.0.0/src/metaxy/metadata_store/base.py +1676 -0
  53. metaxy-0.0.0/src/metaxy/metadata_store/clickhouse.py +161 -0
  54. metaxy-0.0.0/src/metaxy/metadata_store/duckdb.py +167 -0
  55. metaxy-0.0.0/src/metaxy/metadata_store/exceptions.py +43 -0
  56. metaxy-0.0.0/src/metaxy/metadata_store/ibis.py +451 -0
  57. metaxy-0.0.0/src/metaxy/metadata_store/memory.py +228 -0
  58. metaxy-0.0.0/src/metaxy/metadata_store/sqlite.py +187 -0
  59. metaxy-0.0.0/src/metaxy/metadata_store/system_tables.py +257 -0
  60. metaxy-0.0.0/src/metaxy/migrations/__init__.py +34 -0
  61. metaxy-0.0.0/src/metaxy/migrations/detector.py +153 -0
  62. metaxy-0.0.0/src/metaxy/migrations/executor.py +208 -0
  63. metaxy-0.0.0/src/metaxy/migrations/loader.py +260 -0
  64. metaxy-0.0.0/src/metaxy/migrations/models.py +718 -0
  65. metaxy-0.0.0/src/metaxy/migrations/ops.py +390 -0
  66. metaxy-0.0.0/src/metaxy/models/__init__.py +0 -0
  67. metaxy-0.0.0/src/metaxy/models/bases.py +6 -0
  68. metaxy-0.0.0/src/metaxy/models/constants.py +24 -0
  69. metaxy-0.0.0/src/metaxy/models/feature.py +665 -0
  70. metaxy-0.0.0/src/metaxy/models/feature_spec.py +105 -0
  71. metaxy-0.0.0/src/metaxy/models/field.py +25 -0
  72. metaxy-0.0.0/src/metaxy/models/plan.py +155 -0
  73. metaxy-0.0.0/src/metaxy/models/types.py +157 -0
  74. metaxy-0.0.0/src/metaxy/py.typed +0 -0
metaxy-0.0.0/PKG-INFO ADDED
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.3
2
+ Name: metaxy
3
+ Version: 0.0.0
4
+ Summary: Add your description here
5
+ Author: Daniel Gafni
6
+ Author-email: Daniel Gafni <danielgafni16@gmail.com>
7
+ Requires-Dist: cyclopts==4.0.0b1
8
+ Requires-Dist: narwhals>=2.9.0
9
+ Requires-Dist: polars>=1.33.1
10
+ Requires-Dist: polars-hash>=0.5.1
11
+ Requires-Dist: pydantic>=2.11.9
12
+ Requires-Dist: pydantic-settings>=2.11.0
13
+ Requires-Dist: pyyaml>=6.0.0
14
+ Requires-Dist: tomli>=2.3.0
15
+ Requires-Dist: rich>=13.0.0
16
+ Requires-Dist: pygraphviz>=1.14 ; extra == 'graphviz'
17
+ Requires-Dist: pyarrow>=18.0.0 ; extra == 'ibis'
18
+ Requires-Dist: ibis-framework>=11.0.0 ; extra == 'ibis'
19
+ Requires-Dist: mermaid-py>=0.8.0 ; extra == 'mermaid'
20
+ Requires-Dist: sqlmodel>=0.0.27 ; extra == 'sqlmodel'
21
+ Requires-Python: >=3.10
22
+ Provides-Extra: graphviz
23
+ Provides-Extra: ibis
24
+ Provides-Extra: mermaid
25
+ Provides-Extra: sqlmodel
26
+ Description-Content-Type: text/markdown
27
+
28
+ # Metaxy
29
+
30
+ ## Overview
31
+
32
+ **Metaxy** is a declarative metadata management system for multi-modal data and machine learning pipelines. Metaxy allows statically defining graphs of features with versioned **fields** -- logical components like `audio`, `frames` for `.mp4` files and **columns** for feature metadata stored in Metaxy's metadata store. With this in place, Metaxy provides:
33
+
34
+ - **Sample-level data versioning**: Track field and column lineage, compute versions as hashes of upstream versions for each sample
35
+ - **Incremental computation**: Automatically detect which samples need recomputation when upstream fields change
36
+ - **Migration system**: When feature code changes without changing outputs (refactoring, graph restructuring), Metaxy can reconcile metadata versions without recomputing expensive features
37
+ - **Storage flexibility**: Pluggable backends (DuckDB, ClickHouse, PostgreSQL, SQLite, in-memory) with native SQL optimization where possible
38
+ - **Big Metadata**: Metaxy is designed with large-scale distributed systems in mind and can handle large amounts of metadata efficiently.
39
+
40
+ Metaxy is designed for production data and ML systems where data and features evolve over time, and you need to track what changed, why, and whether expensive recomputation is actually necessary.
41
+
42
+ ## Data Versioning
43
+
44
+ To demonstrate how Metaxy handles data versioning, let's consider a video processing pipeline:
45
+
46
+ ```py
47
+ from metaxy import (
48
+ Feature,
49
+ FeatureDep,
50
+ FeatureKey,
51
+ FeatureSpec,
52
+ FieldDep,
53
+ FieldKey,
54
+ FieldSpec,
55
+ )
56
+
57
+
58
+ class Video(
59
+ Feature,
60
+ spec=FeatureSpec(
61
+ key=FeatureKey(["example", "video"]),
62
+ deps=None, # Root feature
63
+ fields=[
64
+ FieldSpec(
65
+ key=FieldKey(["audio"]),
66
+ code_version=1,
67
+ ),
68
+ FieldSpec(
69
+ key=FieldKey(["frames"]),
70
+ code_version=1,
71
+ ),
72
+ ],
73
+ ),
74
+ ):
75
+ """Video metadata feature (root)."""
76
+
77
+ frames: int
78
+ duration: float
79
+ size: int
80
+
81
+
82
+ class Crop(
83
+ Feature,
84
+ spec=FeatureSpec(
85
+ key=FeatureKey(["example", "crop"]),
86
+ deps=[FeatureDep(key=Video.spec.key)],
87
+ fields=[
88
+ FieldSpec(
89
+ key=FieldKey(["audio"]),
90
+ code_version=1,
91
+ deps=[
92
+ FieldDep(
93
+ feature_key=Video.spec.key,
94
+ fields=[FieldKey(["audio"])],
95
+ )
96
+ ],
97
+ ),
98
+ FieldSpec(
99
+ key=FieldKey(["frames"]),
100
+ code_version=1,
101
+ deps=[
102
+ FieldDep(
103
+ feature_key=Video.spec.key,
104
+ fields=[FieldKey(["frames"])],
105
+ )
106
+ ],
107
+ ),
108
+ ],
109
+ ),
110
+ ):
111
+ pass # omit columns for the sake of simplicity
112
+
113
+
114
+ class FaceDetection(
115
+ Feature,
116
+ spec=FeatureSpec(
117
+ key=FeatureKey(["example", "face_detection"]),
118
+ deps=[
119
+ FeatureDep(
120
+ key=Crop.spec.key,
121
+ )
122
+ ],
123
+ fields=[
124
+ FieldSpec(
125
+ key=FieldKey(["faces"]),
126
+ code_version=1,
127
+ deps=[
128
+ FieldDep(
129
+ feature_key=Crop.spec.key,
130
+ fields=[FieldKey(["frames"])],
131
+ )
132
+ ],
133
+ ),
134
+ ],
135
+ ),
136
+ ):
137
+ pass
138
+
139
+
140
+ class SpeechToText(
141
+ Feature,
142
+ spec=FeatureSpec(
143
+ key=FeatureKey(["example", "stt"]),
144
+ deps=[
145
+ FeatureDep(
146
+ key=Video.spec.key,
147
+ )
148
+ ],
149
+ fields=[
150
+ FieldSpec(
151
+ key=FieldKey(["transcription"]),
152
+ code_version=1,
153
+ deps=[
154
+ FieldDep(
155
+ feature_key=Video.spec.key,
156
+ fields=[FieldKey(["audio"])],
157
+ )
158
+ ],
159
+ ),
160
+ ],
161
+ ),
162
+ ):
163
+ pass
164
+ ```
165
+
166
+ When provided with this Python module, `metaxy graph render --format mermaid` (that's handy, right?) produces the following graph:
167
+
168
+ ```mermaid
169
+ ---
170
+ title: Feature Graph
171
+ ---
172
+ flowchart TB
173
+ %% Snapshot version: 8468950d
174
+ %%{init: {'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'themeVariables': {'fontSize': '14px'}}}%%
175
+ example_video["<div style="text-align:left"><b>example/video</b><br/><small>(v: bc9ca835)</small><br/><font
176
+ color="#999">---</font><br/>• audio <small>(v: 22742381)</small><br/>• frames <small>(v: 794116a9)</small></div>"]
177
+ example_crop["<div style="text-align:left"><b>example/crop</b><br/><small>(v: 3ac04df8)</small><br/><font
178
+ color="#999">---</font><br/>• audio <small>(v: 76c8bdc9)</small><br/>• frames <small>(v: abc79017)</small></div>"]
179
+ example_face_detection["<div style="text-align:left"><b>example/face_detection</b><br/><small>(v: 1ac83b07)</small><br/><font
180
+ color="#999">---</font><br/>• faces <small>(v: 2d75f0bd)</small></div>"]
181
+ example_stt["<div style="text-align:left"><b>example/stt</b><br/><small>(v: c83a754a)</small><br/><font
182
+ color="#999">---</font><br/>• transcription <small>(v: ac412b3c)</small></div>"]
183
+ example_video --> example_crop
184
+ example_crop --> example_face_detection
185
+ example_video --> example_stt
186
+ ```
187
+
188
+ Now imagine the `audio` logical field (don't mix up with metadata columns!) of the very first `Video` feature has been changed. Perhaps it has been cleaned or denoised.
189
+
190
+ ```diff
191
+ key=FeatureKey(["example", "video"]),
192
+ deps=None, # Root feature
193
+ fields=[
194
+ FieldSpec(
195
+ key=FieldKey(["audio"]),
196
+ - code_version=1,
197
+ + code_version=2,
198
+ ),
199
+ ```
200
+
201
+ In this case we'd typically want to recompute the downstream `Crop` and `SpeechToText` features, but not the `FaceDetection` feature, since it only depends on `frames` and not on `audio`.
202
+
203
+ `metaxy graph diff` reveals exactly that:
204
+
205
+ ```mermaid
206
+ ---
207
+ title: Merged Graph Diff
208
+ ---
209
+ flowchart TB
210
+ %%{init: {'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'themeVariables': {'fontSize': '14px'}}}%%
211
+
212
+ example_video["<div style="text-align:left"><b>example/video</b><br/><font color="#CC0000">bc9ca8</font> → <font
213
+ color="#00AA00">6db302</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">audio</font> (<font
214
+ color="#CC0000">227423</font> → <font color="#00AA00">09c839</font>)<br/>- frames (794116)</div>"]
215
+ style example_video stroke:#FFA500,stroke-width:3px
216
+ example_crop["<div style="text-align:left"><b>example/crop</b><br/><font color="#CC0000">3ac04d</font> → <font
217
+ color="#00AA00">54dc7f</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">audio</font> (<font
218
+ color="#CC0000">76c8bd</font> → <font color="#00AA00">f3130c</font>)<br/>- frames (abc790)</div>"]
219
+ style example_crop stroke:#FFA500,stroke-width:3px
220
+ example_face_detection["<div style="text-align:left"><b>example/face_detection</b><br/>1ac83b<br/><font
221
+ color="#999">---</font><br/>- faces (2d75f0)</div>"]
222
+ example_stt["<div style="text-align:left"><b>example/stt</b><br/><font color="#CC0000">c83a75</font> → <font
223
+ color="#00AA00">066d34</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">transcription</font> (<font
224
+ color="#CC0000">ac412b</font> → <font color="#00AA00">058410</font>)</div>"]
225
+ style example_stt stroke:#FFA500,stroke-width:3px
226
+
227
+ example_video --> example_crop
228
+ example_crop --> example_face_detection
229
+ example_video --> example_stt
230
+ ```
231
+
232
+ The versions of the `frames` fields throughout the graph, as well as the whole `FaceDetection` feature, stayed the same!
233
+
234
+ We can use Metaxy's static graph analysis to identify which features need to be recomputed when a new version of a feature is introduced. In addition to feature- and field-level versions, Metaxy can also compute a sample-level version (which may differ for each sample, even in a dataset with a million samples) ahead of computation across the whole graph. This enables exciting features such as processing cost prediction and automatic migrations for metadata.
235
+
236
+ ## Development
237
+
238
+ Setting up the environment:
239
+
240
+ ```shell
241
+ uv sync --all-extras
242
+ uv run prek install
243
+ ```
244
+
245
+ ## Examples
246
+
247
+ See [examples](examples/README.md).
metaxy-0.0.0/README.md ADDED
@@ -0,0 +1,220 @@
1
+ # Metaxy
2
+
3
+ ## Overview
4
+
5
+ **Metaxy** is a declarative metadata management system for multi-modal data and machine learning pipelines. Metaxy allows statically defining graphs of features with versioned **fields** -- logical components like `audio`, `frames` for `.mp4` files and **columns** for feature metadata stored in Metaxy's metadata store. With this in place, Metaxy provides:
6
+
7
+ - **Sample-level data versioning**: Track field and column lineage, compute versions as hashes of upstream versions for each sample
8
+ - **Incremental computation**: Automatically detect which samples need recomputation when upstream fields change
9
+ - **Migration system**: When feature code changes without changing outputs (refactoring, graph restructuring), Metaxy can reconcile metadata versions without recomputing expensive features
10
+ - **Storage flexibility**: Pluggable backends (DuckDB, ClickHouse, PostgreSQL, SQLite, in-memory) with native SQL optimization where possible
11
+ - **Big Metadata**: Metaxy is designed with large-scale distributed systems in mind and can handle large amounts of metadata efficiently.
12
+
13
+ Metaxy is designed for production data and ML systems where data and features evolve over time, and you need to track what changed, why, and whether expensive recomputation is actually necessary.
14
+
15
+ ## Data Versioning
16
+
17
+ To demonstrate how Metaxy handles data versioning, let's consider a video processing pipeline:
18
+
19
+ ```py
20
+ from metaxy import (
21
+ Feature,
22
+ FeatureDep,
23
+ FeatureKey,
24
+ FeatureSpec,
25
+ FieldDep,
26
+ FieldKey,
27
+ FieldSpec,
28
+ )
29
+
30
+
31
+ class Video(
32
+ Feature,
33
+ spec=FeatureSpec(
34
+ key=FeatureKey(["example", "video"]),
35
+ deps=None, # Root feature
36
+ fields=[
37
+ FieldSpec(
38
+ key=FieldKey(["audio"]),
39
+ code_version=1,
40
+ ),
41
+ FieldSpec(
42
+ key=FieldKey(["frames"]),
43
+ code_version=1,
44
+ ),
45
+ ],
46
+ ),
47
+ ):
48
+ """Video metadata feature (root)."""
49
+
50
+ frames: int
51
+ duration: float
52
+ size: int
53
+
54
+
55
+ class Crop(
56
+ Feature,
57
+ spec=FeatureSpec(
58
+ key=FeatureKey(["example", "crop"]),
59
+ deps=[FeatureDep(key=Video.spec.key)],
60
+ fields=[
61
+ FieldSpec(
62
+ key=FieldKey(["audio"]),
63
+ code_version=1,
64
+ deps=[
65
+ FieldDep(
66
+ feature_key=Video.spec.key,
67
+ fields=[FieldKey(["audio"])],
68
+ )
69
+ ],
70
+ ),
71
+ FieldSpec(
72
+ key=FieldKey(["frames"]),
73
+ code_version=1,
74
+ deps=[
75
+ FieldDep(
76
+ feature_key=Video.spec.key,
77
+ fields=[FieldKey(["frames"])],
78
+ )
79
+ ],
80
+ ),
81
+ ],
82
+ ),
83
+ ):
84
+ pass # omit columns for the sake of simplicity
85
+
86
+
87
+ class FaceDetection(
88
+ Feature,
89
+ spec=FeatureSpec(
90
+ key=FeatureKey(["example", "face_detection"]),
91
+ deps=[
92
+ FeatureDep(
93
+ key=Crop.spec.key,
94
+ )
95
+ ],
96
+ fields=[
97
+ FieldSpec(
98
+ key=FieldKey(["faces"]),
99
+ code_version=1,
100
+ deps=[
101
+ FieldDep(
102
+ feature_key=Crop.spec.key,
103
+ fields=[FieldKey(["frames"])],
104
+ )
105
+ ],
106
+ ),
107
+ ],
108
+ ),
109
+ ):
110
+ pass
111
+
112
+
113
+ class SpeechToText(
114
+ Feature,
115
+ spec=FeatureSpec(
116
+ key=FeatureKey(["example", "stt"]),
117
+ deps=[
118
+ FeatureDep(
119
+ key=Video.spec.key,
120
+ )
121
+ ],
122
+ fields=[
123
+ FieldSpec(
124
+ key=FieldKey(["transcription"]),
125
+ code_version=1,
126
+ deps=[
127
+ FieldDep(
128
+ feature_key=Video.spec.key,
129
+ fields=[FieldKey(["audio"])],
130
+ )
131
+ ],
132
+ ),
133
+ ],
134
+ ),
135
+ ):
136
+ pass
137
+ ```
138
+
139
+ When provided with this Python module, `metaxy graph render --format mermaid` (that's handy, right?) produces the following graph:
140
+
141
+ ```mermaid
142
+ ---
143
+ title: Feature Graph
144
+ ---
145
+ flowchart TB
146
+ %% Snapshot version: 8468950d
147
+ %%{init: {'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'themeVariables': {'fontSize': '14px'}}}%%
148
+ example_video["<div style="text-align:left"><b>example/video</b><br/><small>(v: bc9ca835)</small><br/><font
149
+ color="#999">---</font><br/>• audio <small>(v: 22742381)</small><br/>• frames <small>(v: 794116a9)</small></div>"]
150
+ example_crop["<div style="text-align:left"><b>example/crop</b><br/><small>(v: 3ac04df8)</small><br/><font
151
+ color="#999">---</font><br/>• audio <small>(v: 76c8bdc9)</small><br/>• frames <small>(v: abc79017)</small></div>"]
152
+ example_face_detection["<div style="text-align:left"><b>example/face_detection</b><br/><small>(v: 1ac83b07)</small><br/><font
153
+ color="#999">---</font><br/>• faces <small>(v: 2d75f0bd)</small></div>"]
154
+ example_stt["<div style="text-align:left"><b>example/stt</b><br/><small>(v: c83a754a)</small><br/><font
155
+ color="#999">---</font><br/>• transcription <small>(v: ac412b3c)</small></div>"]
156
+ example_video --> example_crop
157
+ example_crop --> example_face_detection
158
+ example_video --> example_stt
159
+ ```
160
+
161
+ Now imagine the `audio` logical field (don't mix up with metadata columns!) of the very first `Video` feature has been changed. Perhaps it has been cleaned or denoised.
162
+
163
+ ```diff
164
+ key=FeatureKey(["example", "video"]),
165
+ deps=None, # Root feature
166
+ fields=[
167
+ FieldSpec(
168
+ key=FieldKey(["audio"]),
169
+ - code_version=1,
170
+ + code_version=2,
171
+ ),
172
+ ```
173
+
174
+ In this case we'd typically want to recompute the downstream `Crop` and `SpeechToText` features, but not the `FaceDetection` feature, since it only depends on `frames` and not on `audio`.
175
+
176
+ `metaxy graph diff` reveals exactly that:
177
+
178
+ ```mermaid
179
+ ---
180
+ title: Merged Graph Diff
181
+ ---
182
+ flowchart TB
183
+ %%{init: {'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'themeVariables': {'fontSize': '14px'}}}%%
184
+
185
+ example_video["<div style="text-align:left"><b>example/video</b><br/><font color="#CC0000">bc9ca8</font> → <font
186
+ color="#00AA00">6db302</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">audio</font> (<font
187
+ color="#CC0000">227423</font> → <font color="#00AA00">09c839</font>)<br/>- frames (794116)</div>"]
188
+ style example_video stroke:#FFA500,stroke-width:3px
189
+ example_crop["<div style="text-align:left"><b>example/crop</b><br/><font color="#CC0000">3ac04d</font> → <font
190
+ color="#00AA00">54dc7f</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">audio</font> (<font
191
+ color="#CC0000">76c8bd</font> → <font color="#00AA00">f3130c</font>)<br/>- frames (abc790)</div>"]
192
+ style example_crop stroke:#FFA500,stroke-width:3px
193
+ example_face_detection["<div style="text-align:left"><b>example/face_detection</b><br/>1ac83b<br/><font
194
+ color="#999">---</font><br/>- faces (2d75f0)</div>"]
195
+ example_stt["<div style="text-align:left"><b>example/stt</b><br/><font color="#CC0000">c83a75</font> → <font
196
+ color="#00AA00">066d34</font><br/><font color="#999">---</font><br/>- <font color="#FFAA00">transcription</font> (<font
197
+ color="#CC0000">ac412b</font> → <font color="#00AA00">058410</font>)</div>"]
198
+ style example_stt stroke:#FFA500,stroke-width:3px
199
+
200
+ example_video --> example_crop
201
+ example_crop --> example_face_detection
202
+ example_video --> example_stt
203
+ ```
204
+
205
+ The versions of the `frames` fields throughout the graph, as well as the whole `FaceDetection` feature, stayed the same!
206
+
207
+ We can use Metaxy's static graph analysis to identify which features need to be recomputed when a new version of a feature is introduced. In addition to feature- and field-level versions, Metaxy can also compute a sample-level version (which may differ for each sample, even in a dataset with a million samples) ahead of computation across the whole graph. This enables exciting features such as processing cost prediction and automatic migrations for metadata.
208
+
209
+ ## Development
210
+
211
+ Setting up the environment:
212
+
213
+ ```shell
214
+ uv sync --all-extras
215
+ uv run prek install
216
+ ```
217
+
218
+ ## Examples
219
+
220
+ See [examples](examples/README.md).
@@ -0,0 +1,112 @@
1
+ [project]
2
+ name = "metaxy"
3
+ version = "0.0.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Daniel Gafni", email = "danielgafni16@gmail.com" }
8
+ ]
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "cyclopts==4.0.0b1",
12
+ "narwhals>=2.9.0",
13
+ "polars>=1.33.1",
14
+ "polars-hash>=0.5.1",
15
+ "pydantic>=2.11.9",
16
+ "pydantic-settings>=2.11.0",
17
+ "pyyaml>=6.0.0",
18
+ "tomli>=2.3.0",
19
+ "rich>=13.0.0",
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ ibis = [
24
+ "pyarrow>=18.0.0",
25
+ "ibis-framework>=11.0.0",
26
+ ]
27
+ mermaid = [
28
+ "mermaid-py>=0.8.0",
29
+ ]
30
+ graphviz = [
31
+ "pygraphviz>=1.14",
32
+ ]
33
+ sqlmodel = [
34
+ "sqlmodel>=0.0.27",
35
+ ]
36
+
37
+ [project.scripts]
38
+ metaxy = "metaxy.cli.app:main"
39
+
40
+ [tool.pyrefly]
41
+ project-includes = ["src", "tests"]
42
+ # search-path = ["src"]
43
+ project-excludes = [
44
+ "**/node_modules",
45
+ "**/__pycache__",
46
+ "**/*venv/**/*",
47
+ ]
48
+
49
+ [build-system]
50
+ requires = ["uv_build>=0.8.14,<0.9.0"]
51
+ build-backend = "uv_build"
52
+
53
+ [dependency-groups]
54
+ dev = [
55
+ "examples",
56
+ "ipython>=8.37.0",
57
+ "duckdb>=1.4.1",
58
+ "pyrefly>=0.34.0",
59
+ "pytest>=8.4.2",
60
+ "pytest-cases>=3.9.1",
61
+ "ruff>=0.13.1",
62
+ "syrupy>=5.0.0",
63
+ "ibis-framework[duckdb,sqlite,clickhouse]>=11.0.0",
64
+ "basedpyright>=1.32.1",
65
+ "vulture>=2.14",
66
+ "prek>=0.2.11",
67
+ ]
68
+
69
+ [tool.ruff]
70
+ target-version = "py310"
71
+
72
+ [tool.ruff.lint]
73
+ extend-select = [
74
+ "I",
75
+ "TID252",
76
+ "UP"
77
+ ]
78
+
79
+ [tool.ruff.lint.flake8-tidy-imports]
80
+ # Ban certain modules from being imported at module level, instead requiring
81
+ # that they're imported lazily (e.g., within a function definition).
82
+ banned-module-level-imports = ["ibis", "duckdb"]
83
+
84
+ [tool.uv.workspace]
85
+ members = [
86
+ "examples",
87
+ ]
88
+
89
+ [tool.uv.sources]
90
+ examples = { workspace = true }
91
+
92
+ [tool.vulture]
93
+ exclude = [
94
+ "src/metaxy/_testing.py",
95
+ "src/metaxy/cli/",
96
+ "examples/**/feature*.py",
97
+ ]
98
+ min_confidence = 60
99
+
100
+ [tool.pyright]
101
+ reportAny = "none"
102
+ reportExplicitAny = "none"
103
+ include = [
104
+ "src/metaxy",
105
+ "tests",
106
+ "examples",
107
+ ]
108
+ exclude = [
109
+ "**/node_modules",
110
+ "**/__pycache__",
111
+ "**/*venv/**/*",
112
+ ]
@@ -0,0 +1,61 @@
1
+ from metaxy.config import MetaxyConfig, StoreConfig
2
+ from metaxy.entrypoints import (
3
+ load_features,
4
+ load_module_entrypoint,
5
+ load_package_entrypoints,
6
+ )
7
+ from metaxy.metadata_store import (
8
+ InMemoryMetadataStore,
9
+ MetadataStore,
10
+ )
11
+ from metaxy.migrations import (
12
+ BaseOperation,
13
+ CustomMigration,
14
+ DataVersionReconciliation,
15
+ DiffMigration,
16
+ FullGraphMigration,
17
+ MetadataBackfill,
18
+ Migration,
19
+ MigrationExecutor,
20
+ MigrationResult,
21
+ SystemTableStorage,
22
+ detect_migration,
23
+ )
24
+ from metaxy.models.feature import Feature, FeatureGraph, get_feature_by_key, graph
25
+ from metaxy.models.feature_spec import FeatureDep, FeatureSpec
26
+ from metaxy.models.field import FieldDep, FieldSpec, SpecialFieldDep
27
+ from metaxy.models.types import FeatureDepMetadata, FeatureKey, FieldKey
28
+
29
# Public API re-exported from the package root.
#
# Every name listed here must be bound by one of the imports at the top of
# this module: `from metaxy import *` raises AttributeError for any `__all__`
# entry that is not defined in the module namespace.
#
# NOTE(review): "load_config_entrypoints" used to be listed here but is not
# imported into this module, so it has been dropped. If it is meant to be
# public, re-add it together with the matching import.
__all__ = [
    # metaxy.models
    "Feature",
    "FeatureGraph",
    "graph",
    "get_feature_by_key",
    "FeatureDep",
    "FeatureDepMetadata",
    "FeatureSpec",
    "FieldDep",
    "FieldSpec",
    "SpecialFieldDep",
    "FeatureKey",
    "FieldKey",
    # metaxy.metadata_store
    "MetadataStore",
    "InMemoryMetadataStore",
    # metaxy.entrypoints
    "load_features",
    "load_module_entrypoint",
    "load_package_entrypoints",
    # metaxy.migrations
    "Migration",
    "DiffMigration",
    "FullGraphMigration",
    "CustomMigration",
    "MigrationResult",
    "MigrationExecutor",
    "SystemTableStorage",
    "BaseOperation",
    "DataVersionReconciliation",
    "MetadataBackfill",
    "detect_migration",
    # metaxy.config
    "MetaxyConfig",
    "StoreConfig",
]