metaxy-0.0.1.dev3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0

metaxy/models/fields_mapping.py (new file):

```python
"""Field mapping system for automatic field dependency resolution.

This module provides a flexible system for defining how fields map to upstream
dependencies, supporting both automatic mapping patterns and explicit configurations.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from enum import Enum
from typing import TYPE_CHECKING, Literal

from pydantic import BaseModel, ConfigDict, TypeAdapter
from pydantic import Field as PydanticField
from typing_extensions import Self

from metaxy.models.types import (
    CoercibleToFieldKey,
    FeatureKey,
    FieldKey,
    ValidatedFieldKeyAdapter,
)

if TYPE_CHECKING:
    from metaxy.models.feature_spec import FeatureSpec


class FieldsMappingType(str, Enum):
    """Type of fields mapping between a field key and the upstream field keys."""

    DEFAULT = "default"
    SPECIFIC = "specific"
    ALL = "all"
    NONE = "none"


class FieldsMappingResolutionContext(BaseModel):
    """Context for resolving field mappings.

    This contains all the information needed to resolve field dependencies,
    including the upstream feature being mapped against.
    """

    model_config = ConfigDict(frozen=True)

    field_key: FieldKey
    """The downstream field key being resolved."""

    upstream_feature: FeatureSpec
    """The upstream feature spec being resolved against."""

    @property
    def upstream_feature_key(self) -> FeatureKey:
        """Get the upstream feature key."""
        return self.upstream_feature.key

    @property
    def upstream_feature_fields(self) -> set[FieldKey]:
        """Get the set of field keys from the upstream feature."""
        return {field.key for field in self.upstream_feature.fields}


class BaseFieldsMapping(BaseModel, ABC):  # pyright: ignore[reportUnsafeMultipleInheritance]
    """Base class for field mapping configurations.

    Field mappings define how a field automatically resolves its dependencies
    based on upstream feature fields.
    """

    model_config = ConfigDict(frozen=True)

    @abstractmethod
    def resolve_field_deps(
        self,
        context: FieldsMappingResolutionContext,
    ) -> set[FieldKey]:
        """Resolve the automatic field mapping to an explicit set of field keys.

        This method must be implemented by concrete subclasses.

        Args:
            context: The resolution context containing field key and upstream feature.

        Returns:
            Set of [FieldKey][metaxy.models.types.FieldKey] instances for matching fields.
        """
        raise NotImplementedError


class SpecificFieldsMapping(BaseFieldsMapping):
    """Field mapping that explicitly depends on specific upstream fields."""

    type: Literal[FieldsMappingType.SPECIFIC] = FieldsMappingType.SPECIFIC
    mapping: dict[FieldKey, set[FieldKey]]

    def resolve_field_deps(
        self,
        context: FieldsMappingResolutionContext,
    ) -> set[FieldKey]:
        desired_upstream_fields = self.mapping.get(context.field_key, set())
        return desired_upstream_fields & context.upstream_feature_fields


class AllFieldsMapping(BaseFieldsMapping):
    """Field mapping that explicitly depends on all upstream fields."""

    type: Literal[FieldsMappingType.ALL] = FieldsMappingType.ALL

    def resolve_field_deps(
        self,
        context: FieldsMappingResolutionContext,
    ) -> set[FieldKey]:
        return context.upstream_feature_fields


class NoneFieldsMapping(BaseFieldsMapping):
    """Field mapping that never matches any upstream fields."""

    type: Literal[FieldsMappingType.NONE] = FieldsMappingType.NONE

    def resolve_field_deps(
        self,
        context: FieldsMappingResolutionContext,
    ) -> set[FieldKey]:
        return set()


class DefaultFieldsMapping(BaseFieldsMapping):
    """Default automatic field mapping configuration.

    When used, automatically maps fields to matching upstream fields based on field keys.

    Attributes:
        match_suffix: If True, allows suffix matching (e.g., "french" matches "audio/french").
        exclude_fields: List of field keys to exclude from auto-mapping.
    """

    type: Literal[FieldsMappingType.DEFAULT] = FieldsMappingType.DEFAULT
    match_suffix: bool = False
    exclude_fields: list[FieldKey] = PydanticField(default_factory=list)

    def resolve_field_deps(
        self,
        context: FieldsMappingResolutionContext,
    ) -> set[FieldKey]:
        res: set[FieldKey] = set()

        for upstream_field_key in context.upstream_feature_fields:
            # Skip excluded fields
            if upstream_field_key in self.exclude_fields:
                continue

            # Check for exact match
            if upstream_field_key == context.field_key:
                res.add(upstream_field_key)
            # Check for suffix match if enabled
            elif self.match_suffix and self._is_suffix_match(
                context.field_key, upstream_field_key
            ):
                res.add(upstream_field_key)

        # If no fields matched, return ALL fields from this upstream feature
        # (excluding any explicitly excluded fields)
        if not res:
            for upstream_field_key in context.upstream_feature_fields:
                if upstream_field_key not in self.exclude_fields:
                    res.add(upstream_field_key)

        return res

    def _is_suffix_match(
        self, field_key: FieldKey, upstream_field_key: FieldKey
    ) -> bool:
        """Check if field_key is a suffix of upstream_field_key.

        For hierarchical keys like "audio/french", this checks if "french"
        matches the suffix.

        Args:
            field_key: The downstream field key for which to resolve dependencies.
            upstream_field_key: The upstream field key to test against.

        Returns:
            True if field_key is a suffix of upstream_field_key.
        """
        # For single-part keys, check if it's the last part of a multi-part key
        if len(field_key.parts) == 1 and len(upstream_field_key.parts) > 1:
            return field_key.parts[0] == upstream_field_key.parts[-1]

        # For multi-part keys, check if all parts match as a suffix
        if len(field_key.parts) <= len(upstream_field_key.parts):
            return upstream_field_key.parts[-len(field_key.parts) :] == field_key.parts

        return False


class FieldsMapping(BaseModel):
    """Serializable wrapper around a concrete field mapping configuration.

    Field mappings define how a field automatically resolves its dependencies
    based on upstream feature fields. This is separate from explicit field
    dependencies, which are defined directly.
    """

    model_config = ConfigDict(frozen=True)

    mapping: (
        AllFieldsMapping
        | SpecificFieldsMapping
        | NoneFieldsMapping
        | DefaultFieldsMapping
    ) = PydanticField(..., discriminator="type")

    def resolve_field_deps(
        self,
        context: FieldsMappingResolutionContext,
    ) -> set[FieldKey]:
        """Resolve field dependencies based on upstream feature fields.

        Invokes the wrapped mapping to resolve dependencies.

        Args:
            context: The resolution context containing field key and upstream feature.

        Returns:
            Set of [FieldKey][metaxy.models.types.FieldKey] instances for matching fields.
        """
        return self.mapping.resolve_field_deps(context)

    @classmethod
    def default(
        cls,
        *,
        match_suffix: bool = False,
        exclude_fields: list[FieldKey] | None = None,
    ) -> Self:
        """Create a default field mapping configuration.

        Args:
            match_suffix: If True, allows suffix matching (e.g., "french" matches "audio/french").
            exclude_fields: List of field keys to exclude from auto-mapping.

        Returns:
            Configured FieldsMapping instance.
        """
        return cls(
            mapping=DefaultFieldsMapping(
                match_suffix=match_suffix,
                exclude_fields=exclude_fields or [],
            )
        )

    @classmethod
    def specific(
        cls, mapping: dict[CoercibleToFieldKey, set[CoercibleToFieldKey]]
    ) -> Self:
        """Create a field mapping that maps downstream field keys to specific upstream field keys.

        Args:
            mapping: Mapping of downstream field keys to sets of upstream field keys.
                Keys and values can be strings, sequences, or FieldKey instances.

        Returns:
            Configured FieldsMapping instance.
        """
        # Validate and coerce the mapping keys and values
        validated_mapping: dict[FieldKey, set[FieldKey]] = {}
        for key, value_set in mapping.items():
            validated_key = ValidatedFieldKeyAdapter.validate_python(key)
            validated_values = {
                ValidatedFieldKeyAdapter.validate_python(v) for v in value_set
            }
            validated_mapping[validated_key] = validated_values

        return cls(mapping=SpecificFieldsMapping(mapping=validated_mapping))

    @classmethod
    def all(cls) -> Self:
        """Create a field mapping that explicitly depends on all upstream fields.

        Returns:
            Configured FieldsMapping instance.

        Examples:
            >>> # Use in field specifications
            >>> FieldSpec(
            ...     key="combined",
            ...     fields_mapping=FieldsMapping.all()
            ... )
        """
        return cls(mapping=AllFieldsMapping())

    @classmethod
    def none(cls) -> Self:
        """Create a field mapping that explicitly depends on no upstream fields.

        This is typically useful when explicitly defining [FieldSpec.deps][metaxy.models.field.FieldSpec] instead.

        Returns:
            Configured FieldsMapping instance.
        """
        return cls(mapping=NoneFieldsMapping())


FieldsMappingAdapter = TypeAdapter(
    AllFieldsMapping | SpecificFieldsMapping | NoneFieldsMapping | DefaultFieldsMapping
)
```
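
To make the mapping variants concrete, here is a brief sketch of how the classmethod constructors above might be used. Only `FieldsMapping` and its constructors come from the module; `is_suffix_match` is a hypothetical standalone restatement of the rule in `DefaultFieldsMapping._is_suffix_match`, written on plain tuples for illustration, and the string keys rely on the coercion that `FieldsMapping.specific` documents.

```python
from metaxy.models.fields_mapping import FieldsMapping

# The four mapping variants, built via the classmethod constructors above.
FieldsMapping.default(match_suffix=True)                  # wraps DefaultFieldsMapping
FieldsMapping.specific({"transcript": {"audio/french"}})  # wraps SpecificFieldsMapping
FieldsMapping.all()                                       # wraps AllFieldsMapping
FieldsMapping.none()                                      # wraps NoneFieldsMapping


# Hypothetical restatement of the suffix rule: a downstream key matches an
# upstream key when its parts form a suffix of the upstream key's parts.
def is_suffix_match(field: tuple[str, ...], upstream: tuple[str, ...]) -> bool:
    return len(field) <= len(upstream) and upstream[-len(field):] == field


assert is_suffix_match(("french",), ("audio", "french"))
assert not is_suffix_match(("english",), ("audio", "french"))
```

Note the fallback in `DefaultFieldsMapping.resolve_field_deps`: when neither an exact nor a suffix match is found, it depends on all non-excluded upstream fields rather than none.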

metaxy/models/filter_expression.py (new file):

````python
from __future__ import annotations

from typing import Any, NamedTuple

import narwhals as nw
import sqlglot
from pydantic import field_serializer, model_validator
from sqlglot import exp
from sqlglot.errors import ParseError

from metaxy.models.bases import FrozenBaseModel

LiteralValue = bool | int | float | str | None


class FilterParseError(ValueError):
    """Raised when a filter string cannot be parsed into a supported expression."""


class OperandInfo(NamedTuple):
    expr: nw.Expr
    is_literal: bool
    literal_value: LiteralValue
    is_column: bool


class NarwhalsFilter(FrozenBaseModel):
    """Pydantic model for serializable Narwhals filter expressions."""

    expression: sqlglot.exp.Expression
    source: str | None = None

    model_config = {
        "arbitrary_types_allowed": True,
        "extra": "forbid",
        "frozen": True,
    }

    @model_validator(mode="before")
    @classmethod
    def _parse_expression_from_string(cls, data: Any) -> Any:
        if isinstance(data, str):
            expression = _parse_to_sqlglot_expression(data)
            return {"expression": expression, "source": data}
        return data

    @field_serializer("expression")
    def _serialize_expression(self, expression: sqlglot.exp.Expression) -> str:
        return expression.sql()

    def to_expr(self) -> nw.Expr:
        """Convert the stored expression into a Narwhals ``Expr``."""
        return _expression_to_narwhals(self.expression)


def parse_filter_string(filter_string: str) -> nw.Expr:
    """Parse a SQL WHERE-like string into a Narwhals expression.

    The parser understands SQL `WHERE` clauses composed of comparison operators, logical operators, parentheses,
    dotted identifiers, and literal values (strings, numbers, booleans, ``NULL``).

    This functionality is implemented with [SQLGlot](https://sqlglot.com/).

    Example:
        ```python
        parse_filter_string("NOT (status = 'deleted') AND deleted_at = NULL")
        # Returns: (~(nw.col("status") == "deleted")) & nw.col("deleted_at").is_null()
        ```
    """
    return NarwhalsFilter.model_validate(filter_string).to_expr()


def _parse_to_sqlglot_expression(filter_string: str) -> sqlglot.exp.Expression:
    if not filter_string or not filter_string.strip():
        raise FilterParseError("Filter string cannot be empty.")

    try:
        parsed = sqlglot.parse_one(filter_string)
    except ParseError as exc:
        msg = f"Failed to parse filter string: {exc}"
        raise FilterParseError(msg) from exc

    if parsed is None:
        raise FilterParseError(
            f"Failed to parse filter string into an expression for {filter_string}"
        )

    return parsed


def _expression_to_narwhals(node: exp.Expression) -> nw.Expr:
    """Convert a SQLGlot expression AST node to a Narwhals expression."""
    node = _strip_parens(node)

    # Logical operators
    if isinstance(node, exp.Not):
        operand = node.this
        if operand is None:
            raise FilterParseError("NOT operator requires an operand.")
        return ~_expression_to_narwhals(operand)

    if isinstance(node, exp.And):
        return _expression_to_narwhals(node.this) & _expression_to_narwhals(
            node.expression
        )

    if isinstance(node, exp.Or):
        return _expression_to_narwhals(node.this) | _expression_to_narwhals(
            node.expression
        )

    # Comparison operators - direct mapping to Narwhals operations
    if isinstance(node, (exp.EQ, exp.NEQ, exp.GT, exp.LT, exp.GTE, exp.LTE)):
        left = getattr(node, "this", None)
        right = getattr(node, "expression", None)
        if left is None or right is None:
            raise FilterParseError(
                f"Comparison operator {type(node).__name__} requires two operands."
            )
        left_operand = _operand_info(left)
        right_operand = _operand_info(right)

        # Handle NULL comparisons with IS NULL / IS NOT NULL
        null_comparison = _maybe_null_comparison(left_operand, right_operand, node)
        if null_comparison is not None:
            return null_comparison

        # Apply the appropriate Narwhals operator
        if isinstance(node, exp.EQ):
            return left_operand.expr == right_operand.expr
        elif isinstance(node, exp.NEQ):
            return left_operand.expr != right_operand.expr
        elif isinstance(node, exp.GT):
            return left_operand.expr > right_operand.expr
        elif isinstance(node, exp.LT):
            return left_operand.expr < right_operand.expr
        elif isinstance(node, exp.GTE):
            return left_operand.expr >= right_operand.expr
        elif isinstance(node, exp.LTE):
            return left_operand.expr <= right_operand.expr

    # Terminal nodes (operands)
    if isinstance(
        node,
        (
            exp.Column,
            exp.Identifier,
            exp.Boolean,
            exp.Literal,
            exp.Null,
            exp.Neg,
        ),
    ):
        return _operand_info(node).expr

    raise FilterParseError(f"Unsupported expression: {node.sql()}")


def _operand_info(node: exp.Expression) -> OperandInfo:
    """Extract operand information from a SQLGlot expression node."""
    node = _strip_parens(node)

    if isinstance(node, (exp.Column, exp.Identifier)):
        return OperandInfo(
            expr=nw.col(_column_name(node)),
            is_literal=False,
            literal_value=None,
            is_column=True,
        )

    if isinstance(node, exp.Neg):
        inner = node.this
        if inner is None:
            raise FilterParseError("Unary minus requires an operand.")
        operand = _operand_info(inner)
        if not operand.is_literal or not isinstance(
            operand.literal_value, (int, float)
        ):
            raise FilterParseError("Unary minus only supported for numeric literals.")
        value = -operand.literal_value
        return OperandInfo(
            expr=nw.lit(value), is_literal=True, literal_value=value, is_column=False
        )

    if isinstance(node, (exp.Literal, exp.Boolean)):
        value = _literal_to_python(node)
        return OperandInfo(
            expr=nw.lit(value), is_literal=True, literal_value=value, is_column=False
        )

    if isinstance(node, exp.Null):
        return OperandInfo(
            expr=nw.lit(None), is_literal=True, literal_value=None, is_column=False
        )

    raise FilterParseError(f"Unsupported operand: {node.sql()}")


def _maybe_null_comparison(
    left: OperandInfo,
    right: OperandInfo,
    node: exp.Expression,
) -> nw.Expr | None:
    """Handle SQL NULL comparisons, converting to IS NULL / IS NOT NULL."""
    if left.is_literal and left.literal_value is None and right.is_column:
        column_expr = right.expr
        if isinstance(node, exp.EQ):
            return column_expr.is_null()
        if isinstance(node, exp.NEQ):
            return ~column_expr.is_null()
        return None

    if right.is_literal and right.literal_value is None and left.is_column:
        column_expr = left.expr
        if isinstance(node, exp.EQ):
            return column_expr.is_null()
        if isinstance(node, exp.NEQ):
            return ~column_expr.is_null()
        return None

    return None


def _literal_to_python(node: exp.Expression) -> LiteralValue:
    """Convert a SQLGlot literal node to a Python value."""
    match node:
        case exp.Null():
            return None
        case exp.Boolean():
            return node.this is True or str(node.this).lower() == "true"
        case exp.Literal():
            literal = node
            if literal.is_string:
                return literal.name
            if literal.is_int:
                return int(literal.this)
            if literal.is_number:
                return float(literal.this)
            return literal.this
        case _:
            raise FilterParseError(f"Unsupported literal: {node.sql()}")


def _strip_parens(node: exp.Expression) -> exp.Expression:
    """Remove surrounding parentheses from an expression."""
    current = node
    while isinstance(current, exp.Paren) and current.this is not None:
        current = current.this
    return current


def _identifier_part_to_string(part: exp.Expression | str) -> str:
    """Convert a column identifier part to a string."""
    if isinstance(part, exp.Identifier):
        return part.name
    if isinstance(part, exp.Star):
        return "*"
    if isinstance(part, exp.Expression):
        return part.sql(dialect="")
    return str(part)


def _column_name(node: exp.Expression) -> str:
    """Extract the column name from a Column or Identifier node."""
    if isinstance(node, exp.Column):
        parts = [_identifier_part_to_string(part) for part in node.parts or ()]
        name = ".".join(part for part in parts if part)
    elif isinstance(node, exp.Identifier):
        name = node.name
    else:
        name = node.sql(dialect="")

    name = name.strip()
    if not name:
        raise FilterParseError("Column reference is malformed.")
    return name


__all__ = [
    "FilterParseError",
    "NarwhalsFilter",
    "parse_filter_string",
]
````
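
A short usage sketch for the parser follows. It assumes `polars` is installed as the backing frame library, and the DataFrame contents and column names are illustrative; everything else is the module's own API.

```python
import narwhals as nw
import polars as pl

from metaxy.models.filter_expression import NarwhalsFilter, parse_filter_string

# Parse a WHERE-like string and apply it through Narwhals to a native frame.
df = pl.DataFrame({"status": ["active", "deleted"], "age": [30, 7]})
expr = parse_filter_string("NOT (status = 'deleted') AND age >= 10")
print(nw.from_native(df).filter(expr).to_native())  # keeps only the "active" row

# Validating a string stores both the parsed sqlglot AST and the original
# source; the field serializer emits the expression back as SQL text.
f = NarwhalsFilter.model_validate("deleted_at = NULL")
print(f.model_dump())  # {'expression': 'deleted_at = NULL', 'source': 'deleted_at = NULL'}
```

Note that `deleted_at = NULL` is deliberately rewritten to `is_null()` at conversion time, since a literal `= NULL` comparison would otherwise evaluate to null rather than a boolean.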